xref: /freebsd/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: BSD-2-Clause
2 /*
3  * Copyright (c) 2021 Klara Systems, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <linux/file.h>
32 #include <linux/magic.h>
33 #include <sys/zone.h>
34 #include <sys/string.h>
35 
36 #if defined(CONFIG_USER_NS)
37 #include <linux/statfs.h>
38 #include <linux/proc_ns.h>
39 #endif
40 
41 #include <sys/mutex.h>
42 
43 static kmutex_t zone_datasets_lock;
44 static struct list_head zone_datasets;
45 
46 typedef struct zone_datasets {
47 	struct list_head zds_list;	/* zone_datasets linkage */
48 	struct user_namespace *zds_userns; /* namespace reference */
49 	struct list_head zds_datasets;	/* datasets for the namespace */
50 } zone_datasets_t;
51 
52 typedef struct zone_dataset {
53 	struct list_head zd_list;	/* zone_dataset linkage */
54 	size_t zd_dsnamelen;		/* length of name */
55 	char zd_dsname[];		/* name of the member dataset */
56 } zone_dataset_t;
57 
58 #ifdef CONFIG_USER_NS
59 /*
60  * Returns:
61  * - 0 on success
62  * - EBADF if it cannot open the provided file descriptor
63  * - ENOTTY if the file itself is a not a user namespace file. We want to
64  *   intercept this error in the ZFS layer. We cannot just return one of the
65  *   ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
66  *   and the SPL layers.
67  */
68 static int
user_ns_get(int fd,struct user_namespace ** userns)69 user_ns_get(int fd, struct user_namespace **userns)
70 {
71 	struct kstatfs st;
72 	struct file *nsfile;
73 	struct ns_common *ns;
74 	int error;
75 
76 	if ((nsfile = fget(fd)) == NULL)
77 		return (EBADF);
78 	if (vfs_statfs(&nsfile->f_path, &st) != 0) {
79 		error = ENOTTY;
80 		goto done;
81 	}
82 	if (st.f_type != NSFS_MAGIC) {
83 		error = ENOTTY;
84 		goto done;
85 	}
86 	ns = get_proc_ns(file_inode(nsfile));
87 	if (ns->ops->type != CLONE_NEWUSER) {
88 		error = ENOTTY;
89 		goto done;
90 	}
91 	*userns = container_of(ns, struct user_namespace, ns);
92 
93 	error = 0;
94 done:
95 	fput(nsfile);
96 
97 	return (error);
98 }
99 #endif /* CONFIG_USER_NS */
100 
101 static unsigned int
user_ns_zoneid(struct user_namespace * user_ns)102 user_ns_zoneid(struct user_namespace *user_ns)
103 {
104 	unsigned int r;
105 
106 	r = user_ns->ns.inum;
107 
108 	return (r);
109 }
110 
111 static struct zone_datasets *
zone_datasets_lookup(unsigned int nsinum)112 zone_datasets_lookup(unsigned int nsinum)
113 {
114 	zone_datasets_t *zds;
115 
116 	list_for_each_entry(zds, &zone_datasets, zds_list) {
117 		if (user_ns_zoneid(zds->zds_userns) == nsinum)
118 			return (zds);
119 	}
120 	return (NULL);
121 }
122 
123 #ifdef CONFIG_USER_NS
124 static struct zone_dataset *
zone_dataset_lookup(zone_datasets_t * zds,const char * dataset,size_t dsnamelen)125 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
126 {
127 	zone_dataset_t *zd;
128 
129 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
130 		if (zd->zd_dsnamelen != dsnamelen)
131 			continue;
132 		if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
133 			return (zd);
134 	}
135 
136 	return (NULL);
137 }
138 
139 static int
zone_dataset_cred_check(cred_t * cred)140 zone_dataset_cred_check(cred_t *cred)
141 {
142 
143 	if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
144 		return (EPERM);
145 
146 	return (0);
147 }
148 #endif /* CONFIG_USER_NS */
149 
/*
 * Validate a dataset name and report its usable length.
 *
 * Returns ENOENT, leaving *dsnamelen untouched, when the name is empty or
 * begins with '/'.  Otherwise returns 0 and stores in *dsnamelen the length
 * of the name with one trailing '/' (if present) left out.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	switch (dataset[0]) {
	case '\0':
	case '/':
		return (ENOENT);
	default:
		break;
	}

	/* The name is non-empty here, so len - 1 is a valid index. */
	len = strlen(dataset);
	if (dataset[len - 1] == '/')
		len--;
	*dsnamelen = len;

	return (0);
}
164 
165 int
zone_dataset_attach(cred_t * cred,const char * dataset,int userns_fd)166 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
167 {
168 #ifdef CONFIG_USER_NS
169 	struct user_namespace *userns;
170 	zone_datasets_t *zds;
171 	zone_dataset_t *zd;
172 	int error;
173 	size_t dsnamelen;
174 
175 	if ((error = zone_dataset_cred_check(cred)) != 0)
176 		return (error);
177 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
178 		return (error);
179 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
180 		return (error);
181 
182 	mutex_enter(&zone_datasets_lock);
183 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
184 	if (zds == NULL) {
185 		zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
186 		INIT_LIST_HEAD(&zds->zds_list);
187 		INIT_LIST_HEAD(&zds->zds_datasets);
188 		zds->zds_userns = userns;
189 		/*
190 		 * Lock the namespace by incresing its refcount to prevent
191 		 * the namespace ID from being reused.
192 		 */
193 		get_user_ns(userns);
194 		list_add_tail(&zds->zds_list, &zone_datasets);
195 	} else {
196 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
197 		if (zd != NULL) {
198 			mutex_exit(&zone_datasets_lock);
199 			return (EEXIST);
200 		}
201 	}
202 
203 	zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
204 	zd->zd_dsnamelen = dsnamelen;
205 	strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
206 	INIT_LIST_HEAD(&zd->zd_list);
207 	list_add_tail(&zd->zd_list, &zds->zds_datasets);
208 
209 	mutex_exit(&zone_datasets_lock);
210 	return (0);
211 #else
212 	return (ENXIO);
213 #endif /* CONFIG_USER_NS */
214 }
215 EXPORT_SYMBOL(zone_dataset_attach);
216 
217 int
zone_dataset_detach(cred_t * cred,const char * dataset,int userns_fd)218 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
219 {
220 #ifdef CONFIG_USER_NS
221 	struct user_namespace *userns;
222 	zone_datasets_t *zds;
223 	zone_dataset_t *zd;
224 	int error;
225 	size_t dsnamelen;
226 
227 	if ((error = zone_dataset_cred_check(cred)) != 0)
228 		return (error);
229 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
230 		return (error);
231 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
232 		return (error);
233 
234 	mutex_enter(&zone_datasets_lock);
235 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
236 	if (zds != NULL)
237 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
238 	if (zds == NULL || zd == NULL) {
239 		mutex_exit(&zone_datasets_lock);
240 		return (ENOENT);
241 	}
242 
243 	list_del(&zd->zd_list);
244 	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
245 
246 	/* Prune the namespace entry if it has no more delegations. */
247 	if (list_empty(&zds->zds_datasets)) {
248 		/*
249 		 * Decrease the refcount now that the namespace is no longer
250 		 * used. It is no longer necessary to prevent the namespace ID
251 		 * from being reused.
252 		 */
253 		put_user_ns(userns);
254 		list_del(&zds->zds_list);
255 		kmem_free(zds, sizeof (*zds));
256 	}
257 
258 	mutex_exit(&zone_datasets_lock);
259 	return (0);
260 #else
261 	return (ENXIO);
262 #endif /* CONFIG_USER_NS */
263 }
264 EXPORT_SYMBOL(zone_dataset_detach);
265 
266 /*
267  * A dataset is visible if:
268  * - It is a parent of a namespace entry.
269  * - It is one of the namespace entries.
270  * - It is a child of a namespace entry.
271  *
272  * A dataset is writable if:
273  * - It is one of the namespace entries.
274  * - It is a child of a namespace entry.
275  *
276  * The parent datasets of namespace entries are visible and
277  * read-only to provide a path back to the root of the pool.
278  */
279 int
zone_dataset_visible(const char * dataset,int * write)280 zone_dataset_visible(const char *dataset, int *write)
281 {
282 	zone_datasets_t *zds;
283 	zone_dataset_t *zd;
284 	size_t dsnamelen, zd_len;
285 	int visible;
286 
287 	/* Default to read-only, in case visible is returned. */
288 	if (write != NULL)
289 		*write = 0;
290 	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
291 		return (0);
292 	if (INGLOBALZONE(curproc)) {
293 		if (write != NULL)
294 			*write = 1;
295 		return (1);
296 	}
297 
298 	mutex_enter(&zone_datasets_lock);
299 	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
300 	if (zds == NULL) {
301 		mutex_exit(&zone_datasets_lock);
302 		return (0);
303 	}
304 
305 	visible = 0;
306 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
307 		zd_len = strlen(zd->zd_dsname);
308 		if (zd_len > dsnamelen) {
309 			/*
310 			 * The name of the namespace entry is longer than that
311 			 * of the dataset, so it could be that the dataset is a
312 			 * parent of the namespace entry.
313 			 */
314 			visible = memcmp(zd->zd_dsname, dataset,
315 			    dsnamelen) == 0 &&
316 			    zd->zd_dsname[dsnamelen] == '/';
317 			if (visible)
318 				break;
319 		} else if (zd_len == dsnamelen) {
320 			/*
321 			 * The name of the namespace entry is as long as that
322 			 * of the dataset, so perhaps the dataset itself is the
323 			 * namespace entry.
324 			 */
325 			visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
326 			if (visible) {
327 				if (write != NULL)
328 					*write = 1;
329 				break;
330 			}
331 		} else {
332 			/*
333 			 * The name of the namespace entry is shorter than that
334 			 * of the dataset, so perhaps the dataset is a child of
335 			 * the namespace entry.
336 			 */
337 			visible = memcmp(zd->zd_dsname, dataset,
338 			    zd_len) == 0 && dataset[zd_len] == '/';
339 			if (visible) {
340 				if (write != NULL)
341 					*write = 1;
342 				break;
343 			}
344 		}
345 	}
346 
347 	mutex_exit(&zone_datasets_lock);
348 	return (visible);
349 }
350 EXPORT_SYMBOL(zone_dataset_visible);
351 
/*
 * Zone id of the global zone: the id of init_user_ns when user namespaces
 * are configured, 0 otherwise.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
EXPORT_SYMBOL(global_zoneid);
364 
365 unsigned int
crgetzoneid(const cred_t * cr)366 crgetzoneid(const cred_t *cr)
367 {
368 	unsigned int r = 0;
369 
370 #if defined(CONFIG_USER_NS)
371 	r = user_ns_zoneid(cr->user_ns);
372 #endif
373 
374 	return (r);
375 }
376 EXPORT_SYMBOL(crgetzoneid);
377 
378 boolean_t
inglobalzone(proc_t * proc)379 inglobalzone(proc_t *proc)
380 {
381 #if defined(CONFIG_USER_NS)
382 	return (proc->cred->user_ns == &init_user_ns);
383 #else
384 	return (B_TRUE);
385 #endif
386 }
387 EXPORT_SYMBOL(inglobalzone);
388 
389 int
spl_zone_init(void)390 spl_zone_init(void)
391 {
392 	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
393 	INIT_LIST_HEAD(&zone_datasets);
394 	return (0);
395 }
396 
397 void
spl_zone_fini(void)398 spl_zone_fini(void)
399 {
400 	zone_datasets_t *zds;
401 	zone_dataset_t *zd;
402 
403 	/*
404 	 * It would be better to assert an empty zone_datasets, but since
405 	 * there's no automatic mechanism for cleaning them up if the user
406 	 * namespace is destroyed, just do it here, since spl is about to go
407 	 * out of context.
408 	 */
409 	while (!list_empty(&zone_datasets)) {
410 		zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
411 		while (!list_empty(&zds->zds_datasets)) {
412 			zd = list_entry(zds->zds_datasets.next,
413 			    zone_dataset_t, zd_list);
414 			list_del(&zd->zd_list);
415 			kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
416 		}
417 		put_user_ns(zds->zds_userns);
418 		list_del(&zds->zds_list);
419 		kmem_free(zds, sizeof (*zds));
420 	}
421 	mutex_destroy(&zone_datasets_lock);
422 }
423