1 // SPDX-License-Identifier: BSD-2-Clause
2 /*
3 * Copyright (c) 2021 Klara Systems, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <linux/file.h>
32 #include <linux/magic.h>
33 #include <sys/zone.h>
34 #include <sys/string.h>
35
36 #if defined(CONFIG_USER_NS)
37 #include <linux/statfs.h>
38 #include <linux/proc_ns.h>
39 #endif
40
41 #include <sys/mutex.h>
42
/*
 * Held by zone_dataset_attach(), zone_dataset_detach() and
 * zone_dataset_visible() while they walk or modify zone_datasets.
 */
static kmutex_t zone_datasets_lock;
/* List of zone_datasets_t records, linked through zds_list. */
static struct list_head zone_datasets;
45
/*
 * Per-user-namespace record.  One is created by zone_dataset_attach() the
 * first time a dataset is attached to a namespace and it is freed by
 * zone_dataset_detach() once its dataset list becomes empty.
 */
typedef struct zone_datasets {
	struct list_head zds_list;	/* zone_datasets linkage */
	struct user_namespace *zds_userns; /* namespace reference */
	struct list_head zds_datasets;	/* datasets for the namespace */
} zone_datasets_t;
51
/*
 * One dataset attached to a namespace.  Allocated with zd_dsnamelen + 1
 * trailing bytes so that zd_dsname holds the NUL-terminated name; the
 * same size must be passed back to kmem_free().
 */
typedef struct zone_dataset {
	struct list_head zd_list;	/* zone_dataset linkage */
	size_t zd_dsnamelen;		/* length of name */
	char zd_dsname[];		/* name of the member dataset */
} zone_dataset_t;
57
58 #ifdef CONFIG_USER_NS
59 /*
60 * Returns:
61 * - 0 on success
62 * - EBADF if it cannot open the provided file descriptor
63 * - ENOTTY if the file itself is a not a user namespace file. We want to
64 * intercept this error in the ZFS layer. We cannot just return one of the
65 * ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
66 * and the SPL layers.
67 */
68 static int
user_ns_get(int fd,struct user_namespace ** userns)69 user_ns_get(int fd, struct user_namespace **userns)
70 {
71 struct kstatfs st;
72 struct file *nsfile;
73 struct ns_common *ns;
74 int error;
75
76 if ((nsfile = fget(fd)) == NULL)
77 return (EBADF);
78 if (vfs_statfs(&nsfile->f_path, &st) != 0) {
79 error = ENOTTY;
80 goto done;
81 }
82 if (st.f_type != NSFS_MAGIC) {
83 error = ENOTTY;
84 goto done;
85 }
86 ns = get_proc_ns(file_inode(nsfile));
87 if (ns->ops->type != CLONE_NEWUSER) {
88 error = ENOTTY;
89 goto done;
90 }
91 *userns = container_of(ns, struct user_namespace, ns);
92
93 error = 0;
94 done:
95 fput(nsfile);
96
97 return (error);
98 }
99 #endif /* CONFIG_USER_NS */
100
101 static unsigned int
user_ns_zoneid(struct user_namespace * user_ns)102 user_ns_zoneid(struct user_namespace *user_ns)
103 {
104 unsigned int r;
105
106 r = user_ns->ns.inum;
107
108 return (r);
109 }
110
111 static struct zone_datasets *
zone_datasets_lookup(unsigned int nsinum)112 zone_datasets_lookup(unsigned int nsinum)
113 {
114 zone_datasets_t *zds;
115
116 list_for_each_entry(zds, &zone_datasets, zds_list) {
117 if (user_ns_zoneid(zds->zds_userns) == nsinum)
118 return (zds);
119 }
120 return (NULL);
121 }
122
123 #ifdef CONFIG_USER_NS
124 static struct zone_dataset *
zone_dataset_lookup(zone_datasets_t * zds,const char * dataset,size_t dsnamelen)125 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
126 {
127 zone_dataset_t *zd;
128
129 list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
130 if (zd->zd_dsnamelen != dsnamelen)
131 continue;
132 if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
133 return (zd);
134 }
135
136 return (NULL);
137 }
138
139 static int
zone_dataset_cred_check(cred_t * cred)140 zone_dataset_cred_check(cred_t *cred)
141 {
142
143 if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
144 return (EPERM);
145
146 return (0);
147 }
148 #endif /* CONFIG_USER_NS */
149
/*
 * Validate a dataset name and compute the length to use for it.
 *
 * Returns ENOENT, leaving *dsnamelen untouched, when the name is empty or
 * begins with '/'.  Otherwise returns 0 and stores the name's length in
 * *dsnamelen, not counting a single trailing '/' if one was supplied.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	switch (dataset[0]) {
	case '\0':
	case '/':
		return (ENOENT);
	default:
		break;
	}

	/* The name is non-empty here, so len - 1 cannot underflow. */
	len = strlen(dataset);
	if (dataset[len - 1] == '/')
		len--;
	*dsnamelen = len;

	return (0);
}
164
165 int
zone_dataset_attach(cred_t * cred,const char * dataset,int userns_fd)166 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
167 {
168 #ifdef CONFIG_USER_NS
169 struct user_namespace *userns;
170 zone_datasets_t *zds;
171 zone_dataset_t *zd;
172 int error;
173 size_t dsnamelen;
174
175 if ((error = zone_dataset_cred_check(cred)) != 0)
176 return (error);
177 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
178 return (error);
179 if ((error = user_ns_get(userns_fd, &userns)) != 0)
180 return (error);
181
182 mutex_enter(&zone_datasets_lock);
183 zds = zone_datasets_lookup(user_ns_zoneid(userns));
184 if (zds == NULL) {
185 zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
186 INIT_LIST_HEAD(&zds->zds_list);
187 INIT_LIST_HEAD(&zds->zds_datasets);
188 zds->zds_userns = userns;
189 /*
190 * Lock the namespace by incresing its refcount to prevent
191 * the namespace ID from being reused.
192 */
193 get_user_ns(userns);
194 list_add_tail(&zds->zds_list, &zone_datasets);
195 } else {
196 zd = zone_dataset_lookup(zds, dataset, dsnamelen);
197 if (zd != NULL) {
198 mutex_exit(&zone_datasets_lock);
199 return (EEXIST);
200 }
201 }
202
203 zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
204 zd->zd_dsnamelen = dsnamelen;
205 strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
206 INIT_LIST_HEAD(&zd->zd_list);
207 list_add_tail(&zd->zd_list, &zds->zds_datasets);
208
209 mutex_exit(&zone_datasets_lock);
210 return (0);
211 #else
212 return (ENXIO);
213 #endif /* CONFIG_USER_NS */
214 }
215 EXPORT_SYMBOL(zone_dataset_attach);
216
217 int
zone_dataset_detach(cred_t * cred,const char * dataset,int userns_fd)218 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
219 {
220 #ifdef CONFIG_USER_NS
221 struct user_namespace *userns;
222 zone_datasets_t *zds;
223 zone_dataset_t *zd;
224 int error;
225 size_t dsnamelen;
226
227 if ((error = zone_dataset_cred_check(cred)) != 0)
228 return (error);
229 if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
230 return (error);
231 if ((error = user_ns_get(userns_fd, &userns)) != 0)
232 return (error);
233
234 mutex_enter(&zone_datasets_lock);
235 zds = zone_datasets_lookup(user_ns_zoneid(userns));
236 if (zds != NULL)
237 zd = zone_dataset_lookup(zds, dataset, dsnamelen);
238 if (zds == NULL || zd == NULL) {
239 mutex_exit(&zone_datasets_lock);
240 return (ENOENT);
241 }
242
243 list_del(&zd->zd_list);
244 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
245
246 /* Prune the namespace entry if it has no more delegations. */
247 if (list_empty(&zds->zds_datasets)) {
248 /*
249 * Decrease the refcount now that the namespace is no longer
250 * used. It is no longer necessary to prevent the namespace ID
251 * from being reused.
252 */
253 put_user_ns(userns);
254 list_del(&zds->zds_list);
255 kmem_free(zds, sizeof (*zds));
256 }
257
258 mutex_exit(&zone_datasets_lock);
259 return (0);
260 #else
261 return (ENXIO);
262 #endif /* CONFIG_USER_NS */
263 }
264 EXPORT_SYMBOL(zone_dataset_detach);
265
266 /*
267 * A dataset is visible if:
268 * - It is a parent of a namespace entry.
269 * - It is one of the namespace entries.
270 * - It is a child of a namespace entry.
271 *
272 * A dataset is writable if:
273 * - It is one of the namespace entries.
274 * - It is a child of a namespace entry.
275 *
276 * The parent datasets of namespace entries are visible and
277 * read-only to provide a path back to the root of the pool.
278 */
279 int
zone_dataset_visible(const char * dataset,int * write)280 zone_dataset_visible(const char *dataset, int *write)
281 {
282 zone_datasets_t *zds;
283 zone_dataset_t *zd;
284 size_t dsnamelen, zd_len;
285 int visible;
286
287 /* Default to read-only, in case visible is returned. */
288 if (write != NULL)
289 *write = 0;
290 if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
291 return (0);
292 if (INGLOBALZONE(curproc)) {
293 if (write != NULL)
294 *write = 1;
295 return (1);
296 }
297
298 mutex_enter(&zone_datasets_lock);
299 zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
300 if (zds == NULL) {
301 mutex_exit(&zone_datasets_lock);
302 return (0);
303 }
304
305 visible = 0;
306 list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
307 zd_len = strlen(zd->zd_dsname);
308 if (zd_len > dsnamelen) {
309 /*
310 * The name of the namespace entry is longer than that
311 * of the dataset, so it could be that the dataset is a
312 * parent of the namespace entry.
313 */
314 visible = memcmp(zd->zd_dsname, dataset,
315 dsnamelen) == 0 &&
316 zd->zd_dsname[dsnamelen] == '/';
317 if (visible)
318 break;
319 } else if (zd_len == dsnamelen) {
320 /*
321 * The name of the namespace entry is as long as that
322 * of the dataset, so perhaps the dataset itself is the
323 * namespace entry.
324 */
325 visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
326 if (visible) {
327 if (write != NULL)
328 *write = 1;
329 break;
330 }
331 } else {
332 /*
333 * The name of the namespace entry is shorter than that
334 * of the dataset, so perhaps the dataset is a child of
335 * the namespace entry.
336 */
337 visible = memcmp(zd->zd_dsname, dataset,
338 zd_len) == 0 && dataset[zd_len] == '/';
339 if (visible) {
340 if (write != NULL)
341 *write = 1;
342 break;
343 }
344 }
345 }
346
347 mutex_exit(&zone_datasets_lock);
348 return (visible);
349 }
350 EXPORT_SYMBOL(zone_dataset_visible);
351
/*
 * Zone ID of the global zone: the ID of init_user_ns, or 0 when the kernel
 * was built without CONFIG_USER_NS.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
363 EXPORT_SYMBOL(global_zoneid);
364
365 unsigned int
crgetzoneid(const cred_t * cr)366 crgetzoneid(const cred_t *cr)
367 {
368 unsigned int r = 0;
369
370 #if defined(CONFIG_USER_NS)
371 r = user_ns_zoneid(cr->user_ns);
372 #endif
373
374 return (r);
375 }
376 EXPORT_SYMBOL(crgetzoneid);
377
378 boolean_t
inglobalzone(proc_t * proc)379 inglobalzone(proc_t *proc)
380 {
381 #if defined(CONFIG_USER_NS)
382 return (proc->cred->user_ns == &init_user_ns);
383 #else
384 return (B_TRUE);
385 #endif
386 }
387 EXPORT_SYMBOL(inglobalzone);
388
389 int
spl_zone_init(void)390 spl_zone_init(void)
391 {
392 mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
393 INIT_LIST_HEAD(&zone_datasets);
394 return (0);
395 }
396
397 void
spl_zone_fini(void)398 spl_zone_fini(void)
399 {
400 zone_datasets_t *zds;
401 zone_dataset_t *zd;
402
403 /*
404 * It would be better to assert an empty zone_datasets, but since
405 * there's no automatic mechanism for cleaning them up if the user
406 * namespace is destroyed, just do it here, since spl is about to go
407 * out of context.
408 */
409 while (!list_empty(&zone_datasets)) {
410 zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
411 while (!list_empty(&zds->zds_datasets)) {
412 zd = list_entry(zds->zds_datasets.next,
413 zone_dataset_t, zd_list);
414 list_del(&zd->zd_list);
415 kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
416 }
417 put_user_ns(zds->zds_userns);
418 list_del(&zds->zds_list);
419 kmem_free(zds, sizeof (*zds));
420 }
421 mutex_destroy(&zone_datasets_lock);
422 }
423