xref: /linux/kernel/nscommon.c (revision 415d34b92c1f921a9ff3c38f56319cbc5536f642)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
3 
4 #include <linux/ns_common.h>
5 #include <linux/nstree.h>
6 #include <linux/proc_ns.h>
7 #include <linux/user_namespace.h>
8 #include <linux/vfsdebug.h>
9 
10 #ifdef CONFIG_DEBUG_VFS
ns_debug(struct ns_common * ns,const struct proc_ns_operations * ops)11 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
12 {
13 	switch (ns->ns_type) {
14 #ifdef CONFIG_CGROUPS
15 	case CLONE_NEWCGROUP:
16 		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
17 		break;
18 #endif
19 #ifdef CONFIG_IPC_NS
20 	case CLONE_NEWIPC:
21 		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
22 		break;
23 #endif
24 	case CLONE_NEWNS:
25 		VFS_WARN_ON_ONCE(ops != &mntns_operations);
26 		break;
27 #ifdef CONFIG_NET_NS
28 	case CLONE_NEWNET:
29 		VFS_WARN_ON_ONCE(ops != &netns_operations);
30 		break;
31 #endif
32 #ifdef CONFIG_PID_NS
33 	case CLONE_NEWPID:
34 		VFS_WARN_ON_ONCE(ops != &pidns_operations);
35 		break;
36 #endif
37 #ifdef CONFIG_TIME_NS
38 	case CLONE_NEWTIME:
39 		VFS_WARN_ON_ONCE(ops != &timens_operations);
40 		break;
41 #endif
42 #ifdef CONFIG_USER_NS
43 	case CLONE_NEWUSER:
44 		VFS_WARN_ON_ONCE(ops != &userns_operations);
45 		break;
46 #endif
47 #ifdef CONFIG_UTS_NS
48 	case CLONE_NEWUTS:
49 		VFS_WARN_ON_ONCE(ops != &utsns_operations);
50 		break;
51 #endif
52 	}
53 }
54 #endif
55 
/**
 * __ns_common_init - set up the common part of a namespace
 * @ns: namespace common structure to initialise
 * @ns_type: namespace type, stored in @ns->ns_type
 * @ops: proc namespace operations, stored in @ns->ops
 * @inum: inode number to use; if zero, one is obtained from
 *        proc_alloc_inum()
 *
 * The reference count is set to one, the ns id is cleared and the tree
 * nodes/root embedded in @ns are initialised. With CONFIG_DEBUG_VFS the
 * @ops / @ns_type pairing is additionally checked by ns_debug().
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * proc_alloc_inum(). On failure the active reference count is left
 * untouched.
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	/* Plain fields first; ns_debug() below reads ns->ns_type. */
	ns->ns_type = ns_type;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->stashed = NULL;
	refcount_set(&ns->__ns_ref, 1);

	ns_tree_node_init(&ns->ns_tree_node);
	ns_tree_node_init(&ns->ns_unified_node);
	ns_tree_node_init(&ns->ns_owner_node);
	ns_tree_root_init(&ns->ns_owner_root);

#ifdef CONFIG_DEBUG_VFS
	ns_debug(ns, ops);
#endif

	/* A caller-supplied inode number takes precedence over allocation. */
	if (inum) {
		ns->inum = inum;
	} else {
		int err = proc_alloc_inum(&ns->inum);

		if (err)
			return err;
	}

	/*
	 * The active count starts at 0 and is raised once the namespace
	 * enters active use (installed in nsproxy); it drops again when
	 * all active uses are gone. Initial namespaces are always active,
	 * so they start out at 1.
	 */
	atomic_set(&ns->__ns_ref_active, is_ns_init_inum(ns) ? 1 : 0);
	return 0;
}
91 
__ns_common_free(struct ns_common * ns)92 void __ns_common_free(struct ns_common *ns)
93 {
94 	proc_free_inum(ns->inum);
95 }
96 
ns_owner(struct ns_common * ns)97 struct ns_common *__must_check ns_owner(struct ns_common *ns)
98 {
99 	struct user_namespace *owner;
100 
101 	if (unlikely(!ns->ops))
102 		return NULL;
103 	VFS_WARN_ON_ONCE(!ns->ops->owner);
104 	owner = ns->ops->owner(ns);
105 	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
106 	if (!owner)
107 		return NULL;
108 	/* Skip init_user_ns as it's always active */
109 	if (owner == &init_user_ns)
110 		return NULL;
111 	return to_ns_common(owner);
112 }
113 
114 /*
115  * The active reference count works by having each namespace that gets
116  * created take a single active reference on its owning user namespace.
117  * That single reference is only released once the child namespace's
118  * active count itself goes down.
119  *
120  * A regular namespace tree might look as follows:
121  * Legend:
122  * + : adding active reference
123  * - : dropping active reference
124  * x : always active (initial namespace)
125  *
126  *
127  *                 net_ns          pid_ns
128  *                       \        /
129  *                        +      +
130  *                        user_ns1 (2)
131  *                            |
132  *                 ipc_ns     |     uts_ns
133  *                       \    |    /
134  *                        +   +   +
135  *                        user_ns2 (3)
136  *                            |
137  *            cgroup_ns       |       mnt_ns
138  *                     \      |      /
139  *                      x     x     x
140  *                      init_user_ns (1)
141  *
142  * If both net_ns and pid_ns put their last active reference on
143  * themselves it will cascade to user_ns1 dropping its own active
144  * reference and dropping one active reference on user_ns2:
145  *
146  *                 net_ns          pid_ns
147  *                       \        /
148  *                        -      -
149  *                        user_ns1 (0)
150  *                            |
151  *                 ipc_ns     |     uts_ns
152  *                       \    |    /
153  *                        +   -   +
154  *                        user_ns2 (2)
155  *                            |
156  *            cgroup_ns       |       mnt_ns
157  *                     \      |      /
158  *                      x     x     x
159  *                      init_user_ns (1)
160  *
161  * The iteration stops once we reach a namespace that still has active
162  * references.
163  */
__ns_ref_active_put(struct ns_common * ns)164 void __ns_ref_active_put(struct ns_common *ns)
165 {
166 	/* Initial namespaces are always active. */
167 	if (is_ns_init_id(ns))
168 		return;
169 
170 	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
171 		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
172 		return;
173 	}
174 
175 	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
176 	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
177 
178 	for (;;) {
179 		ns = ns_owner(ns);
180 		if (!ns)
181 			return;
182 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
183 		if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
184 			VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
185 			return;
186 		}
187 	}
188 }
189 
190 /*
191  * The active reference count works by having each namespace that gets
192  * created take a single active reference on its owning user namespace.
193  * That single reference is only released once the child namespace's
194  * active count itself goes down. This makes it possible to efficiently
195  * resurrect a namespace tree:
196  *
197  * A regular namespace tree might look as follows:
198  * Legend:
199  * + : adding active reference
200  * - : dropping active reference
201  * x : always active (initial namespace)
202  *
203  *
204  *                 net_ns          pid_ns
205  *                       \        /
206  *                        +      +
207  *                        user_ns1 (2)
208  *                            |
209  *                 ipc_ns     |     uts_ns
210  *                       \    |    /
211  *                        +   +   +
212  *                        user_ns2 (3)
213  *                            |
214  *            cgroup_ns       |       mnt_ns
215  *                     \      |      /
216  *                      x     x     x
217  *                      init_user_ns (1)
218  *
219  * If both net_ns and pid_ns put their last active reference on
220  * themselves it will cascade to user_ns1 dropping its own active
221  * reference and dropping one active reference on user_ns2:
222  *
223  *                 net_ns          pid_ns
224  *                       \        /
225  *                        -      -
226  *                        user_ns1 (0)
227  *                            |
228  *                 ipc_ns     |     uts_ns
229  *                       \    |    /
230  *                        +   -   +
231  *                        user_ns2 (2)
232  *                            |
233  *            cgroup_ns       |       mnt_ns
234  *                     \      |      /
235  *                      x     x     x
236  *                      init_user_ns (1)
237  *
238  * Assume the whole tree is dead but all namespaces are still active:
239  *
240  *                 net_ns          pid_ns
241  *                       \        /
242  *                        -      -
243  *                        user_ns1 (0)
244  *                            |
245  *                 ipc_ns     |     uts_ns
246  *                       \    |    /
247  *                        -   -   -
248  *                        user_ns2 (0)
249  *                            |
250  *            cgroup_ns       |       mnt_ns
251  *                     \      |      /
252  *                      x     x     x
253  *                      init_user_ns (1)
254  *
255  * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
256  *
257  *                 net_ns          pid_ns
258  *                       \        /
259  *                        +      -
260  *                        user_ns1 (0)
261  *                            |
262  *                 ipc_ns     |     uts_ns
263  *                       \    |    /
264  *                        -   +   -
265  *                        user_ns2 (0)
266  *                            |
267  *            cgroup_ns       |       mnt_ns
268  *                     \      |      /
269  *                      x     x     x
270  *                      init_user_ns (1)
271  *
272  * If net_ns had a zero reference count and we bumped it we also need to
273  * take another reference on its owning user namespace. Similarly, if
274  * pid_ns had a zero reference count it also needs to take another
275  * reference on its owning user namespace. So both net_ns and pid_ns
276  * will each have their own reference on the owning user namespace.
277  *
278  * If the owning user namespace user_ns1 had a zero reference count then
279  * it also needs to take another reference on its owning user namespace
280  * and so on.
281  */
__ns_ref_active_get(struct ns_common * ns)282 void __ns_ref_active_get(struct ns_common *ns)
283 {
284 	int prev;
285 
286 	/* Initial namespaces are always active. */
287 	if (is_ns_init_id(ns))
288 		return;
289 
290 	/* If we didn't resurrect the namespace we're done. */
291 	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
292 	VFS_WARN_ON_ONCE(prev < 0);
293 	if (likely(prev))
294 		return;
295 
296 	/*
297 	 * We did resurrect it. Walk the ownership hierarchy upwards
298 	 * until we found an owning user namespace that is active.
299 	 */
300 	for (;;) {
301 		ns = ns_owner(ns);
302 		if (!ns)
303 			return;
304 
305 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
306 		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
307 		VFS_WARN_ON_ONCE(prev < 0);
308 		if (likely(prev))
309 			return;
310 	}
311 }
312