xref: /linux/kernel/nscommon.c (revision 88efd7c6997ee9da3e0274106f72b99fa105f45f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
3 
4 #include <linux/ns_common.h>
5 #include <linux/proc_ns.h>
6 #include <linux/user_namespace.h>
7 #include <linux/vfsdebug.h>
8 
9 #ifdef CONFIG_DEBUG_VFS
10 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
11 {
12 	switch (ns->ns_type) {
13 #ifdef CONFIG_CGROUPS
14 	case CLONE_NEWCGROUP:
15 		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
16 		break;
17 #endif
18 #ifdef CONFIG_IPC_NS
19 	case CLONE_NEWIPC:
20 		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
21 		break;
22 #endif
23 	case CLONE_NEWNS:
24 		VFS_WARN_ON_ONCE(ops != &mntns_operations);
25 		break;
26 #ifdef CONFIG_NET_NS
27 	case CLONE_NEWNET:
28 		VFS_WARN_ON_ONCE(ops != &netns_operations);
29 		break;
30 #endif
31 #ifdef CONFIG_PID_NS
32 	case CLONE_NEWPID:
33 		VFS_WARN_ON_ONCE(ops != &pidns_operations);
34 		break;
35 #endif
36 #ifdef CONFIG_TIME_NS
37 	case CLONE_NEWTIME:
38 		VFS_WARN_ON_ONCE(ops != &timens_operations);
39 		break;
40 #endif
41 #ifdef CONFIG_USER_NS
42 	case CLONE_NEWUSER:
43 		VFS_WARN_ON_ONCE(ops != &userns_operations);
44 		break;
45 #endif
46 #ifdef CONFIG_UTS_NS
47 	case CLONE_NEWUTS:
48 		VFS_WARN_ON_ONCE(ops != &utsns_operations);
49 		break;
50 #endif
51 	}
52 }
53 #endif
54 
55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
56 {
57 	int ret = 0;
58 
59 	refcount_set(&ns->__ns_ref, 1);
60 	ns->stashed = NULL;
61 	ns->ops = ops;
62 	ns->ns_id = 0;
63 	ns->ns_type = ns_type;
64 	RB_CLEAR_NODE(&ns->ns_tree_node);
65 	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
66 	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
67 	INIT_LIST_HEAD(&ns->ns_list_node);
68 	INIT_LIST_HEAD(&ns->ns_unified_list_node);
69 	ns->ns_owner_tree = RB_ROOT;
70 	INIT_LIST_HEAD(&ns->ns_owner);
71 	INIT_LIST_HEAD(&ns->ns_owner_entry);
72 
73 #ifdef CONFIG_DEBUG_VFS
74 	ns_debug(ns, ops);
75 #endif
76 
77 	if (inum)
78 		ns->inum = inum;
79 	else
80 		ret = proc_alloc_inum(&ns->inum);
81 	if (ret)
82 		return ret;
83 	/*
84 	 * Tree ref starts at 0. It's incremented when namespace enters
85 	 * active use (installed in nsproxy) and decremented when all
86 	 * active uses are gone. Initial namespaces are always active.
87 	 */
88 	if (is_initial_namespace(ns))
89 		atomic_set(&ns->__ns_ref_active, 1);
90 	else
91 		atomic_set(&ns->__ns_ref_active, 0);
92 	return 0;
93 }
94 
95 void __ns_common_free(struct ns_common *ns)
96 {
97 	proc_free_inum(ns->inum);
98 }
99 
100 struct ns_common *__must_check ns_owner(struct ns_common *ns)
101 {
102 	struct user_namespace *owner;
103 
104 	if (unlikely(!ns->ops))
105 		return NULL;
106 	VFS_WARN_ON_ONCE(!ns->ops->owner);
107 	owner = ns->ops->owner(ns);
108 	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
109 	if (!owner)
110 		return NULL;
111 	/* Skip init_user_ns as it's always active */
112 	if (owner == &init_user_ns)
113 		return NULL;
114 	return to_ns_common(owner);
115 }
116 
117 /*
118  * The active reference count works by having each namespace that gets
119  * created take a single active reference on its owning user namespace.
120  * That single reference is only released once the child namespace's
121  * active count itself goes down.
122  *
123  * A regular namespace tree might look as follows:
124  * Legend:
125  * + : adding active reference
126  * - : dropping active reference
127  * x : always active (initial namespace)
128  *
129  *
130  *                 net_ns          pid_ns
131  *                       \        /
132  *                        +      +
133  *                        user_ns1 (2)
134  *                            |
135  *                 ipc_ns     |     uts_ns
136  *                       \    |    /
137  *                        +   +   +
138  *                        user_ns2 (3)
139  *                            |
140  *            cgroup_ns       |       mnt_ns
141  *                     \      |      /
142  *                      x     x     x
143  *                      init_user_ns (1)
144  *
145  * If both net_ns and pid_ns put their last active reference on
146  * themselves it will cascade to user_ns1 dropping its own active
147  * reference and dropping one active reference on user_ns2:
148  *
149  *                 net_ns          pid_ns
150  *                       \        /
151  *                        -      -
152  *                        user_ns1 (0)
153  *                            |
154  *                 ipc_ns     |     uts_ns
155  *                       \    |    /
156  *                        +   -   +
157  *                        user_ns2 (2)
158  *                            |
159  *            cgroup_ns       |       mnt_ns
160  *                     \      |      /
161  *                      x     x     x
162  *                      init_user_ns (1)
163  *
164  * The iteration stops once we reach a namespace that still has active
165  * references.
166  */
167 void __ns_ref_active_put(struct ns_common *ns)
168 {
169 	/* Initial namespaces are always active. */
170 	if (is_ns_init_id(ns))
171 		return;
172 
173 	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
174 		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
175 		return;
176 	}
177 
178 	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
179 	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
180 
181 	for (;;) {
182 		ns = ns_owner(ns);
183 		if (!ns)
184 			return;
185 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
186 		if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
187 			VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
188 			return;
189 		}
190 	}
191 }
192 
193 /*
194  * The active reference count works by having each namespace that gets
195  * created take a single active reference on its owning user namespace.
196  * That single reference is only released once the child namespace's
197  * active count itself goes down. This makes it possible to efficiently
198  * resurrect a namespace tree:
199  *
200  * A regular namespace tree might look as follows:
201  * Legend:
202  * + : adding active reference
203  * - : dropping active reference
204  * x : always active (initial namespace)
205  *
206  *
207  *                 net_ns          pid_ns
208  *                       \        /
209  *                        +      +
210  *                        user_ns1 (2)
211  *                            |
212  *                 ipc_ns     |     uts_ns
213  *                       \    |    /
214  *                        +   +   +
215  *                        user_ns2 (3)
216  *                            |
217  *            cgroup_ns       |       mnt_ns
218  *                     \      |      /
219  *                      x     x     x
220  *                      init_user_ns (1)
221  *
222  * If both net_ns and pid_ns put their last active reference on
223  * themselves it will cascade to user_ns1 dropping its own active
224  * reference and dropping one active reference on user_ns2:
225  *
226  *                 net_ns          pid_ns
227  *                       \        /
228  *                        -      -
229  *                        user_ns1 (0)
230  *                            |
231  *                 ipc_ns     |     uts_ns
232  *                       \    |    /
233  *                        +   -   +
234  *                        user_ns2 (2)
235  *                            |
236  *            cgroup_ns       |       mnt_ns
237  *                     \      |      /
238  *                      x     x     x
239  *                      init_user_ns (1)
240  *
241  * Assume the whole tree is dead but all namespaces are still active:
242  *
243  *                 net_ns          pid_ns
244  *                       \        /
245  *                        -      -
246  *                        user_ns1 (0)
247  *                            |
248  *                 ipc_ns     |     uts_ns
249  *                       \    |    /
250  *                        -   -   -
251  *                        user_ns2 (0)
252  *                            |
253  *            cgroup_ns       |       mnt_ns
254  *                     \      |      /
255  *                      x     x     x
256  *                      init_user_ns (1)
257  *
258  * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
259  *
260  *                 net_ns          pid_ns
261  *                       \        /
262  *                        +      -
263  *                        user_ns1 (0)
264  *                            |
265  *                 ipc_ns     |     uts_ns
266  *                       \    |    /
267  *                        -   +   -
268  *                        user_ns2 (0)
269  *                            |
270  *            cgroup_ns       |       mnt_ns
271  *                     \      |      /
272  *                      x     x     x
273  *                      init_user_ns (1)
274  *
275  * If net_ns had a zero reference count and we bumped it we also need to
276  * take another reference on its owning user namespace. Similarly, if
277  * pid_ns had a zero reference count it also needs to take another
278  * reference on its owning user namespace. So both net_ns and pid_ns
279  * will each have their own reference on the owning user namespace.
280  *
281  * If the owning user namespace user_ns1 had a zero reference count then
282  * it also needs to take another reference on its owning user namespace
283  * and so on.
284  */
285 void __ns_ref_active_get(struct ns_common *ns)
286 {
287 	int prev;
288 
289 	/* Initial namespaces are always active. */
290 	if (is_ns_init_id(ns))
291 		return;
292 
293 	/* If we didn't resurrect the namespace we're done. */
294 	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
295 	VFS_WARN_ON_ONCE(prev < 0);
296 	if (likely(prev))
297 		return;
298 
299 	/*
300 	 * We did resurrect it. Walk the ownership hierarchy upwards
301 	 * until we found an owning user namespace that is active.
302 	 */
303 	for (;;) {
304 		ns = ns_owner(ns);
305 		if (!ns)
306 			return;
307 
308 		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
309 		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
310 		VFS_WARN_ON_ONCE(prev < 0);
311 		if (likely(prev))
312 			return;
313 	}
314 }
315