xref: /linux/kernel/nscommon.c (revision b36d4b6aa88ef039647228b98c59a875e92f8c8e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
3 
4 #include <linux/ns_common.h>
5 #include <linux/proc_ns.h>
6 #include <linux/user_namespace.h>
7 #include <linux/vfsdebug.h>
8 
#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only consistency check between a namespace's type and its
 * operations table.
 *
 * For every namespace type that is compiled in, warn if @ops is not the
 * proc_ns_operations instance belonging to @ns->ns_type. Each type keeps
 * its own warning site. Types that are compiled out, or that are not
 * listed here, are not checked.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
	/* Mount namespaces are not behind a config option. */
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		break;
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		break;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		break;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		break;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		break;
#endif
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		break;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		break;
#endif
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		break;
#endif
	default:
		/* Nothing to verify for this type. */
		break;
	}
}
#endif
54 
55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
56 {
57 	int ret;
58 
59 	refcount_set(&ns->__ns_ref, 1);
60 	ns->stashed = NULL;
61 	ns->ops = ops;
62 	ns->ns_id = 0;
63 	ns->ns_type = ns_type;
64 	RB_CLEAR_NODE(&ns->ns_tree_node);
65 	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
66 	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
67 	INIT_LIST_HEAD(&ns->ns_list_node);
68 	INIT_LIST_HEAD(&ns->ns_unified_list_node);
69 	ns->ns_owner_tree = RB_ROOT;
70 	INIT_LIST_HEAD(&ns->ns_owner);
71 	INIT_LIST_HEAD(&ns->ns_owner_entry);
72 
73 #ifdef CONFIG_DEBUG_VFS
74 	ns_debug(ns, ops);
75 #endif
76 
77 	if (inum) {
78 		ns->inum = inum;
79 		return 0;
80 	}
81 	ret = proc_alloc_inum(&ns->inum);
82 	if (ret)
83 		return ret;
84 	/*
85 	 * Tree ref starts at 0. It's incremented when namespace enters
86 	 * active use (installed in nsproxy) and decremented when all
87 	 * active uses are gone. Initial namespaces are always active.
88 	 */
89 	if (is_initial_namespace(ns))
90 		atomic_set(&ns->__ns_ref_active, 1);
91 	else
92 		atomic_set(&ns->__ns_ref_active, 0);
93 	return 0;
94 }
95 
96 void __ns_common_free(struct ns_common *ns)
97 {
98 	proc_free_inum(ns->inum);
99 }
100 
101 struct ns_common *__must_check ns_owner(struct ns_common *ns)
102 {
103 	struct user_namespace *owner;
104 
105 	if (unlikely(!ns->ops))
106 		return NULL;
107 	VFS_WARN_ON_ONCE(!ns->ops->owner);
108 	owner = ns->ops->owner(ns);
109 	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
110 	if (!owner)
111 		return NULL;
112 	/* Skip init_user_ns as it's always active */
113 	if (owner == &init_user_ns)
114 		return NULL;
115 	return to_ns_common(owner);
116 }
117 
118 void __ns_ref_active_get_owner(struct ns_common *ns)
119 {
120 	ns = ns_owner(ns);
121 	if (ns)
122 		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
123 }
124 
125 /*
126  * The active reference count works by having each namespace that gets
127  * created take a single active reference on its owning user namespace.
128  * That single reference is only released once the child namespace's
129  * active count itself goes down.
130  *
131  * A regular namespace tree might look as follows:
132  * Legend:
133  * + : adding active reference
134  * - : dropping active reference
135  * x : always active (initial namespace)
136  *
137  *
138  *                 net_ns          pid_ns
139  *                       \        /
140  *                        +      +
141  *                        user_ns1 (2)
142  *                            |
143  *                 ipc_ns     |     uts_ns
144  *                       \    |    /
145  *                        +   +   +
146  *                        user_ns2 (3)
147  *                            |
148  *            cgroup_ns       |       mnt_ns
149  *                     \      |      /
150  *                      x     x     x
151  *                      init_user_ns (1)
152  *
153  * If both net_ns and pid_ns put their last active reference on
154  * themselves it will cascade to user_ns1 dropping its own active
155  * reference and dropping one active reference on user_ns2:
156  *
157  *                 net_ns          pid_ns
158  *                       \        /
159  *                        -      -
160  *                        user_ns1 (0)
161  *                            |
162  *                 ipc_ns     |     uts_ns
163  *                       \    |    /
164  *                        +   -   +
165  *                        user_ns2 (2)
166  *                            |
167  *            cgroup_ns       |       mnt_ns
168  *                     \      |      /
169  *                      x     x     x
170  *                      init_user_ns (1)
171  *
172  * The iteration stops once we reach a namespace that still has active
173  * references.
174  */
175 void __ns_ref_active_put_owner(struct ns_common *ns)
176 {
177 	for (;;) {
178 		ns = ns_owner(ns);
179 		if (!ns)
180 			return;
181 		if (!atomic_dec_and_test(&ns->__ns_ref_active))
182 			return;
183 	}
184 }
185 
186 /*
187  * The active reference count works by having each namespace that gets
188  * created take a single active reference on its owning user namespace.
189  * That single reference is only released once the child namespace's
190  * active count itself goes down. This makes it possible to efficiently
191  * resurrect a namespace tree:
192  *
193  * A regular namespace tree might look as follows:
194  * Legend:
195  * + : adding active reference
196  * - : dropping active reference
197  * x : always active (initial namespace)
198  *
199  *
200  *                 net_ns          pid_ns
201  *                       \        /
202  *                        +      +
203  *                        user_ns1 (2)
204  *                            |
205  *                 ipc_ns     |     uts_ns
206  *                       \    |    /
207  *                        +   +   +
208  *                        user_ns2 (3)
209  *                            |
210  *            cgroup_ns       |       mnt_ns
211  *                     \      |      /
212  *                      x     x     x
213  *                      init_user_ns (1)
214  *
215  * If both net_ns and pid_ns put their last active reference on
216  * themselves it will cascade to user_ns1 dropping its own active
217  * reference and dropping one active reference on user_ns2:
218  *
219  *                 net_ns          pid_ns
220  *                       \        /
221  *                        -      -
222  *                        user_ns1 (0)
223  *                            |
224  *                 ipc_ns     |     uts_ns
225  *                       \    |    /
226  *                        +   -   +
227  *                        user_ns2 (2)
228  *                            |
229  *            cgroup_ns       |       mnt_ns
230  *                     \      |      /
231  *                      x     x     x
232  *                      init_user_ns (1)
233  *
234  * Assume the whole tree is dead but all namespaces are still active:
235  *
236  *                 net_ns          pid_ns
237  *                       \        /
238  *                        -      -
239  *                        user_ns1 (0)
240  *                            |
241  *                 ipc_ns     |     uts_ns
242  *                       \    |    /
243  *                        -   -   -
244  *                        user_ns2 (0)
245  *                            |
246  *            cgroup_ns       |       mnt_ns
247  *                     \      |      /
248  *                      x     x     x
249  *                      init_user_ns (1)
250  *
251  * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
252  *
253  *                 net_ns          pid_ns
254  *                       \        /
255  *                        +      -
256  *                        user_ns1 (0)
257  *                            |
258  *                 ipc_ns     |     uts_ns
259  *                       \    |    /
260  *                        -   +   -
261  *                        user_ns2 (0)
262  *                            |
263  *            cgroup_ns       |       mnt_ns
264  *                     \      |      /
265  *                      x     x     x
266  *                      init_user_ns (1)
267  *
268  * If net_ns had a zero reference count and we bumped it we also need to
269  * take another reference on its owning user namespace. Similarly, if
270  * pid_ns had a zero reference count it also needs to take another
271  * reference on its owning user namespace. So both net_ns and pid_ns
272  * will each have their own reference on the owning user namespace.
273  *
274  * If the owning user namespace user_ns1 had a zero reference count then
275  * it also needs to take another reference on its owning user namespace
276  * and so on.
277  */
278 void __ns_ref_active_resurrect(struct ns_common *ns)
279 {
280 	/* If we didn't resurrect the namespace we're done. */
281 	if (atomic_fetch_add(1, &ns->__ns_ref_active))
282 		return;
283 
284 	/*
285 	 * We did resurrect it. Walk the ownership hierarchy upwards
286 	 * until we found an owning user namespace that is active.
287 	 */
288 	for (;;) {
289 		ns = ns_owner(ns);
290 		if (!ns)
291 			return;
292 
293 		if (atomic_fetch_add(1, &ns->__ns_ref_active))
294 			return;
295 	}
296 }
297