1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
3
4 #include <linux/ns_common.h>
5 #include <linux/nstree.h>
6 #include <linux/proc_ns.h>
7 #include <linux/user_namespace.h>
8 #include <linux/vfsdebug.h>
9
10 #ifdef CONFIG_DEBUG_VFS
ns_debug(struct ns_common * ns,const struct proc_ns_operations * ops)11 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
12 {
13 switch (ns->ns_type) {
14 #ifdef CONFIG_CGROUPS
15 case CLONE_NEWCGROUP:
16 VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
17 break;
18 #endif
19 #ifdef CONFIG_IPC_NS
20 case CLONE_NEWIPC:
21 VFS_WARN_ON_ONCE(ops != &ipcns_operations);
22 break;
23 #endif
24 case CLONE_NEWNS:
25 VFS_WARN_ON_ONCE(ops != &mntns_operations);
26 break;
27 #ifdef CONFIG_NET_NS
28 case CLONE_NEWNET:
29 VFS_WARN_ON_ONCE(ops != &netns_operations);
30 break;
31 #endif
32 #ifdef CONFIG_PID_NS
33 case CLONE_NEWPID:
34 VFS_WARN_ON_ONCE(ops != &pidns_operations);
35 break;
36 #endif
37 #ifdef CONFIG_TIME_NS
38 case CLONE_NEWTIME:
39 VFS_WARN_ON_ONCE(ops != &timens_operations);
40 break;
41 #endif
42 #ifdef CONFIG_USER_NS
43 case CLONE_NEWUSER:
44 VFS_WARN_ON_ONCE(ops != &userns_operations);
45 break;
46 #endif
47 #ifdef CONFIG_UTS_NS
48 case CLONE_NEWUTS:
49 VFS_WARN_ON_ONCE(ops != &utsns_operations);
50 break;
51 #endif
52 }
53 }
54 #endif
55
/*
 * Initialise the common part of a namespace.
 *
 * @ns:      namespace to initialise
 * @ns_type: namespace type value stored in @ns->ns_type
 * @ops:     operations table stored in @ns->ops
 * @inum:    inode number to use; if zero one is allocated through
 *           proc_alloc_inum()
 *
 * The reference count starts at 1, the namespace id at 0 and all tree
 * nodes are initialised.
 *
 * Return: 0 on success, otherwise the error returned by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	refcount_set(&ns->__ns_ref, 1);
	ns->stashed = NULL;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->ns_type = ns_type;

	ns_tree_node_init(&ns->ns_tree_node);
	ns_tree_node_init(&ns->ns_unified_node);
	ns_tree_node_init(&ns->ns_owner_node);
	ns_tree_root_init(&ns->ns_owner_root);

#ifdef CONFIG_DEBUG_VFS
	/* Needs ns->ns_type, so it has to run after the assignments above. */
	ns_debug(ns, ops);
#endif

	if (inum) {
		/* Caller supplied the inode number. */
		ns->inum = inum;
	} else {
		int err = proc_alloc_inum(&ns->inum);

		if (err)
			return err;
	}

	/*
	 * The active reference count starts at 0. It is incremented when
	 * the namespace enters active use (installed in nsproxy) and
	 * decremented when all active uses are gone. Initial namespaces
	 * are always active and therefore start at 1.
	 */
	atomic_set(&ns->__ns_ref_active, is_ns_init_inum(ns) ? 1 : 0);
	return 0;
}
91
__ns_common_free(struct ns_common * ns)92 void __ns_common_free(struct ns_common *ns)
93 {
94 proc_free_inum(ns->inum);
95 }
96
ns_owner(struct ns_common * ns)97 struct ns_common *__must_check ns_owner(struct ns_common *ns)
98 {
99 struct user_namespace *owner;
100
101 if (unlikely(!ns->ops))
102 return NULL;
103 VFS_WARN_ON_ONCE(!ns->ops->owner);
104 owner = ns->ops->owner(ns);
105 VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
106 if (!owner)
107 return NULL;
108 /* Skip init_user_ns as it's always active */
109 if (owner == &init_user_ns)
110 return NULL;
111 return to_ns_common(owner);
112 }
113
114 /*
115 * The active reference count works by having each namespace that gets
116 * created take a single active reference on its owning user namespace.
117 * That single reference is only released once the child namespace's
118 * active count itself goes down.
119 *
 * A regular namespace tree might look as follows:
121 * Legend:
122 * + : adding active reference
123 * - : dropping active reference
124 * x : always active (initial namespace)
125 *
126 *
127 * net_ns pid_ns
128 * \ /
129 * + +
130 * user_ns1 (2)
131 * |
132 * ipc_ns | uts_ns
133 * \ | /
134 * + + +
135 * user_ns2 (3)
136 * |
137 * cgroup_ns | mnt_ns
138 * \ | /
139 * x x x
140 * init_user_ns (1)
141 *
142 * If both net_ns and pid_ns put their last active reference on
143 * themselves it will cascade to user_ns1 dropping its own active
144 * reference and dropping one active reference on user_ns2:
145 *
146 * net_ns pid_ns
147 * \ /
148 * - -
149 * user_ns1 (0)
150 * |
151 * ipc_ns | uts_ns
152 * \ | /
153 * + - +
154 * user_ns2 (2)
155 * |
156 * cgroup_ns | mnt_ns
157 * \ | /
158 * x x x
159 * init_user_ns (1)
160 *
161 * The iteration stops once we reach a namespace that still has active
162 * references.
163 */
__ns_ref_active_put(struct ns_common * ns)164 void __ns_ref_active_put(struct ns_common *ns)
165 {
166 /* Initial namespaces are always active. */
167 if (is_ns_init_id(ns))
168 return;
169
170 if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
171 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
172 return;
173 }
174
175 VFS_WARN_ON_ONCE(is_ns_init_id(ns));
176 VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
177
178 for (;;) {
179 ns = ns_owner(ns);
180 if (!ns)
181 return;
182 VFS_WARN_ON_ONCE(is_ns_init_id(ns));
183 if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
184 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
185 return;
186 }
187 }
188 }
189
190 /*
191 * The active reference count works by having each namespace that gets
192 * created take a single active reference on its owning user namespace.
193 * That single reference is only released once the child namespace's
194 * active count itself goes down. This makes it possible to efficiently
195 * resurrect a namespace tree:
196 *
 * A regular namespace tree might look as follows:
198 * Legend:
199 * + : adding active reference
200 * - : dropping active reference
201 * x : always active (initial namespace)
202 *
203 *
204 * net_ns pid_ns
205 * \ /
206 * + +
207 * user_ns1 (2)
208 * |
209 * ipc_ns | uts_ns
210 * \ | /
211 * + + +
212 * user_ns2 (3)
213 * |
214 * cgroup_ns | mnt_ns
215 * \ | /
216 * x x x
217 * init_user_ns (1)
218 *
219 * If both net_ns and pid_ns put their last active reference on
220 * themselves it will cascade to user_ns1 dropping its own active
221 * reference and dropping one active reference on user_ns2:
222 *
223 * net_ns pid_ns
224 * \ /
225 * - -
226 * user_ns1 (0)
227 * |
228 * ipc_ns | uts_ns
229 * \ | /
230 * + - +
231 * user_ns2 (2)
232 * |
233 * cgroup_ns | mnt_ns
234 * \ | /
235 * x x x
236 * init_user_ns (1)
237 *
 * Assume the whole tree is dead, i.e., every active reference count has
 * dropped to zero, but the namespaces themselves have not been freed yet:
239 *
240 * net_ns pid_ns
241 * \ /
242 * - -
243 * user_ns1 (0)
244 * |
245 * ipc_ns | uts_ns
246 * \ | /
247 * - - -
248 * user_ns2 (0)
249 * |
250 * cgroup_ns | mnt_ns
251 * \ | /
252 * x x x
253 * init_user_ns (1)
254 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
256 *
257 * net_ns pid_ns
258 * \ /
259 * + -
260 * user_ns1 (0)
261 * |
262 * ipc_ns | uts_ns
263 * \ | /
264 * - + -
265 * user_ns2 (0)
266 * |
267 * cgroup_ns | mnt_ns
268 * \ | /
269 * x x x
270 * init_user_ns (1)
271 *
272 * If net_ns had a zero reference count and we bumped it we also need to
273 * take another reference on its owning user namespace. Similarly, if
274 * pid_ns had a zero reference count it also needs to take another
275 * reference on its owning user namespace. So both net_ns and pid_ns
276 * will each have their own reference on the owning user namespace.
277 *
278 * If the owning user namespace user_ns1 had a zero reference count then
279 * it also needs to take another reference on its owning user namespace
280 * and so on.
281 */
__ns_ref_active_get(struct ns_common * ns)282 void __ns_ref_active_get(struct ns_common *ns)
283 {
284 int prev;
285
286 /* Initial namespaces are always active. */
287 if (is_ns_init_id(ns))
288 return;
289
290 /* If we didn't resurrect the namespace we're done. */
291 prev = atomic_fetch_add(1, &ns->__ns_ref_active);
292 VFS_WARN_ON_ONCE(prev < 0);
293 if (likely(prev))
294 return;
295
296 /*
297 * We did resurrect it. Walk the ownership hierarchy upwards
298 * until we found an owning user namespace that is active.
299 */
300 for (;;) {
301 ns = ns_owner(ns);
302 if (!ns)
303 return;
304
305 VFS_WARN_ON_ONCE(is_ns_init_id(ns));
306 prev = atomic_fetch_add(1, &ns->__ns_ref_active);
307 VFS_WARN_ON_ONCE(prev < 0);
308 if (likely(prev))
309 return;
310 }
311 }
312