1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 3 4 #include <linux/ns_common.h> 5 #include <linux/nstree.h> 6 #include <linux/proc_ns.h> 7 #include <linux/user_namespace.h> 8 #include <linux/vfsdebug.h> 9 10 #ifdef CONFIG_DEBUG_VFS 11 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) 12 { 13 switch (ns->ns_type) { 14 #ifdef CONFIG_CGROUPS 15 case CLONE_NEWCGROUP: 16 VFS_WARN_ON_ONCE(ops != &cgroupns_operations); 17 break; 18 #endif 19 #ifdef CONFIG_IPC_NS 20 case CLONE_NEWIPC: 21 VFS_WARN_ON_ONCE(ops != &ipcns_operations); 22 break; 23 #endif 24 case CLONE_NEWNS: 25 VFS_WARN_ON_ONCE(ops != &mntns_operations); 26 break; 27 #ifdef CONFIG_NET_NS 28 case CLONE_NEWNET: 29 VFS_WARN_ON_ONCE(ops != &netns_operations); 30 break; 31 #endif 32 #ifdef CONFIG_PID_NS 33 case CLONE_NEWPID: 34 VFS_WARN_ON_ONCE(ops != &pidns_operations); 35 break; 36 #endif 37 #ifdef CONFIG_TIME_NS 38 case CLONE_NEWTIME: 39 VFS_WARN_ON_ONCE(ops != &timens_operations); 40 break; 41 #endif 42 #ifdef CONFIG_USER_NS 43 case CLONE_NEWUSER: 44 VFS_WARN_ON_ONCE(ops != &userns_operations); 45 break; 46 #endif 47 #ifdef CONFIG_UTS_NS 48 case CLONE_NEWUTS: 49 VFS_WARN_ON_ONCE(ops != &utsns_operations); 50 break; 51 #endif 52 } 53 } 54 #endif 55 56 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 57 { 58 int ret = 0; 59 60 refcount_set(&ns->__ns_ref, 1); 61 ns->stashed = NULL; 62 ns->ops = ops; 63 ns->ns_id = 0; 64 ns->ns_type = ns_type; 65 ns_tree_node_init(&ns->ns_tree_node); 66 ns_tree_node_init(&ns->ns_unified_node); 67 ns_tree_node_init(&ns->ns_owner_node); 68 ns_tree_root_init(&ns->ns_owner_root); 69 70 #ifdef CONFIG_DEBUG_VFS 71 ns_debug(ns, ops); 72 #endif 73 74 if (inum) 75 ns->inum = inum; 76 else 77 ret = proc_alloc_inum(&ns->inum); 78 if (ret) 79 return ret; 80 /* 81 * Tree ref starts at 0. It's incremented when namespace enters 82 * active use (installed in nsproxy) and decremented when all 83 * active uses are gone. Initial namespaces are always active. 84 */ 85 if (is_ns_init_inum(ns)) 86 atomic_set(&ns->__ns_ref_active, 1); 87 else 88 atomic_set(&ns->__ns_ref_active, 0); 89 return 0; 90 } 91 92 void __ns_common_free(struct ns_common *ns) 93 { 94 proc_free_inum(ns->inum); 95 } 96 97 struct ns_common *__must_check ns_owner(struct ns_common *ns) 98 { 99 struct user_namespace *owner; 100 101 if (unlikely(!ns->ops)) 102 return NULL; 103 VFS_WARN_ON_ONCE(!ns->ops->owner); 104 owner = ns->ops->owner(ns); 105 VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); 106 if (!owner) 107 return NULL; 108 /* Skip init_user_ns as it's always active */ 109 if (owner == &init_user_ns) 110 return NULL; 111 return to_ns_common(owner); 112 } 113 114 /* 115 * The active reference count works by having each namespace that gets 116 * created take a single active reference on its owning user namespace. 117 * That single reference is only released once the child namespace's 118 * active count itself goes down. 119 * 120 * A regular namespace tree might look as follow: 121 * Legend: 122 * + : adding active reference 123 * - : dropping active reference 124 * x : always active (initial namespace) 125 * 126 * 127 * net_ns pid_ns 128 * \ / 129 * + + 130 * user_ns1 (2) 131 * | 132 * ipc_ns | uts_ns 133 * \ | / 134 * + + + 135 * user_ns2 (3) 136 * | 137 * cgroup_ns | mnt_ns 138 * \ | / 139 * x x x 140 * init_user_ns (1) 141 * 142 * If both net_ns and pid_ns put their last active reference on 143 * themselves it will cascade to user_ns1 dropping its own active 144 * reference and dropping one active reference on user_ns2: 145 * 146 * net_ns pid_ns 147 * \ / 148 * - - 149 * user_ns1 (0) 150 * | 151 * ipc_ns | uts_ns 152 * \ | / 153 * + - + 154 * user_ns2 (2) 155 * | 156 * cgroup_ns | mnt_ns 157 * \ | / 158 * x x x 159 * init_user_ns (1) 160 * 161 * The iteration stops once we reach a namespace that still has active 162 * references. 163 */ 164 void __ns_ref_active_put(struct ns_common *ns) 165 { 166 /* Initial namespaces are always active. */ 167 if (is_ns_init_id(ns)) 168 return; 169 170 if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 171 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 172 return; 173 } 174 175 VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 176 VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 177 178 for (;;) { 179 ns = ns_owner(ns); 180 if (!ns) 181 return; 182 VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 183 if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 184 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 185 return; 186 } 187 } 188 } 189 190 /* 191 * The active reference count works by having each namespace that gets 192 * created take a single active reference on its owning user namespace. 193 * That single reference is only released once the child namespace's 194 * active count itself goes down. This makes it possible to efficiently 195 * resurrect a namespace tree: 196 * 197 * A regular namespace tree might look as follow: 198 * Legend: 199 * + : adding active reference 200 * - : dropping active reference 201 * x : always active (initial namespace) 202 * 203 * 204 * net_ns pid_ns 205 * \ / 206 * + + 207 * user_ns1 (2) 208 * | 209 * ipc_ns | uts_ns 210 * \ | / 211 * + + + 212 * user_ns2 (3) 213 * | 214 * cgroup_ns | mnt_ns 215 * \ | / 216 * x x x 217 * init_user_ns (1) 218 * 219 * If both net_ns and pid_ns put their last active reference on 220 * themselves it will cascade to user_ns1 dropping its own active 221 * reference and dropping one active reference on user_ns2: 222 * 223 * net_ns pid_ns 224 * \ / 225 * - - 226 * user_ns1 (0) 227 * | 228 * ipc_ns | uts_ns 229 * \ | / 230 * + - + 231 * user_ns2 (2) 232 * | 233 * cgroup_ns | mnt_ns 234 * \ | / 235 * x x x 236 * init_user_ns (1) 237 * 238 * Assume the whole tree is dead but all namespaces are still active: 239 * 240 * net_ns pid_ns 241 * \ / 242 * - - 243 * user_ns1 (0) 244 * | 245 * ipc_ns | uts_ns 246 * \ | / 247 * - - - 248 * user_ns2 (0) 249 * | 250 * cgroup_ns | mnt_ns 251 * \ | / 252 * x x x 253 * init_user_ns (1) 254 * 255 * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): 256 * 257 * net_ns pid_ns 258 * \ / 259 * + - 260 * user_ns1 (0) 261 * | 262 * ipc_ns | uts_ns 263 * \ | / 264 * - + - 265 * user_ns2 (0) 266 * | 267 * cgroup_ns | mnt_ns 268 * \ | / 269 * x x x 270 * init_user_ns (1) 271 * 272 * If net_ns had a zero reference count and we bumped it we also need to 273 * take another reference on its owning user namespace. Similarly, if 274 * pid_ns had a zero reference count it also needs to take another 275 * reference on its owning user namespace. So both net_ns and pid_ns 276 * will each have their own reference on the owning user namespace. 277 * 278 * If the owning user namespace user_ns1 had a zero reference count then 279 * it also needs to take another reference on its owning user namespace 280 * and so on. 281 */ 282 void __ns_ref_active_get(struct ns_common *ns) 283 { 284 int prev; 285 286 /* Initial namespaces are always active. */ 287 if (is_ns_init_id(ns)) 288 return; 289 290 /* If we didn't resurrect the namespace we're done. */ 291 prev = atomic_fetch_add(1, &ns->__ns_ref_active); 292 VFS_WARN_ON_ONCE(prev < 0); 293 if (likely(prev)) 294 return; 295 296 /* 297 * We did resurrect it. Walk the ownership hierarchy upwards 298 * until we found an owning user namespace that is active. 299 */ 300 for (;;) { 301 ns = ns_owner(ns); 302 if (!ns) 303 return; 304 305 VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 306 prev = atomic_fetch_add(1, &ns->__ns_ref_active); 307 VFS_WARN_ON_ONCE(prev < 0); 308 if (likely(prev)) 309 return; 310 } 311 } 312