1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 3 4 #include <linux/ns_common.h> 5 #include <linux/proc_ns.h> 6 #include <linux/user_namespace.h> 7 #include <linux/vfsdebug.h> 8 9 #ifdef CONFIG_DEBUG_VFS 10 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) 11 { 12 switch (ns->ns_type) { 13 #ifdef CONFIG_CGROUPS 14 case CLONE_NEWCGROUP: 15 VFS_WARN_ON_ONCE(ops != &cgroupns_operations); 16 break; 17 #endif 18 #ifdef CONFIG_IPC_NS 19 case CLONE_NEWIPC: 20 VFS_WARN_ON_ONCE(ops != &ipcns_operations); 21 break; 22 #endif 23 case CLONE_NEWNS: 24 VFS_WARN_ON_ONCE(ops != &mntns_operations); 25 break; 26 #ifdef CONFIG_NET_NS 27 case CLONE_NEWNET: 28 VFS_WARN_ON_ONCE(ops != &netns_operations); 29 break; 30 #endif 31 #ifdef CONFIG_PID_NS 32 case CLONE_NEWPID: 33 VFS_WARN_ON_ONCE(ops != &pidns_operations); 34 break; 35 #endif 36 #ifdef CONFIG_TIME_NS 37 case CLONE_NEWTIME: 38 VFS_WARN_ON_ONCE(ops != &timens_operations); 39 break; 40 #endif 41 #ifdef CONFIG_USER_NS 42 case CLONE_NEWUSER: 43 VFS_WARN_ON_ONCE(ops != &userns_operations); 44 break; 45 #endif 46 #ifdef CONFIG_UTS_NS 47 case CLONE_NEWUTS: 48 VFS_WARN_ON_ONCE(ops != &utsns_operations); 49 break; 50 #endif 51 } 52 } 53 #endif 54 55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 56 { 57 int ret = 0; 58 59 refcount_set(&ns->__ns_ref, 1); 60 ns->stashed = NULL; 61 ns->ops = ops; 62 ns->ns_id = 0; 63 ns->ns_type = ns_type; 64 RB_CLEAR_NODE(&ns->ns_tree_node); 65 RB_CLEAR_NODE(&ns->ns_unified_tree_node); 66 RB_CLEAR_NODE(&ns->ns_owner_tree_node); 67 INIT_LIST_HEAD(&ns->ns_list_node); 68 INIT_LIST_HEAD(&ns->ns_unified_list_node); 69 ns->ns_owner_tree = RB_ROOT; 70 INIT_LIST_HEAD(&ns->ns_owner); 71 INIT_LIST_HEAD(&ns->ns_owner_entry); 72 73 #ifdef CONFIG_DEBUG_VFS 74 ns_debug(ns, ops); 75 #endif 76 77 if (inum) 78 ns->inum = inum; 79 else 80 ret = proc_alloc_inum(&ns->inum); 81 if 
(ret) 82 return ret; 83 /* 84 * Tree ref starts at 0. It's incremented when namespace enters 85 * active use (installed in nsproxy) and decremented when all 86 * active uses are gone. Initial namespaces are always active. 87 */ 88 if (is_initial_namespace(ns)) 89 atomic_set(&ns->__ns_ref_active, 1); 90 else 91 atomic_set(&ns->__ns_ref_active, 0); 92 return 0; 93 } 94 95 void __ns_common_free(struct ns_common *ns) 96 { 97 proc_free_inum(ns->inum); 98 } 99 100 struct ns_common *__must_check ns_owner(struct ns_common *ns) 101 { 102 struct user_namespace *owner; 103 104 if (unlikely(!ns->ops)) 105 return NULL; 106 VFS_WARN_ON_ONCE(!ns->ops->owner); 107 owner = ns->ops->owner(ns); 108 VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); 109 if (!owner) 110 return NULL; 111 /* Skip init_user_ns as it's always active */ 112 if (owner == &init_user_ns) 113 return NULL; 114 return to_ns_common(owner); 115 } 116 117 /* 118 * The active reference count works by having each namespace that gets 119 * created take a single active reference on its owning user namespace. 120 * That single reference is only released once the child namespace's 121 * active count itself goes down. 
122 * 123 * A regular namespace tree might look as follow: 124 * Legend: 125 * + : adding active reference 126 * - : dropping active reference 127 * x : always active (initial namespace) 128 * 129 * 130 * net_ns pid_ns 131 * \ / 132 * + + 133 * user_ns1 (2) 134 * | 135 * ipc_ns | uts_ns 136 * \ | / 137 * + + + 138 * user_ns2 (3) 139 * | 140 * cgroup_ns | mnt_ns 141 * \ | / 142 * x x x 143 * init_user_ns (1) 144 * 145 * If both net_ns and pid_ns put their last active reference on 146 * themselves it will cascade to user_ns1 dropping its own active 147 * reference and dropping one active reference on user_ns2: 148 * 149 * net_ns pid_ns 150 * \ / 151 * - - 152 * user_ns1 (0) 153 * | 154 * ipc_ns | uts_ns 155 * \ | / 156 * + - + 157 * user_ns2 (2) 158 * | 159 * cgroup_ns | mnt_ns 160 * \ | / 161 * x x x 162 * init_user_ns (1) 163 * 164 * The iteration stops once we reach a namespace that still has active 165 * references. 166 */ 167 void __ns_ref_active_put(struct ns_common *ns) 168 { 169 /* Initial namespaces are always active. */ 170 if (is_ns_init_id(ns)) 171 return; 172 173 if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 174 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 175 return; 176 } 177 178 VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 179 VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 180 181 for (;;) { 182 ns = ns_owner(ns); 183 if (!ns) 184 return; 185 VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 186 if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 187 VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 188 return; 189 } 190 } 191 } 192 193 /* 194 * The active reference count works by having each namespace that gets 195 * created take a single active reference on its owning user namespace. 196 * That single reference is only released once the child namespace's 197 * active count itself goes down. 
This makes it possible to efficiently
 * resurrect a namespace tree:
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Assume the whole tree is dead but all namespaces are still active:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   -   -
 *                        user_ns2 (0)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      -
 *                        user_ns1 (1)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   +   -
 *                        user_ns2 (1)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If net_ns had a zero reference count and we bumped it we also need to
 * take another reference on its owning user namespace. Similarly, if
 * pid_ns had a zero reference count it also needs to take another
 * reference on its owning user namespace. So both net_ns and pid_ns
 * will each have their own reference on the owning user namespace.
 *
 * If the owning user namespace user_ns1 had a zero reference count then
 * it also needs to take another reference on its owning user namespace
 * and so on.
 *
 * Namespaces for which is_ns_init_id() is true are never incremented,
 * and the upward walk ends at the first owner whose previous count was
 * non-zero or as soon as ns_owner() returns NULL.
 */
void __ns_ref_active_get(struct ns_common *ns)
{
	int prev;

	/* Initial namespaces are always active. */
	if (is_ns_init_id(ns))
		return;

	/* If we didn't resurrect the namespace we're done. */
	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
	VFS_WARN_ON_ONCE(prev < 0);
	if (likely(prev))
		return;

	/*
	 * We did resurrect it. Walk the ownership hierarchy upwards
	 * until we find an owning user namespace that is active.
	 */
	for (;;) {
		ns = ns_owner(ns);
		if (!ns)
			return;

		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		/* prev == 0 means this owner was resurrected too: keep walking. */
		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
		VFS_WARN_ON_ONCE(prev < 0);
		if (likely(prev))
			return;
	}
}