1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 3 4 #include <linux/ns_common.h> 5 #include <linux/proc_ns.h> 6 #include <linux/user_namespace.h> 7 #include <linux/vfsdebug.h> 8 9 #ifdef CONFIG_DEBUG_VFS 10 static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) 11 { 12 switch (ns->ns_type) { 13 #ifdef CONFIG_CGROUPS 14 case CLONE_NEWCGROUP: 15 VFS_WARN_ON_ONCE(ops != &cgroupns_operations); 16 break; 17 #endif 18 #ifdef CONFIG_IPC_NS 19 case CLONE_NEWIPC: 20 VFS_WARN_ON_ONCE(ops != &ipcns_operations); 21 break; 22 #endif 23 case CLONE_NEWNS: 24 VFS_WARN_ON_ONCE(ops != &mntns_operations); 25 break; 26 #ifdef CONFIG_NET_NS 27 case CLONE_NEWNET: 28 VFS_WARN_ON_ONCE(ops != &netns_operations); 29 break; 30 #endif 31 #ifdef CONFIG_PID_NS 32 case CLONE_NEWPID: 33 VFS_WARN_ON_ONCE(ops != &pidns_operations); 34 break; 35 #endif 36 #ifdef CONFIG_TIME_NS 37 case CLONE_NEWTIME: 38 VFS_WARN_ON_ONCE(ops != &timens_operations); 39 break; 40 #endif 41 #ifdef CONFIG_USER_NS 42 case CLONE_NEWUSER: 43 VFS_WARN_ON_ONCE(ops != &userns_operations); 44 break; 45 #endif 46 #ifdef CONFIG_UTS_NS 47 case CLONE_NEWUTS: 48 VFS_WARN_ON_ONCE(ops != &utsns_operations); 49 break; 50 #endif 51 } 52 } 53 #endif 54 55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 56 { 57 int ret; 58 59 refcount_set(&ns->__ns_ref, 1); 60 ns->stashed = NULL; 61 ns->ops = ops; 62 ns->ns_id = 0; 63 ns->ns_type = ns_type; 64 RB_CLEAR_NODE(&ns->ns_tree_node); 65 RB_CLEAR_NODE(&ns->ns_unified_tree_node); 66 RB_CLEAR_NODE(&ns->ns_owner_tree_node); 67 INIT_LIST_HEAD(&ns->ns_list_node); 68 INIT_LIST_HEAD(&ns->ns_unified_list_node); 69 ns->ns_owner_tree = RB_ROOT; 70 INIT_LIST_HEAD(&ns->ns_owner); 71 INIT_LIST_HEAD(&ns->ns_owner_entry); 72 73 #ifdef CONFIG_DEBUG_VFS 74 ns_debug(ns, ops); 75 #endif 76 77 if (inum) { 78 ns->inum = inum; 79 return 0; 80 } 81 ret = proc_alloc_inum(&ns->inum); 82 if (ret) 83 return ret; 84 /* 85 * Tree ref starts at 0. It's incremented when namespace enters 86 * active use (installed in nsproxy) and decremented when all 87 * active uses are gone. Initial namespaces are always active. 88 */ 89 if (is_initial_namespace(ns)) 90 atomic_set(&ns->__ns_ref_active, 1); 91 else 92 atomic_set(&ns->__ns_ref_active, 0); 93 return 0; 94 } 95 96 void __ns_common_free(struct ns_common *ns) 97 { 98 proc_free_inum(ns->inum); 99 } 100 101 struct ns_common *__must_check ns_owner(struct ns_common *ns) 102 { 103 struct user_namespace *owner; 104 105 if (unlikely(!ns->ops)) 106 return NULL; 107 VFS_WARN_ON_ONCE(!ns->ops->owner); 108 owner = ns->ops->owner(ns); 109 VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); 110 if (!owner) 111 return NULL; 112 /* Skip init_user_ns as it's always active */ 113 if (owner == &init_user_ns) 114 return NULL; 115 return to_ns_common(owner); 116 } 117 118 void __ns_ref_active_get_owner(struct ns_common *ns) 119 { 120 ns = ns_owner(ns); 121 if (ns) 122 WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); 123 } 124 125 /* 126 * The active reference count works by having each namespace that gets 127 * created take a single active reference on its owning user namespace. 128 * That single reference is only released once the child namespace's 129 * active count itself goes down. 130 * 131 * A regular namespace tree might look as follow: 132 * Legend: 133 * + : adding active reference 134 * - : dropping active reference 135 * x : always active (initial namespace) 136 * 137 * 138 * net_ns pid_ns 139 * \ / 140 * + + 141 * user_ns1 (2) 142 * | 143 * ipc_ns | uts_ns 144 * \ | / 145 * + + + 146 * user_ns2 (3) 147 * | 148 * cgroup_ns | mnt_ns 149 * \ | / 150 * x x x 151 * init_user_ns (1) 152 * 153 * If both net_ns and pid_ns put their last active reference on 154 * themselves it will cascade to user_ns1 dropping its own active 155 * reference and dropping one active reference on user_ns2: 156 * 157 * net_ns pid_ns 158 * \ / 159 * - - 160 * user_ns1 (0) 161 * | 162 * ipc_ns | uts_ns 163 * \ | / 164 * + - + 165 * user_ns2 (2) 166 * | 167 * cgroup_ns | mnt_ns 168 * \ | / 169 * x x x 170 * init_user_ns (1) 171 * 172 * The iteration stops once we reach a namespace that still has active 173 * references. 174 */ 175 void __ns_ref_active_put_owner(struct ns_common *ns) 176 { 177 for (;;) { 178 ns = ns_owner(ns); 179 if (!ns) 180 return; 181 if (!atomic_dec_and_test(&ns->__ns_ref_active)) 182 return; 183 } 184 } 185 186 /* 187 * The active reference count works by having each namespace that gets 188 * created take a single active reference on its owning user namespace. 189 * That single reference is only released once the child namespace's 190 * active count itself goes down. This makes it possible to efficiently 191 * resurrect a namespace tree: 192 * 193 * A regular namespace tree might look as follow: 194 * Legend: 195 * + : adding active reference 196 * - : dropping active reference 197 * x : always active (initial namespace) 198 * 199 * 200 * net_ns pid_ns 201 * \ / 202 * + + 203 * user_ns1 (2) 204 * | 205 * ipc_ns | uts_ns 206 * \ | / 207 * + + + 208 * user_ns2 (3) 209 * | 210 * cgroup_ns | mnt_ns 211 * \ | / 212 * x x x 213 * init_user_ns (1) 214 * 215 * If both net_ns and pid_ns put their last active reference on 216 * themselves it will cascade to user_ns1 dropping its own active 217 * reference and dropping one active reference on user_ns2: 218 * 219 * net_ns pid_ns 220 * \ / 221 * - - 222 * user_ns1 (0) 223 * | 224 * ipc_ns | uts_ns 225 * \ | / 226 * + - + 227 * user_ns2 (2) 228 * | 229 * cgroup_ns | mnt_ns 230 * \ | / 231 * x x x 232 * init_user_ns (1) 233 * 234 * Assume the whole tree is dead but all namespaces are still active: 235 * 236 * net_ns pid_ns 237 * \ / 238 * - - 239 * user_ns1 (0) 240 * | 241 * ipc_ns | uts_ns 242 * \ | / 243 * - - - 244 * user_ns2 (0) 245 * | 246 * cgroup_ns | mnt_ns 247 * \ | / 248 * x x x 249 * init_user_ns (1) 250 * 251 * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): 252 * 253 * net_ns pid_ns 254 * \ / 255 * + - 256 * user_ns1 (0) 257 * | 258 * ipc_ns | uts_ns 259 * \ | / 260 * - + - 261 * user_ns2 (0) 262 * | 263 * cgroup_ns | mnt_ns 264 * \ | / 265 * x x x 266 * init_user_ns (1) 267 * 268 * If net_ns had a zero reference count and we bumped it we also need to 269 * take another reference on its owning user namespace. Similarly, if 270 * pid_ns had a zero reference count it also needs to take another 271 * reference on its owning user namespace. So both net_ns and pid_ns 272 * will each have their own reference on the owning user namespace. 273 * 274 * If the owning user namespace user_ns1 had a zero reference count then 275 * it also needs to take another reference on its owning user namespace 276 * and so on. 277 */ 278 void __ns_ref_active_resurrect(struct ns_common *ns) 279 { 280 /* If we didn't resurrect the namespace we're done. */ 281 if (atomic_fetch_add(1, &ns->__ns_ref_active)) 282 return; 283 284 /* 285 * We did resurrect it. Walk the ownership hierarchy upwards 286 * until we found an owning user namespace that is active. 287 */ 288 for (;;) { 289 ns = ns_owner(ns); 290 if (!ns) 291 return; 292 293 if (atomic_fetch_add(1, &ns->__ns_ref_active)) 294 return; 295 } 296 } 297