1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Google */ 3 #include <linux/bpf.h> 4 #include <linux/btf_ids.h> 5 #include <linux/cgroup.h> 6 #include <linux/kernel.h> 7 #include <linux/seq_file.h> 8 9 #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ 10 11 /* cgroup_iter provides five modes of traversal to the cgroup hierarchy. 12 * 13 * 1. Walk the descendants of a cgroup in pre-order. 14 * 2. Walk the descendants of a cgroup in post-order. 15 * 3. Walk the ancestors of a cgroup. 16 * 4. Show the given cgroup only. 17 * 5. Walk the children of a given parent cgroup. 18 * 19 * For walking descendants, cgroup_iter can walk in either pre-order or 20 * post-order. For walking ancestors, the iter walks up from a cgroup to 21 * the root. 22 * 23 * The iter program can terminate the walk early by returning 1. Walk 24 * continues if prog returns 0. 25 * 26 * The prog can check (seq->num == 0) to determine whether this is 27 * the first element. The prog may also be passed a NULL cgroup, 28 * which means the walk has completed and the prog has a chance to 29 * do post-processing, such as outputting an epilogue. 30 * 31 * Note: the iter_prog is called with cgroup_mutex held. 32 * 33 * Currently only one session is supported, which means, depending on the 34 * volume of data bpf program intends to send to user space, the number 35 * of cgroups that can be walked is limited. For example, given the current 36 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each 37 * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can 38 * be walked is 512. This is a limitation of cgroup_iter. If the output data 39 * is larger than the kernel buffer size, after all data in the kernel buffer 40 * is consumed by user space, the subsequent read() syscall will signal 41 * EOPNOTSUPP. In order to work around, the user may have to update their 42 * program to reduce the volume of data sent to output. For example, skip 43 * some uninteresting cgroups. 44 */ 45 46 struct bpf_iter__cgroup { 47 __bpf_md_ptr(struct bpf_iter_meta *, meta); 48 __bpf_md_ptr(struct cgroup *, cgroup); 49 }; 50 51 struct cgroup_iter_priv { 52 struct cgroup_subsys_state *start_css; 53 bool visited_all; 54 bool terminate; 55 int order; 56 }; 57 58 static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) 59 { 60 struct cgroup_iter_priv *p = seq->private; 61 62 cgroup_lock(); 63 64 /* cgroup_iter doesn't support read across multiple sessions. */ 65 if (*pos > 0) { 66 if (p->visited_all) 67 return NULL; 68 69 /* Haven't visited all, but because cgroup_mutex has dropped, 70 * return -EOPNOTSUPP to indicate incomplete iteration. 71 */ 72 return ERR_PTR(-EOPNOTSUPP); 73 } 74 75 ++*pos; 76 p->terminate = false; 77 p->visited_all = false; 78 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 79 return css_next_descendant_pre(NULL, p->start_css); 80 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 81 return css_next_descendant_post(NULL, p->start_css); 82 else if (p->order == BPF_CGROUP_ITER_CHILDREN) 83 return css_next_child(NULL, p->start_css); 84 else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ 85 return p->start_css; 86 } 87 88 static int __cgroup_iter_seq_show(struct seq_file *seq, 89 struct cgroup_subsys_state *css, int in_stop); 90 91 static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) 92 { 93 struct cgroup_iter_priv *p = seq->private; 94 95 cgroup_unlock(); 96 97 /* pass NULL to the prog for post-processing */ 98 if (!v) { 99 __cgroup_iter_seq_show(seq, NULL, true); 100 p->visited_all = true; 101 } 102 } 103 104 static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) 105 { 106 struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; 107 struct cgroup_iter_priv *p = seq->private; 108 109 ++*pos; 110 if (p->terminate) 111 return NULL; 112 113 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 114 return css_next_descendant_pre(curr, p->start_css); 115 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 116 return css_next_descendant_post(curr, p->start_css); 117 else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) 118 return curr->parent; 119 else if (p->order == BPF_CGROUP_ITER_CHILDREN) 120 return css_next_child(curr, p->start_css); 121 else /* BPF_CGROUP_ITER_SELF_ONLY */ 122 return NULL; 123 } 124 125 static int __cgroup_iter_seq_show(struct seq_file *seq, 126 struct cgroup_subsys_state *css, int in_stop) 127 { 128 struct cgroup_iter_priv *p = seq->private; 129 struct bpf_iter__cgroup ctx; 130 struct bpf_iter_meta meta; 131 struct bpf_prog *prog; 132 int ret = 0; 133 134 /* cgroup is dead, skip this element */ 135 if (css && cgroup_is_dead(css->cgroup)) 136 return 0; 137 138 ctx.meta = &meta; 139 ctx.cgroup = css ? css->cgroup : NULL; 140 meta.seq = seq; 141 prog = bpf_iter_get_info(&meta, in_stop); 142 if (prog) 143 ret = bpf_iter_run_prog(prog, &ctx); 144 145 /* if prog returns > 0, terminate after this element. */ 146 if (ret != 0) 147 p->terminate = true; 148 149 return 0; 150 } 151 152 static int cgroup_iter_seq_show(struct seq_file *seq, void *v) 153 { 154 return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, 155 false); 156 } 157 158 static const struct seq_operations cgroup_iter_seq_ops = { 159 .start = cgroup_iter_seq_start, 160 .next = cgroup_iter_seq_next, 161 .stop = cgroup_iter_seq_stop, 162 .show = cgroup_iter_seq_show, 163 }; 164 165 BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup) 166 167 static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) 168 { 169 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; 170 struct cgroup *cgrp = aux->cgroup.start; 171 172 /* bpf_iter_attach_cgroup() has already acquired an extra reference 173 * for the start cgroup, but the reference may be released after 174 * cgroup_iter_seq_init(), so acquire another reference for the 175 * start cgroup. 176 */ 177 p->start_css = &cgrp->self; 178 css_get(p->start_css); 179 p->terminate = false; 180 p->visited_all = false; 181 p->order = aux->cgroup.order; 182 return 0; 183 } 184 185 static void cgroup_iter_seq_fini(void *priv) 186 { 187 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; 188 189 css_put(p->start_css); 190 } 191 192 static const struct bpf_iter_seq_info cgroup_iter_seq_info = { 193 .seq_ops = &cgroup_iter_seq_ops, 194 .init_seq_private = cgroup_iter_seq_init, 195 .fini_seq_private = cgroup_iter_seq_fini, 196 .seq_priv_size = sizeof(struct cgroup_iter_priv), 197 }; 198 199 static int bpf_iter_attach_cgroup(struct bpf_prog *prog, 200 union bpf_iter_link_info *linfo, 201 struct bpf_iter_aux_info *aux) 202 { 203 int fd = linfo->cgroup.cgroup_fd; 204 u64 id = linfo->cgroup.cgroup_id; 205 int order = linfo->cgroup.order; 206 struct cgroup *cgrp; 207 208 switch (order) { 209 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 210 case BPF_CGROUP_ITER_DESCENDANTS_POST: 211 case BPF_CGROUP_ITER_ANCESTORS_UP: 212 case BPF_CGROUP_ITER_SELF_ONLY: 213 case BPF_CGROUP_ITER_CHILDREN: 214 break; 215 default: 216 return -EINVAL; 217 } 218 219 if (fd && id) 220 return -EINVAL; 221 222 if (fd) 223 cgrp = cgroup_v1v2_get_from_fd(fd); 224 else if (id) 225 cgrp = cgroup_get_from_id(id); 226 else /* walk the entire hierarchy by default. */ 227 cgrp = cgroup_get_from_path("/"); 228 229 if (IS_ERR(cgrp)) 230 return PTR_ERR(cgrp); 231 232 aux->cgroup.start = cgrp; 233 aux->cgroup.order = order; 234 return 0; 235 } 236 237 static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) 238 { 239 cgroup_put(aux->cgroup.start); 240 } 241 242 static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, 243 struct seq_file *seq) 244 { 245 char *buf; 246 247 buf = kzalloc(PATH_MAX, GFP_KERNEL); 248 if (!buf) { 249 seq_puts(seq, "cgroup_path:\t<unknown>\n"); 250 goto show_order; 251 } 252 253 /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path 254 * will print nothing. 255 * 256 * Path is in the calling process's cgroup namespace. 257 */ 258 cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, 259 current->nsproxy->cgroup_ns); 260 seq_printf(seq, "cgroup_path:\t%s\n", buf); 261 kfree(buf); 262 263 show_order: 264 if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 265 seq_puts(seq, "order: descendants_pre\n"); 266 else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) 267 seq_puts(seq, "order: descendants_post\n"); 268 else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) 269 seq_puts(seq, "order: ancestors_up\n"); 270 else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) 271 seq_puts(seq, "order: children\n"); 272 else /* BPF_CGROUP_ITER_SELF_ONLY */ 273 seq_puts(seq, "order: self_only\n"); 274 } 275 276 static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, 277 struct bpf_link_info *info) 278 { 279 info->iter.cgroup.order = aux->cgroup.order; 280 info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); 281 return 0; 282 } 283 284 DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, 285 struct cgroup *cgroup) 286 287 static struct bpf_iter_reg bpf_cgroup_reg_info = { 288 .target = "cgroup", 289 .feature = BPF_ITER_RESCHED, 290 .attach_target = bpf_iter_attach_cgroup, 291 .detach_target = bpf_iter_detach_cgroup, 292 .show_fdinfo = bpf_iter_cgroup_show_fdinfo, 293 .fill_link_info = bpf_iter_cgroup_fill_link_info, 294 .ctx_arg_info_size = 1, 295 .ctx_arg_info = { 296 { offsetof(struct bpf_iter__cgroup, cgroup), 297 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 298 }, 299 .seq_info = &cgroup_iter_seq_info, 300 }; 301 302 static int __init bpf_cgroup_iter_init(void) 303 { 304 bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; 305 return bpf_iter_reg_target(&bpf_cgroup_reg_info); 306 } 307 308 late_initcall(bpf_cgroup_iter_init); 309 310 struct bpf_iter_css { 311 __u64 __opaque[3]; 312 } __attribute__((aligned(8))); 313 314 struct bpf_iter_css_kern { 315 struct cgroup_subsys_state *start; 316 struct cgroup_subsys_state *pos; 317 unsigned int flags; 318 } __attribute__((aligned(8))); 319 320 __bpf_kfunc_start_defs(); 321 322 __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, 323 struct cgroup_subsys_state *start, unsigned int flags) 324 { 325 struct bpf_iter_css_kern *kit = (void *)it; 326 327 BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css)); 328 BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css)); 329 330 kit->start = NULL; 331 switch (flags) { 332 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 333 case BPF_CGROUP_ITER_DESCENDANTS_POST: 334 case BPF_CGROUP_ITER_ANCESTORS_UP: 335 case BPF_CGROUP_ITER_CHILDREN: 336 break; 337 default: 338 return -EINVAL; 339 } 340 341 kit->start = start; 342 kit->pos = NULL; 343 kit->flags = flags; 344 return 0; 345 } 346 347 __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) 348 { 349 struct bpf_iter_css_kern *kit = (void *)it; 350 351 if (!kit->start) 352 return NULL; 353 354 switch (kit->flags) { 355 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 356 kit->pos = css_next_descendant_pre(kit->pos, kit->start); 357 break; 358 case BPF_CGROUP_ITER_DESCENDANTS_POST: 359 kit->pos = css_next_descendant_post(kit->pos, kit->start); 360 break; 361 case BPF_CGROUP_ITER_CHILDREN: 362 kit->pos = css_next_child(kit->pos, kit->start); 363 break; 364 case BPF_CGROUP_ITER_ANCESTORS_UP: 365 kit->pos = kit->pos ? kit->pos->parent : kit->start; 366 } 367 368 return kit->pos; 369 } 370 371 __bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) 372 { 373 } 374 375 __bpf_kfunc_end_defs(); 376