1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Google */ 3 #include <linux/bpf.h> 4 #include <linux/btf_ids.h> 5 #include <linux/cgroup.h> 6 #include <linux/kernel.h> 7 #include <linux/seq_file.h> 8 9 #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ 10 11 /* cgroup_iter provides four modes of traversal to the cgroup hierarchy. 12 * 13 * 1. Walk the descendants of a cgroup in pre-order. 14 * 2. Walk the descendants of a cgroup in post-order. 15 * 3. Walk the ancestors of a cgroup. 16 * 4. Show the given cgroup only. 17 * 18 * For walking descendants, cgroup_iter can walk in either pre-order or 19 * post-order. For walking ancestors, the iter walks up from a cgroup to 20 * the root. 21 * 22 * The iter program can terminate the walk early by returning 1. Walk 23 * continues if prog returns 0. 24 * 25 * The prog can check (seq->num == 0) to determine whether this is 26 * the first element. The prog may also be passed a NULL cgroup, 27 * which means the walk has completed and the prog has a chance to 28 * do post-processing, such as outputting an epilogue. 29 * 30 * Note: the iter_prog is called with cgroup_mutex held. 31 * 32 * Currently only one session is supported, which means, depending on the 33 * volume of data bpf program intends to send to user space, the number 34 * of cgroups that can be walked is limited. For example, given the current 35 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each 36 * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can 37 * be walked is 512. This is a limitation of cgroup_iter. If the output data 38 * is larger than the kernel buffer size, after all data in the kernel buffer 39 * is consumed by user space, the subsequent read() syscall will signal 40 * EOPNOTSUPP. In order to work around, the user may have to update their 41 * program to reduce the volume of data sent to output. For example, skip 42 * some uninteresting cgroups. 43 */ 44 45 struct bpf_iter__cgroup { 46 __bpf_md_ptr(struct bpf_iter_meta *, meta); 47 __bpf_md_ptr(struct cgroup *, cgroup); 48 }; 49 50 struct cgroup_iter_priv { 51 struct cgroup_subsys_state *start_css; 52 bool visited_all; 53 bool terminate; 54 int order; 55 }; 56 57 static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) 58 { 59 struct cgroup_iter_priv *p = seq->private; 60 61 mutex_lock(&cgroup_mutex); 62 63 /* cgroup_iter doesn't support read across multiple sessions. */ 64 if (*pos > 0) { 65 if (p->visited_all) 66 return NULL; 67 68 /* Haven't visited all, but because cgroup_mutex has dropped, 69 * return -EOPNOTSUPP to indicate incomplete iteration. 70 */ 71 return ERR_PTR(-EOPNOTSUPP); 72 } 73 74 ++*pos; 75 p->terminate = false; 76 p->visited_all = false; 77 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 78 return css_next_descendant_pre(NULL, p->start_css); 79 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 80 return css_next_descendant_post(NULL, p->start_css); 81 else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) 82 return p->start_css; 83 else /* BPF_CGROUP_ITER_SELF_ONLY */ 84 return p->start_css; 85 } 86 87 static int __cgroup_iter_seq_show(struct seq_file *seq, 88 struct cgroup_subsys_state *css, int in_stop); 89 90 static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) 91 { 92 struct cgroup_iter_priv *p = seq->private; 93 94 mutex_unlock(&cgroup_mutex); 95 96 /* pass NULL to the prog for post-processing */ 97 if (!v) { 98 __cgroup_iter_seq_show(seq, NULL, true); 99 p->visited_all = true; 100 } 101 } 102 103 static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) 104 { 105 struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; 106 struct cgroup_iter_priv *p = seq->private; 107 108 ++*pos; 109 if (p->terminate) 110 return NULL; 111 112 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 113 return css_next_descendant_pre(curr, p->start_css); 114 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 115 return css_next_descendant_post(curr, p->start_css); 116 else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) 117 return curr->parent; 118 else /* BPF_CGROUP_ITER_SELF_ONLY */ 119 return NULL; 120 } 121 122 static int __cgroup_iter_seq_show(struct seq_file *seq, 123 struct cgroup_subsys_state *css, int in_stop) 124 { 125 struct cgroup_iter_priv *p = seq->private; 126 struct bpf_iter__cgroup ctx; 127 struct bpf_iter_meta meta; 128 struct bpf_prog *prog; 129 int ret = 0; 130 131 /* cgroup is dead, skip this element */ 132 if (css && cgroup_is_dead(css->cgroup)) 133 return 0; 134 135 ctx.meta = &meta; 136 ctx.cgroup = css ? css->cgroup : NULL; 137 meta.seq = seq; 138 prog = bpf_iter_get_info(&meta, in_stop); 139 if (prog) 140 ret = bpf_iter_run_prog(prog, &ctx); 141 142 /* if prog returns > 0, terminate after this element. */ 143 if (ret != 0) 144 p->terminate = true; 145 146 return 0; 147 } 148 149 static int cgroup_iter_seq_show(struct seq_file *seq, void *v) 150 { 151 return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, 152 false); 153 } 154 155 static const struct seq_operations cgroup_iter_seq_ops = { 156 .start = cgroup_iter_seq_start, 157 .next = cgroup_iter_seq_next, 158 .stop = cgroup_iter_seq_stop, 159 .show = cgroup_iter_seq_show, 160 }; 161 162 BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) 163 164 static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) 165 { 166 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; 167 struct cgroup *cgrp = aux->cgroup.start; 168 169 p->start_css = &cgrp->self; 170 p->terminate = false; 171 p->visited_all = false; 172 p->order = aux->cgroup.order; 173 return 0; 174 } 175 176 static const struct bpf_iter_seq_info cgroup_iter_seq_info = { 177 .seq_ops = &cgroup_iter_seq_ops, 178 .init_seq_private = cgroup_iter_seq_init, 179 .seq_priv_size = sizeof(struct cgroup_iter_priv), 180 }; 181 182 static int bpf_iter_attach_cgroup(struct bpf_prog *prog, 183 union bpf_iter_link_info *linfo, 184 struct bpf_iter_aux_info *aux) 185 { 186 int fd = linfo->cgroup.cgroup_fd; 187 u64 id = linfo->cgroup.cgroup_id; 188 int order = linfo->cgroup.order; 189 struct cgroup *cgrp; 190 191 if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && 192 order != BPF_CGROUP_ITER_DESCENDANTS_POST && 193 order != BPF_CGROUP_ITER_ANCESTORS_UP && 194 order != BPF_CGROUP_ITER_SELF_ONLY) 195 return -EINVAL; 196 197 if (fd && id) 198 return -EINVAL; 199 200 if (fd) 201 cgrp = cgroup_get_from_fd(fd); 202 else if (id) 203 cgrp = cgroup_get_from_id(id); 204 else /* walk the entire hierarchy by default. */ 205 cgrp = cgroup_get_from_path("/"); 206 207 if (IS_ERR(cgrp)) 208 return PTR_ERR(cgrp); 209 210 aux->cgroup.start = cgrp; 211 aux->cgroup.order = order; 212 return 0; 213 } 214 215 static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) 216 { 217 cgroup_put(aux->cgroup.start); 218 } 219 220 static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, 221 struct seq_file *seq) 222 { 223 char *buf; 224 225 buf = kzalloc(PATH_MAX, GFP_KERNEL); 226 if (!buf) { 227 seq_puts(seq, "cgroup_path:\t<unknown>\n"); 228 goto show_order; 229 } 230 231 /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path 232 * will print nothing. 233 * 234 * Path is in the calling process's cgroup namespace. 235 */ 236 cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, 237 current->nsproxy->cgroup_ns); 238 seq_printf(seq, "cgroup_path:\t%s\n", buf); 239 kfree(buf); 240 241 show_order: 242 if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 243 seq_puts(seq, "order: descendants_pre\n"); 244 else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) 245 seq_puts(seq, "order: descendants_post\n"); 246 else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) 247 seq_puts(seq, "order: ancestors_up\n"); 248 else /* BPF_CGROUP_ITER_SELF_ONLY */ 249 seq_puts(seq, "order: self_only\n"); 250 } 251 252 static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, 253 struct bpf_link_info *info) 254 { 255 info->iter.cgroup.order = aux->cgroup.order; 256 info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); 257 return 0; 258 } 259 260 DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, 261 struct cgroup *cgroup) 262 263 static struct bpf_iter_reg bpf_cgroup_reg_info = { 264 .target = "cgroup", 265 .feature = BPF_ITER_RESCHED, 266 .attach_target = bpf_iter_attach_cgroup, 267 .detach_target = bpf_iter_detach_cgroup, 268 .show_fdinfo = bpf_iter_cgroup_show_fdinfo, 269 .fill_link_info = bpf_iter_cgroup_fill_link_info, 270 .ctx_arg_info_size = 1, 271 .ctx_arg_info = { 272 { offsetof(struct bpf_iter__cgroup, cgroup), 273 PTR_TO_BTF_ID_OR_NULL }, 274 }, 275 .seq_info = &cgroup_iter_seq_info, 276 }; 277 278 static int __init bpf_cgroup_iter_init(void) 279 { 280 bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; 281 return bpf_iter_reg_target(&bpf_cgroup_reg_info); 282 } 283 284 late_initcall(bpf_cgroup_iter_init); 285