// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Google */
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>

#include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */

/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
 *
 *  1. Walk the descendants of a cgroup in pre-order.
 *  2. Walk the descendants of a cgroup in post-order.
 *  3. Walk the ancestors of a cgroup.
 *  4. Show the given cgroup only.
 *
 * For walking descendants, cgroup_iter can walk in either pre-order or
 * post-order. For walking ancestors, the iter walks up from a cgroup to
 * the root.
 *
 * The iter program can terminate the walk early by returning 1. Walk
 * continues if prog returns 0.
 *
 * The prog can check (seq->num == 0) to determine whether this is
 * the first element. The prog may also be passed a NULL cgroup,
 * which means the walk has completed and the prog has a chance to
 * do post-processing, such as outputting an epilogue.
 *
 * Note: the iter_prog is called with cgroup_mutex held.
 *
 * Currently only one session is supported, which means, depending on the
 * volume of data bpf program intends to send to user space, the number
 * of cgroups that can be walked is limited. For example, given the current
 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
 * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
 * be walked is 512. This is a limitation of cgroup_iter. If the output data
 * is larger than the kernel buffer size, after all data in the kernel buffer
 * is consumed by user space, the subsequent read() syscall will signal
 * EOPNOTSUPP. In order to work around, the user may have to update their
 * program to reduce the volume of data sent to output. For example, skip
 * some uninteresting cgroups.
 */

/* Context passed to the bpf iter program: iterator metadata plus the
 * cgroup currently being visited (NULL on the epilogue call).
 */
struct bpf_iter__cgroup {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct cgroup *, cgroup);
};

/* Per-seq_file private state for one cgroup_iter instance. */
struct cgroup_iter_priv {
	struct cgroup_subsys_state *start_css;	/* css the walk starts from */
	bool visited_all;	/* full walk finished; further reads return EOF */
	bool terminate;		/* prog asked to stop after current element */
	int order;		/* one of BPF_CGROUP_ITER_* traversal modes */
};

/* seq_file start callback: take cgroup_mutex (held until ->stop) and
 * return the first css for the configured traversal order.
 */
static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct cgroup_iter_priv *p = seq->private;

	mutex_lock(&cgroup_mutex);

	/* cgroup_iter doesn't support read across multiple sessions. */
	if (*pos > 0) {
		if (p->visited_all)
			return NULL;

		/* Haven't visited all, but because cgroup_mutex has dropped,
		 * return -EOPNOTSUPP to indicate incomplete iteration.
		 */
		return ERR_PTR(-EOPNOTSUPP);
	}

	++*pos;
	p->terminate = false;
	p->visited_all = false;
	if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		return css_next_descendant_pre(NULL, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		return css_next_descendant_post(NULL, p->start_css);
	else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
		return p->start_css;
}

static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop);

/* seq_file stop callback: drop cgroup_mutex. If the walk reached its
 * natural end (v == NULL), invoke the prog once more with a NULL cgroup
 * for post-processing and mark the session as fully visited.
 */
static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
{
	struct cgroup_iter_priv *p = seq->private;

	mutex_unlock(&cgroup_mutex);

	/* pass NULL to the prog for post-processing */
	if (!v) {
		__cgroup_iter_seq_show(seq, NULL, true);
		p->visited_all = true;
	}
}

/* seq_file next callback: advance to the next css according to the
 * traversal order, or end the walk if the prog requested termination.
 */
static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
	struct cgroup_iter_priv *p = seq->private;

	++*pos;
	if (p->terminate)
		return NULL;

	if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		return css_next_descendant_pre(curr, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		return css_next_descendant_post(curr, p->start_css);
	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
		return curr->parent;
	else /* BPF_CGROUP_ITER_SELF_ONLY */
		return NULL;
}

/* Run the attached bpf prog for one element (or for the NULL epilogue
 * when in_stop is true). A nonzero prog return value sets ->terminate so
 * the walk ends after this element. Always returns 0 so seq_file itself
 * never reports an error for a prog-requested stop.
 */
static int __cgroup_iter_seq_show(struct seq_file *seq,
				  struct cgroup_subsys_state *css, int in_stop)
{
	struct cgroup_iter_priv *p = seq->private;
	struct bpf_iter__cgroup ctx;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	int ret = 0;

	/* cgroup is dead, skip this element */
	if (css && cgroup_is_dead(css->cgroup))
		return 0;

	ctx.meta = &meta;
	ctx.cgroup = css ? css->cgroup : NULL;
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, in_stop);
	if (prog)
		ret = bpf_iter_run_prog(prog, &ctx);

	/* if prog returns > 0, terminate after this element. */
	if (ret != 0)
		p->terminate = true;

	return 0;
}

/* seq_file show callback: thin wrapper around __cgroup_iter_seq_show. */
static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
{
	return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
				      false);
}

static const struct seq_operations cgroup_iter_seq_ops = {
	.start	= cgroup_iter_seq_start,
	.next	= cgroup_iter_seq_next,
	.stop	= cgroup_iter_seq_stop,
	.show	= cgroup_iter_seq_show,
};

BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)

/* Initialize per-seq_file private state from the link's aux info
 * (start cgroup and traversal order captured at attach time).
 */
static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
{
	struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
	struct cgroup *cgrp = aux->cgroup.start;

	p->start_css = &cgrp->self;
	p->terminate = false;
	p->visited_all = false;
	p->order = aux->cgroup.order;
	return 0;
}

static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
	.seq_ops		= &cgroup_iter_seq_ops,
	.init_seq_private	= cgroup_iter_seq_init,
	.seq_priv_size		= sizeof(struct cgroup_iter_priv),
};

/* Attach-time validation: resolve the start cgroup from an fd, an id, or
 * default to the root, and reject invalid orders or ambiguous (fd AND id)
 * requests. Takes a reference on the cgroup, released in detach.
 */
static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
				  union bpf_iter_link_info *linfo,
				  struct bpf_iter_aux_info *aux)
{
	int fd = linfo->cgroup.cgroup_fd;
	u64 id = linfo->cgroup.cgroup_id;
	int order = linfo->cgroup.order;
	struct cgroup *cgrp;

	if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
	    order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
	    order != BPF_CGROUP_ITER_ANCESTORS_UP &&
	    order != BPF_CGROUP_ITER_SELF_ONLY)
		return -EINVAL;

	/* fd and id are mutually exclusive ways to name the start cgroup */
	if (fd && id)
		return -EINVAL;

	if (fd)
		cgrp = cgroup_get_from_fd(fd);
	else if (id)
		cgrp = cgroup_get_from_id(id);
	else /* walk the entire hierarchy by default. */
		cgrp = cgroup_get_from_path("/");

	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	aux->cgroup.start = cgrp;
	aux->cgroup.order = order;
	return 0;
}

/* Drop the cgroup reference taken in bpf_iter_attach_cgroup(). */
static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
{
	cgroup_put(aux->cgroup.start);
}

/* Emit the start cgroup path and traversal order into the link's fdinfo.
 * An allocation failure only degrades the path to "<unknown>".
 */
static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
					struct seq_file *seq)
{
	char *buf;

	buf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		seq_puts(seq, "cgroup_path:\t<unknown>\n");
		goto show_order;
	}

	/* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
	 * will print nothing.
	 *
	 * Path is in the calling process's cgroup namespace.
	 */
	cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
		       current->nsproxy->cgroup_ns);
	seq_printf(seq, "cgroup_path:\t%s\n", buf);
	kfree(buf);

show_order:
	if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
		seq_puts(seq, "order: descendants_pre\n");
	else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
		seq_puts(seq, "order: descendants_post\n");
	else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
		seq_puts(seq, "order: ancestors_up\n");
	else /* BPF_CGROUP_ITER_SELF_ONLY */
		seq_puts(seq, "order: self_only\n");
}

/* Report the traversal order and start cgroup id via bpf_link_info
 * (BPF_OBJ_GET_INFO_BY_FD).
 */
static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
					  struct bpf_link_info *info)
{
	info->iter.cgroup.order = aux->cgroup.order;
	info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
	return 0;
}

DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
		     struct cgroup *cgroup)

static struct bpf_iter_reg bpf_cgroup_reg_info = {
	.target			= "cgroup",
	.feature		= BPF_ITER_RESCHED,
	.attach_target		= bpf_iter_attach_cgroup,
	.detach_target		= bpf_iter_detach_cgroup,
	.show_fdinfo		= bpf_iter_cgroup_show_fdinfo,
	.fill_link_info		= bpf_iter_cgroup_fill_link_info,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__cgroup, cgroup),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &cgroup_iter_seq_info,
};

/* Register the "cgroup" iterator target; the ctx cgroup arg gets the BTF
 * id resolved at init time so the verifier can type-check iter progs.
 */
static int __init bpf_cgroup_iter_init(void)
{
	bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
	return bpf_iter_reg_target(&bpf_cgroup_reg_info);
}

late_initcall(bpf_cgroup_iter_init);