1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Functions to manage eBPF programs attached to cgroups
4 *
5 * Copyright (c) 2016 Daniel Mack
6 */
7
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <linux/bpf_lsm.h>
18 #include <linux/bpf_verifier.h>
19 #include <net/sock.h>
20 #include <net/bpf_sk_storage.h>
21
22 #include "../cgroup/cgroup-internal.h"
23
24 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
25 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
26
27 /*
28 * cgroup bpf destruction makes heavy use of work items and there can be a lot
29 * of concurrent destructions. Use a separate workqueue so that cgroup bpf
30 * destruction work items don't end up filling up max_active of system_wq
31 * which may lead to deadlock.
32 */
33 static struct workqueue_struct *cgroup_bpf_destroy_wq;
34
cgroup_bpf_wq_init(void)35 static int __init cgroup_bpf_wq_init(void)
36 {
37 cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
38 if (!cgroup_bpf_destroy_wq)
39 panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
40 return 0;
41 }
42 core_initcall(cgroup_bpf_wq_init);
43
44 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
45 unsigned long action, void *data);
46
47 static struct notifier_block cgroup_bpf_lifetime_nb = {
48 .notifier_call = cgroup_bpf_lifetime_notify,
49 };
50
cgroup_bpf_lifetime_notifier_init(void)51 void __init cgroup_bpf_lifetime_notifier_init(void)
52 {
53 BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
54 &cgroup_bpf_lifetime_nb));
55 }
56
57 /* __always_inline is necessary to prevent indirect call through run_prog
58 * function pointer.
59 */
60 static __always_inline int
bpf_prog_run_array_cg(const struct cgroup_bpf * cgrp,enum cgroup_bpf_attach_type atype,const void * ctx,bpf_prog_run_fn run_prog,int retval,u32 * ret_flags)61 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
62 enum cgroup_bpf_attach_type atype,
63 const void *ctx, bpf_prog_run_fn run_prog,
64 int retval, u32 *ret_flags)
65 {
66 const struct bpf_prog_array_item *item;
67 const struct bpf_prog *prog;
68 const struct bpf_prog_array *array;
69 struct bpf_run_ctx *old_run_ctx;
70 struct bpf_cg_run_ctx run_ctx;
71 u32 func_ret;
72
73 run_ctx.retval = retval;
74 migrate_disable();
75 rcu_read_lock();
76 array = rcu_dereference(cgrp->effective[atype]);
77 item = &array->items[0];
78 old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
79 while ((prog = READ_ONCE(item->prog))) {
80 run_ctx.prog_item = item;
81 func_ret = run_prog(prog, ctx);
82 if (ret_flags) {
83 *(ret_flags) |= (func_ret >> 1);
84 func_ret &= 1;
85 }
86 if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
87 run_ctx.retval = -EPERM;
88 item++;
89 }
90 bpf_reset_run_ctx(old_run_ctx);
91 rcu_read_unlock();
92 migrate_enable();
93 return run_ctx.retval;
94 }
95
__cgroup_bpf_run_lsm_sock(const void * ctx,const struct bpf_insn * insn)96 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
97 const struct bpf_insn *insn)
98 {
99 const struct bpf_prog *shim_prog;
100 struct sock *sk;
101 struct cgroup *cgrp;
102 int ret = 0;
103 u64 *args;
104
105 args = (u64 *)ctx;
106 sk = (void *)(unsigned long)args[0];
107 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
108 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
109
110 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
111 if (likely(cgrp))
112 ret = bpf_prog_run_array_cg(&cgrp->bpf,
113 shim_prog->aux->cgroup_atype,
114 ctx, bpf_prog_run, 0, NULL);
115 return ret;
116 }
117
__cgroup_bpf_run_lsm_socket(const void * ctx,const struct bpf_insn * insn)118 unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
119 const struct bpf_insn *insn)
120 {
121 const struct bpf_prog *shim_prog;
122 struct socket *sock;
123 struct cgroup *cgrp;
124 int ret = 0;
125 u64 *args;
126
127 args = (u64 *)ctx;
128 sock = (void *)(unsigned long)args[0];
129 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
130 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
131
132 cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
133 if (likely(cgrp))
134 ret = bpf_prog_run_array_cg(&cgrp->bpf,
135 shim_prog->aux->cgroup_atype,
136 ctx, bpf_prog_run, 0, NULL);
137 return ret;
138 }
139
__cgroup_bpf_run_lsm_current(const void * ctx,const struct bpf_insn * insn)140 unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
141 const struct bpf_insn *insn)
142 {
143 const struct bpf_prog *shim_prog;
144 struct cgroup *cgrp;
145 int ret = 0;
146
147 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
148 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
149
150 /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
151 cgrp = task_dfl_cgroup(current);
152 if (likely(cgrp))
153 ret = bpf_prog_run_array_cg(&cgrp->bpf,
154 shim_prog->aux->cgroup_atype,
155 ctx, bpf_prog_run, 0, NULL);
156 return ret;
157 }
158
159 #ifdef CONFIG_BPF_LSM
160 struct cgroup_lsm_atype {
161 u32 attach_btf_id;
162 int refcnt;
163 };
164
165 static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
166
167 static enum cgroup_bpf_attach_type
bpf_cgroup_atype_find(enum bpf_attach_type attach_type,u32 attach_btf_id)168 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
169 {
170 int i;
171
172 lockdep_assert_held(&cgroup_mutex);
173
174 if (attach_type != BPF_LSM_CGROUP)
175 return to_cgroup_bpf_attach_type(attach_type);
176
177 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
178 if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
179 return CGROUP_LSM_START + i;
180
181 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
182 if (cgroup_lsm_atype[i].attach_btf_id == 0)
183 return CGROUP_LSM_START + i;
184
185 return -E2BIG;
186
187 }
188
bpf_cgroup_atype_get(u32 attach_btf_id,int cgroup_atype)189 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
190 {
191 int i = cgroup_atype - CGROUP_LSM_START;
192
193 lockdep_assert_held(&cgroup_mutex);
194
195 WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
196 cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
197
198 cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
199 cgroup_lsm_atype[i].refcnt++;
200 }
201
bpf_cgroup_atype_put(int cgroup_atype)202 void bpf_cgroup_atype_put(int cgroup_atype)
203 {
204 int i = cgroup_atype - CGROUP_LSM_START;
205
206 cgroup_lock();
207 if (--cgroup_lsm_atype[i].refcnt <= 0)
208 cgroup_lsm_atype[i].attach_btf_id = 0;
209 WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
210 cgroup_unlock();
211 }
212 #else
213 static enum cgroup_bpf_attach_type
bpf_cgroup_atype_find(enum bpf_attach_type attach_type,u32 attach_btf_id)214 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
215 {
216 if (attach_type != BPF_LSM_CGROUP)
217 return to_cgroup_bpf_attach_type(attach_type);
218 return -EOPNOTSUPP;
219 }
220 #endif /* CONFIG_BPF_LSM */
221
cgroup_bpf_offline(struct cgroup * cgrp)222 static void cgroup_bpf_offline(struct cgroup *cgrp)
223 {
224 cgroup_get(cgrp);
225 percpu_ref_kill(&cgrp->bpf.refcnt);
226 }
227
bpf_cgroup_storages_free(struct bpf_cgroup_storage * storages[])228 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
229 {
230 enum bpf_cgroup_storage_type stype;
231
232 for_each_cgroup_storage_type(stype)
233 bpf_cgroup_storage_free(storages[stype]);
234 }
235
bpf_cgroup_storages_alloc(struct bpf_cgroup_storage * storages[],struct bpf_cgroup_storage * new_storages[],enum bpf_attach_type type,struct bpf_prog * prog,struct cgroup * cgrp)236 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
237 struct bpf_cgroup_storage *new_storages[],
238 enum bpf_attach_type type,
239 struct bpf_prog *prog,
240 struct cgroup *cgrp)
241 {
242 enum bpf_cgroup_storage_type stype;
243 struct bpf_cgroup_storage_key key;
244 struct bpf_map *map;
245
246 key.cgroup_inode_id = cgroup_id(cgrp);
247 key.attach_type = type;
248
249 for_each_cgroup_storage_type(stype) {
250 map = prog->aux->cgroup_storage[stype];
251 if (!map)
252 continue;
253
254 storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
255 if (storages[stype])
256 continue;
257
258 storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
259 if (IS_ERR(storages[stype])) {
260 bpf_cgroup_storages_free(new_storages);
261 return -ENOMEM;
262 }
263
264 new_storages[stype] = storages[stype];
265 }
266
267 return 0;
268 }
269
bpf_cgroup_storages_assign(struct bpf_cgroup_storage * dst[],struct bpf_cgroup_storage * src[])270 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
271 struct bpf_cgroup_storage *src[])
272 {
273 enum bpf_cgroup_storage_type stype;
274
275 for_each_cgroup_storage_type(stype)
276 dst[stype] = src[stype];
277 }
278
bpf_cgroup_storages_link(struct bpf_cgroup_storage * storages[],struct cgroup * cgrp,enum bpf_attach_type attach_type)279 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
280 struct cgroup *cgrp,
281 enum bpf_attach_type attach_type)
282 {
283 enum bpf_cgroup_storage_type stype;
284
285 for_each_cgroup_storage_type(stype)
286 bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
287 }
288
289 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
290 * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
291 * doesn't free link memory, which will eventually be done by bpf_link's
292 * release() callback, when its last FD is closed.
293 */
bpf_cgroup_link_auto_detach(struct bpf_cgroup_link * link)294 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
295 {
296 cgroup_put(link->cgroup);
297 link->cgroup = NULL;
298 }
299
300 /**
301 * cgroup_bpf_release() - put references of all bpf programs and
302 * release all cgroup bpf data
303 * @work: work structure embedded into the cgroup to modify
304 */
cgroup_bpf_release(struct work_struct * work)305 static void cgroup_bpf_release(struct work_struct *work)
306 {
307 struct cgroup *p, *cgrp = container_of(work, struct cgroup,
308 bpf.release_work);
309 struct bpf_prog_array *old_array;
310 struct list_head *storages = &cgrp->bpf.storages;
311 struct bpf_cgroup_storage *storage, *stmp;
312
313 unsigned int atype;
314
315 cgroup_lock();
316
317 for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
318 struct hlist_head *progs = &cgrp->bpf.progs[atype];
319 struct bpf_prog_list *pl;
320 struct hlist_node *pltmp;
321
322 hlist_for_each_entry_safe(pl, pltmp, progs, node) {
323 hlist_del(&pl->node);
324 if (pl->prog) {
325 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
326 bpf_trampoline_unlink_cgroup_shim(pl->prog);
327 bpf_prog_put(pl->prog);
328 }
329 if (pl->link) {
330 if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
331 bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
332 bpf_cgroup_link_auto_detach(pl->link);
333 }
334 kfree(pl);
335 static_branch_dec(&cgroup_bpf_enabled_key[atype]);
336 }
337 old_array = rcu_dereference_protected(
338 cgrp->bpf.effective[atype],
339 lockdep_is_held(&cgroup_mutex));
340 bpf_prog_array_free(old_array);
341 }
342
343 list_for_each_entry_safe(storage, stmp, storages, list_cg) {
344 bpf_cgroup_storage_unlink(storage);
345 bpf_cgroup_storage_free(storage);
346 }
347
348 cgroup_unlock();
349
350 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
351 cgroup_bpf_put(p);
352
353 percpu_ref_exit(&cgrp->bpf.refcnt);
354 cgroup_put(cgrp);
355 }
356
357 /**
358 * cgroup_bpf_release_fn() - callback used to schedule releasing
359 * of bpf cgroup data
360 * @ref: percpu ref counter structure
361 */
cgroup_bpf_release_fn(struct percpu_ref * ref)362 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
363 {
364 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
365
366 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
367 queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
368 }
369
370 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
371 * link or direct prog.
372 */
prog_list_prog(struct bpf_prog_list * pl)373 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
374 {
375 if (pl->prog)
376 return pl->prog;
377 if (pl->link)
378 return pl->link->link.prog;
379 return NULL;
380 }
381
382 /* count number of elements in the list.
383 * it's slow but the list cannot be long
384 */
prog_list_length(struct hlist_head * head,int * preorder_cnt)385 static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
386 {
387 struct bpf_prog_list *pl;
388 u32 cnt = 0;
389
390 hlist_for_each_entry(pl, head, node) {
391 if (!prog_list_prog(pl))
392 continue;
393 if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
394 (*preorder_cnt)++;
395 cnt++;
396 }
397 return cnt;
398 }
399
400 /* if parent has non-overridable prog attached,
401 * disallow attaching new programs to the descendent cgroup.
402 * if parent has overridable or multi-prog, allow attaching
403 */
hierarchy_allows_attach(struct cgroup * cgrp,enum cgroup_bpf_attach_type atype)404 static bool hierarchy_allows_attach(struct cgroup *cgrp,
405 enum cgroup_bpf_attach_type atype)
406 {
407 struct cgroup *p;
408
409 p = cgroup_parent(cgrp);
410 if (!p)
411 return true;
412 do {
413 u32 flags = p->bpf.flags[atype];
414 u32 cnt;
415
416 if (flags & BPF_F_ALLOW_MULTI)
417 return true;
418 cnt = prog_list_length(&p->bpf.progs[atype], NULL);
419 WARN_ON_ONCE(cnt > 1);
420 if (cnt == 1)
421 return !!(flags & BPF_F_ALLOW_OVERRIDE);
422 p = cgroup_parent(p);
423 } while (p);
424 return true;
425 }
426
427 /* compute a chain of effective programs for a given cgroup:
428 * start from the list of programs in this cgroup and add
429 * all parent programs.
430 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
431 * to programs in this cgroup
432 */
compute_effective_progs(struct cgroup * cgrp,enum cgroup_bpf_attach_type atype,struct bpf_prog_array ** array)433 static int compute_effective_progs(struct cgroup *cgrp,
434 enum cgroup_bpf_attach_type atype,
435 struct bpf_prog_array **array)
436 {
437 struct bpf_prog_array_item *item;
438 struct bpf_prog_array *progs;
439 struct bpf_prog_list *pl;
440 struct cgroup *p = cgrp;
441 int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
442
443 /* count number of effective programs by walking parents */
444 do {
445 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
446 cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
447 p = cgroup_parent(p);
448 } while (p);
449
450 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
451 if (!progs)
452 return -ENOMEM;
453
454 /* populate the array with effective progs */
455 cnt = 0;
456 p = cgrp;
457 fstart = preorder_cnt;
458 bstart = preorder_cnt - 1;
459 do {
460 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
461 continue;
462
463 init_bstart = bstart;
464 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
465 if (!prog_list_prog(pl))
466 continue;
467
468 if (pl->flags & BPF_F_PREORDER) {
469 item = &progs->items[bstart];
470 bstart--;
471 } else {
472 item = &progs->items[fstart];
473 fstart++;
474 }
475 item->prog = prog_list_prog(pl);
476 bpf_cgroup_storages_assign(item->cgroup_storage,
477 pl->storage);
478 cnt++;
479 }
480
481 /* reverse pre-ordering progs at this cgroup level */
482 for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
483 swap(progs->items[i], progs->items[j]);
484
485 } while ((p = cgroup_parent(p)));
486
487 *array = progs;
488 return 0;
489 }
490
activate_effective_progs(struct cgroup * cgrp,enum cgroup_bpf_attach_type atype,struct bpf_prog_array * old_array)491 static void activate_effective_progs(struct cgroup *cgrp,
492 enum cgroup_bpf_attach_type atype,
493 struct bpf_prog_array *old_array)
494 {
495 old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
496 lockdep_is_held(&cgroup_mutex));
497 /* free prog array after grace period, since __cgroup_bpf_run_*()
498 * might be still walking the array
499 */
500 bpf_prog_array_free(old_array);
501 }
502
503 /**
504 * cgroup_bpf_inherit() - inherit effective programs from parent
505 * @cgrp: the cgroup to modify
506 */
cgroup_bpf_inherit(struct cgroup * cgrp)507 static int cgroup_bpf_inherit(struct cgroup *cgrp)
508 {
509 /* has to use marco instead of const int, since compiler thinks
510 * that array below is variable length
511 */
512 #define NR ARRAY_SIZE(cgrp->bpf.effective)
513 struct bpf_prog_array *arrays[NR] = {};
514 struct cgroup *p;
515 int ret, i;
516
517 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
518 GFP_KERNEL);
519 if (ret)
520 return ret;
521
522 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
523 cgroup_bpf_get(p);
524
525 for (i = 0; i < NR; i++)
526 INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
527
528 INIT_LIST_HEAD(&cgrp->bpf.storages);
529
530 for (i = 0; i < NR; i++)
531 if (compute_effective_progs(cgrp, i, &arrays[i]))
532 goto cleanup;
533
534 for (i = 0; i < NR; i++)
535 activate_effective_progs(cgrp, i, arrays[i]);
536
537 return 0;
538 cleanup:
539 for (i = 0; i < NR; i++)
540 bpf_prog_array_free(arrays[i]);
541
542 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
543 cgroup_bpf_put(p);
544
545 percpu_ref_exit(&cgrp->bpf.refcnt);
546
547 return -ENOMEM;
548 }
549
cgroup_bpf_lifetime_notify(struct notifier_block * nb,unsigned long action,void * data)550 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
551 unsigned long action, void *data)
552 {
553 struct cgroup *cgrp = data;
554 int ret = 0;
555
556 if (cgrp->root != &cgrp_dfl_root)
557 return NOTIFY_OK;
558
559 switch (action) {
560 case CGROUP_LIFETIME_ONLINE:
561 ret = cgroup_bpf_inherit(cgrp);
562 break;
563 case CGROUP_LIFETIME_OFFLINE:
564 cgroup_bpf_offline(cgrp);
565 break;
566 }
567
568 return notifier_from_errno(ret);
569 }
570
update_effective_progs(struct cgroup * cgrp,enum cgroup_bpf_attach_type atype)571 static int update_effective_progs(struct cgroup *cgrp,
572 enum cgroup_bpf_attach_type atype)
573 {
574 struct cgroup_subsys_state *css;
575 int err;
576
577 /* allocate and recompute effective prog arrays */
578 css_for_each_descendant_pre(css, &cgrp->self) {
579 struct cgroup *desc = container_of(css, struct cgroup, self);
580
581 if (percpu_ref_is_zero(&desc->bpf.refcnt))
582 continue;
583
584 err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
585 if (err)
586 goto cleanup;
587 }
588
589 /* all allocations were successful. Activate all prog arrays */
590 css_for_each_descendant_pre(css, &cgrp->self) {
591 struct cgroup *desc = container_of(css, struct cgroup, self);
592
593 if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
594 if (unlikely(desc->bpf.inactive)) {
595 bpf_prog_array_free(desc->bpf.inactive);
596 desc->bpf.inactive = NULL;
597 }
598 continue;
599 }
600
601 activate_effective_progs(desc, atype, desc->bpf.inactive);
602 desc->bpf.inactive = NULL;
603 }
604
605 return 0;
606
607 cleanup:
608 /* oom while computing effective. Free all computed effective arrays
609 * since they were not activated
610 */
611 css_for_each_descendant_pre(css, &cgrp->self) {
612 struct cgroup *desc = container_of(css, struct cgroup, self);
613
614 bpf_prog_array_free(desc->bpf.inactive);
615 desc->bpf.inactive = NULL;
616 }
617
618 return err;
619 }
620
621 #define BPF_CGROUP_MAX_PROGS 64
622
find_attach_entry(struct hlist_head * progs,struct bpf_prog * prog,struct bpf_cgroup_link * link,struct bpf_prog * replace_prog,bool allow_multi)623 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
624 struct bpf_prog *prog,
625 struct bpf_cgroup_link *link,
626 struct bpf_prog *replace_prog,
627 bool allow_multi)
628 {
629 struct bpf_prog_list *pl;
630
631 /* single-attach case */
632 if (!allow_multi) {
633 if (hlist_empty(progs))
634 return NULL;
635 return hlist_entry(progs->first, typeof(*pl), node);
636 }
637
638 hlist_for_each_entry(pl, progs, node) {
639 if (prog && pl->prog == prog && prog != replace_prog)
640 /* disallow attaching the same prog twice */
641 return ERR_PTR(-EINVAL);
642 if (link && pl->link == link)
643 /* disallow attaching the same link twice */
644 return ERR_PTR(-EINVAL);
645 }
646
647 /* direct prog multi-attach w/ replacement case */
648 if (replace_prog) {
649 hlist_for_each_entry(pl, progs, node) {
650 if (pl->prog == replace_prog)
651 /* a match found */
652 return pl;
653 }
654 /* prog to replace not found for cgroup */
655 return ERR_PTR(-ENOENT);
656 }
657
658 return NULL;
659 }
660
661 /**
662 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
663 * propagate the change to descendants
664 * @cgrp: The cgroup which descendants to traverse
665 * @prog: A program to attach
666 * @link: A link to attach
667 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
668 * @type: Type of attach operation
669 * @flags: Option flags
670 *
671 * Exactly one of @prog or @link can be non-null.
672 * Must be called with cgroup_mutex held.
673 */
__cgroup_bpf_attach(struct cgroup * cgrp,struct bpf_prog * prog,struct bpf_prog * replace_prog,struct bpf_cgroup_link * link,enum bpf_attach_type type,u32 flags)674 static int __cgroup_bpf_attach(struct cgroup *cgrp,
675 struct bpf_prog *prog, struct bpf_prog *replace_prog,
676 struct bpf_cgroup_link *link,
677 enum bpf_attach_type type, u32 flags)
678 {
679 u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
680 struct bpf_prog *old_prog = NULL;
681 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
682 struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
683 struct bpf_prog *new_prog = prog ? : link->link.prog;
684 enum cgroup_bpf_attach_type atype;
685 struct bpf_prog_list *pl;
686 struct hlist_head *progs;
687 int err;
688
689 if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
690 ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
691 /* invalid combination */
692 return -EINVAL;
693 if (link && (prog || replace_prog))
694 /* only either link or prog/replace_prog can be specified */
695 return -EINVAL;
696 if (!!replace_prog != !!(flags & BPF_F_REPLACE))
697 /* replace_prog implies BPF_F_REPLACE, and vice versa */
698 return -EINVAL;
699
700 atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
701 if (atype < 0)
702 return -EINVAL;
703
704 progs = &cgrp->bpf.progs[atype];
705
706 if (!hierarchy_allows_attach(cgrp, atype))
707 return -EPERM;
708
709 if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
710 /* Disallow attaching non-overridable on top
711 * of existing overridable in this cgroup.
712 * Disallow attaching multi-prog if overridable or none
713 */
714 return -EPERM;
715
716 if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
717 return -E2BIG;
718
719 pl = find_attach_entry(progs, prog, link, replace_prog,
720 flags & BPF_F_ALLOW_MULTI);
721 if (IS_ERR(pl))
722 return PTR_ERR(pl);
723
724 if (bpf_cgroup_storages_alloc(storage, new_storage, type,
725 prog ? : link->link.prog, cgrp))
726 return -ENOMEM;
727
728 if (pl) {
729 old_prog = pl->prog;
730 } else {
731 struct hlist_node *last = NULL;
732
733 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
734 if (!pl) {
735 bpf_cgroup_storages_free(new_storage);
736 return -ENOMEM;
737 }
738 if (hlist_empty(progs))
739 hlist_add_head(&pl->node, progs);
740 else
741 hlist_for_each(last, progs) {
742 if (last->next)
743 continue;
744 hlist_add_behind(&pl->node, last);
745 break;
746 }
747 }
748
749 pl->prog = prog;
750 pl->link = link;
751 pl->flags = flags;
752 bpf_cgroup_storages_assign(pl->storage, storage);
753 cgrp->bpf.flags[atype] = saved_flags;
754
755 if (type == BPF_LSM_CGROUP) {
756 err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
757 if (err)
758 goto cleanup;
759 }
760
761 err = update_effective_progs(cgrp, atype);
762 if (err)
763 goto cleanup_trampoline;
764
765 if (old_prog) {
766 if (type == BPF_LSM_CGROUP)
767 bpf_trampoline_unlink_cgroup_shim(old_prog);
768 bpf_prog_put(old_prog);
769 } else {
770 static_branch_inc(&cgroup_bpf_enabled_key[atype]);
771 }
772 bpf_cgroup_storages_link(new_storage, cgrp, type);
773 return 0;
774
775 cleanup_trampoline:
776 if (type == BPF_LSM_CGROUP)
777 bpf_trampoline_unlink_cgroup_shim(new_prog);
778
779 cleanup:
780 if (old_prog) {
781 pl->prog = old_prog;
782 pl->link = NULL;
783 }
784 bpf_cgroup_storages_free(new_storage);
785 if (!old_prog) {
786 hlist_del(&pl->node);
787 kfree(pl);
788 }
789 return err;
790 }
791
cgroup_bpf_attach(struct cgroup * cgrp,struct bpf_prog * prog,struct bpf_prog * replace_prog,struct bpf_cgroup_link * link,enum bpf_attach_type type,u32 flags)792 static int cgroup_bpf_attach(struct cgroup *cgrp,
793 struct bpf_prog *prog, struct bpf_prog *replace_prog,
794 struct bpf_cgroup_link *link,
795 enum bpf_attach_type type,
796 u32 flags)
797 {
798 int ret;
799
800 cgroup_lock();
801 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
802 cgroup_unlock();
803 return ret;
804 }
805
806 /* Swap updated BPF program for given link in effective program arrays across
807 * all descendant cgroups. This function is guaranteed to succeed.
808 */
replace_effective_prog(struct cgroup * cgrp,enum cgroup_bpf_attach_type atype,struct bpf_cgroup_link * link)809 static void replace_effective_prog(struct cgroup *cgrp,
810 enum cgroup_bpf_attach_type atype,
811 struct bpf_cgroup_link *link)
812 {
813 struct bpf_prog_array_item *item;
814 struct cgroup_subsys_state *css;
815 struct bpf_prog_array *progs;
816 struct bpf_prog_list *pl;
817 struct hlist_head *head;
818 struct cgroup *cg;
819 int pos;
820
821 css_for_each_descendant_pre(css, &cgrp->self) {
822 struct cgroup *desc = container_of(css, struct cgroup, self);
823
824 if (percpu_ref_is_zero(&desc->bpf.refcnt))
825 continue;
826
827 /* find position of link in effective progs array */
828 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
829 if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
830 continue;
831
832 head = &cg->bpf.progs[atype];
833 hlist_for_each_entry(pl, head, node) {
834 if (!prog_list_prog(pl))
835 continue;
836 if (pl->link == link)
837 goto found;
838 pos++;
839 }
840 }
841 found:
842 BUG_ON(!cg);
843 progs = rcu_dereference_protected(
844 desc->bpf.effective[atype],
845 lockdep_is_held(&cgroup_mutex));
846 item = &progs->items[pos];
847 WRITE_ONCE(item->prog, link->link.prog);
848 }
849 }
850
851 /**
852 * __cgroup_bpf_replace() - Replace link's program and propagate the change
853 * to descendants
854 * @cgrp: The cgroup which descendants to traverse
855 * @link: A link for which to replace BPF program
856 * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
857 * incremented
858 *
859 * Must be called with cgroup_mutex held.
860 */
__cgroup_bpf_replace(struct cgroup * cgrp,struct bpf_cgroup_link * link,struct bpf_prog * new_prog)861 static int __cgroup_bpf_replace(struct cgroup *cgrp,
862 struct bpf_cgroup_link *link,
863 struct bpf_prog *new_prog)
864 {
865 enum cgroup_bpf_attach_type atype;
866 struct bpf_prog *old_prog;
867 struct bpf_prog_list *pl;
868 struct hlist_head *progs;
869 bool found = false;
870
871 atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
872 if (atype < 0)
873 return -EINVAL;
874
875 progs = &cgrp->bpf.progs[atype];
876
877 if (link->link.prog->type != new_prog->type)
878 return -EINVAL;
879
880 hlist_for_each_entry(pl, progs, node) {
881 if (pl->link == link) {
882 found = true;
883 break;
884 }
885 }
886 if (!found)
887 return -ENOENT;
888
889 old_prog = xchg(&link->link.prog, new_prog);
890 replace_effective_prog(cgrp, atype, link);
891 bpf_prog_put(old_prog);
892 return 0;
893 }
894
cgroup_bpf_replace(struct bpf_link * link,struct bpf_prog * new_prog,struct bpf_prog * old_prog)895 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
896 struct bpf_prog *old_prog)
897 {
898 struct bpf_cgroup_link *cg_link;
899 int ret;
900
901 cg_link = container_of(link, struct bpf_cgroup_link, link);
902
903 cgroup_lock();
904 /* link might have been auto-released by dying cgroup, so fail */
905 if (!cg_link->cgroup) {
906 ret = -ENOLINK;
907 goto out_unlock;
908 }
909 if (old_prog && link->prog != old_prog) {
910 ret = -EPERM;
911 goto out_unlock;
912 }
913 ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
914 out_unlock:
915 cgroup_unlock();
916 return ret;
917 }
918
find_detach_entry(struct hlist_head * progs,struct bpf_prog * prog,struct bpf_cgroup_link * link,bool allow_multi)919 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
920 struct bpf_prog *prog,
921 struct bpf_cgroup_link *link,
922 bool allow_multi)
923 {
924 struct bpf_prog_list *pl;
925
926 if (!allow_multi) {
927 if (hlist_empty(progs))
928 /* report error when trying to detach and nothing is attached */
929 return ERR_PTR(-ENOENT);
930
931 /* to maintain backward compatibility NONE and OVERRIDE cgroups
932 * allow detaching with invalid FD (prog==NULL) in legacy mode
933 */
934 return hlist_entry(progs->first, typeof(*pl), node);
935 }
936
937 if (!prog && !link)
938 /* to detach MULTI prog the user has to specify valid FD
939 * of the program or link to be detached
940 */
941 return ERR_PTR(-EINVAL);
942
943 /* find the prog or link and detach it */
944 hlist_for_each_entry(pl, progs, node) {
945 if (pl->prog == prog && pl->link == link)
946 return pl;
947 }
948 return ERR_PTR(-ENOENT);
949 }
950
951 /**
952 * purge_effective_progs() - After compute_effective_progs fails to alloc new
953 * cgrp->bpf.inactive table we can recover by
954 * recomputing the array in place.
955 *
956 * @cgrp: The cgroup which descendants to travers
957 * @prog: A program to detach or NULL
958 * @link: A link to detach or NULL
959 * @atype: Type of detach operation
960 */
purge_effective_progs(struct cgroup * cgrp,struct bpf_prog * prog,struct bpf_cgroup_link * link,enum cgroup_bpf_attach_type atype)961 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
962 struct bpf_cgroup_link *link,
963 enum cgroup_bpf_attach_type atype)
964 {
965 struct cgroup_subsys_state *css;
966 struct bpf_prog_array *progs;
967 struct bpf_prog_list *pl;
968 struct hlist_head *head;
969 struct cgroup *cg;
970 int pos;
971
972 /* recompute effective prog array in place */
973 css_for_each_descendant_pre(css, &cgrp->self) {
974 struct cgroup *desc = container_of(css, struct cgroup, self);
975
976 if (percpu_ref_is_zero(&desc->bpf.refcnt))
977 continue;
978
979 /* find position of link or prog in effective progs array */
980 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
981 if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
982 continue;
983
984 head = &cg->bpf.progs[atype];
985 hlist_for_each_entry(pl, head, node) {
986 if (!prog_list_prog(pl))
987 continue;
988 if (pl->prog == prog && pl->link == link)
989 goto found;
990 pos++;
991 }
992 }
993
994 /* no link or prog match, skip the cgroup of this layer */
995 continue;
996 found:
997 progs = rcu_dereference_protected(
998 desc->bpf.effective[atype],
999 lockdep_is_held(&cgroup_mutex));
1000
1001 /* Remove the program from the array */
1002 WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
1003 "Failed to purge a prog from array at index %d", pos);
1004 }
1005 }
1006
1007 /**
1008 * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
1009 * propagate the change to descendants
1010 * @cgrp: The cgroup which descendants to traverse
1011 * @prog: A program to detach or NULL
1012 * @link: A link to detach or NULL
1013 * @type: Type of detach operation
1014 *
1015 * At most one of @prog or @link can be non-NULL.
1016 * Must be called with cgroup_mutex held.
1017 */
__cgroup_bpf_detach(struct cgroup * cgrp,struct bpf_prog * prog,struct bpf_cgroup_link * link,enum bpf_attach_type type)1018 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1019 struct bpf_cgroup_link *link, enum bpf_attach_type type)
1020 {
1021 enum cgroup_bpf_attach_type atype;
1022 struct bpf_prog *old_prog;
1023 struct bpf_prog_list *pl;
1024 struct hlist_head *progs;
1025 u32 attach_btf_id = 0;
1026 u32 flags;
1027
1028 if (prog)
1029 attach_btf_id = prog->aux->attach_btf_id;
1030 if (link)
1031 attach_btf_id = link->link.prog->aux->attach_btf_id;
1032
1033 atype = bpf_cgroup_atype_find(type, attach_btf_id);
1034 if (atype < 0)
1035 return -EINVAL;
1036
1037 progs = &cgrp->bpf.progs[atype];
1038 flags = cgrp->bpf.flags[atype];
1039
1040 if (prog && link)
1041 /* only one of prog or link can be specified */
1042 return -EINVAL;
1043
1044 pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
1045 if (IS_ERR(pl))
1046 return PTR_ERR(pl);
1047
1048 /* mark it deleted, so it's ignored while recomputing effective */
1049 old_prog = pl->prog;
1050 pl->prog = NULL;
1051 pl->link = NULL;
1052
1053 if (update_effective_progs(cgrp, atype)) {
1054 /* if update effective array failed replace the prog with a dummy prog*/
1055 pl->prog = old_prog;
1056 pl->link = link;
1057 purge_effective_progs(cgrp, old_prog, link, atype);
1058 }
1059
1060 /* now can actually delete it from this cgroup list */
1061 hlist_del(&pl->node);
1062
1063 kfree(pl);
1064 if (hlist_empty(progs))
1065 /* last program was detached, reset flags to zero */
1066 cgrp->bpf.flags[atype] = 0;
1067 if (old_prog) {
1068 if (type == BPF_LSM_CGROUP)
1069 bpf_trampoline_unlink_cgroup_shim(old_prog);
1070 bpf_prog_put(old_prog);
1071 }
1072 static_branch_dec(&cgroup_bpf_enabled_key[atype]);
1073 return 0;
1074 }
1075
cgroup_bpf_detach(struct cgroup * cgrp,struct bpf_prog * prog,enum bpf_attach_type type)1076 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
1077 enum bpf_attach_type type)
1078 {
1079 int ret;
1080
1081 cgroup_lock();
1082 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
1083 cgroup_unlock();
1084 return ret;
1085 }
1086
1087 /* Must be called with cgroup_mutex held to avoid races. */
__cgroup_bpf_query(struct cgroup * cgrp,const union bpf_attr * attr,union bpf_attr __user * uattr)1088 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1089 union bpf_attr __user *uattr)
1090 {
1091 __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
1092 bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
1093 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
1094 enum bpf_attach_type type = attr->query.attach_type;
1095 enum cgroup_bpf_attach_type from_atype, to_atype;
1096 enum cgroup_bpf_attach_type atype;
1097 struct bpf_prog_array *effective;
1098 int cnt, ret = 0, i;
1099 int total_cnt = 0;
1100 u32 flags;
1101
1102 if (effective_query && prog_attach_flags)
1103 return -EINVAL;
1104
1105 if (type == BPF_LSM_CGROUP) {
1106 if (!effective_query && attr->query.prog_cnt &&
1107 prog_ids && !prog_attach_flags)
1108 return -EINVAL;
1109
1110 from_atype = CGROUP_LSM_START;
1111 to_atype = CGROUP_LSM_END;
1112 flags = 0;
1113 } else {
1114 from_atype = to_cgroup_bpf_attach_type(type);
1115 if (from_atype < 0)
1116 return -EINVAL;
1117 to_atype = from_atype;
1118 flags = cgrp->bpf.flags[from_atype];
1119 }
1120
1121 for (atype = from_atype; atype <= to_atype; atype++) {
1122 if (effective_query) {
1123 effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1124 lockdep_is_held(&cgroup_mutex));
1125 total_cnt += bpf_prog_array_length(effective);
1126 } else {
1127 total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
1128 }
1129 }
1130
1131 /* always output uattr->query.attach_flags as 0 during effective query */
1132 flags = effective_query ? 0 : flags;
1133 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1134 return -EFAULT;
1135 if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
1136 return -EFAULT;
1137 if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
1138 /* return early if user requested only program count + flags */
1139 return 0;
1140
1141 if (attr->query.prog_cnt < total_cnt) {
1142 total_cnt = attr->query.prog_cnt;
1143 ret = -ENOSPC;
1144 }
1145
1146 for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
1147 if (effective_query) {
1148 effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
1149 lockdep_is_held(&cgroup_mutex));
1150 cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
1151 ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1152 } else {
1153 struct hlist_head *progs;
1154 struct bpf_prog_list *pl;
1155 struct bpf_prog *prog;
1156 u32 id;
1157
1158 progs = &cgrp->bpf.progs[atype];
1159 cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
1160 i = 0;
1161 hlist_for_each_entry(pl, progs, node) {
1162 prog = prog_list_prog(pl);
1163 id = prog->aux->id;
1164 if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1165 return -EFAULT;
1166 if (++i == cnt)
1167 break;
1168 }
1169
1170 if (prog_attach_flags) {
1171 flags = cgrp->bpf.flags[atype];
1172
1173 for (i = 0; i < cnt; i++)
1174 if (copy_to_user(prog_attach_flags + i,
1175 &flags, sizeof(flags)))
1176 return -EFAULT;
1177 prog_attach_flags += cnt;
1178 }
1179 }
1180
1181 prog_ids += cnt;
1182 total_cnt -= cnt;
1183 }
1184 return ret;
1185 }
1186
cgroup_bpf_query(struct cgroup * cgrp,const union bpf_attr * attr,union bpf_attr __user * uattr)1187 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1188 union bpf_attr __user *uattr)
1189 {
1190 int ret;
1191
1192 cgroup_lock();
1193 ret = __cgroup_bpf_query(cgrp, attr, uattr);
1194 cgroup_unlock();
1195 return ret;
1196 }
1197
cgroup_bpf_prog_attach(const union bpf_attr * attr,enum bpf_prog_type ptype,struct bpf_prog * prog)1198 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1199 enum bpf_prog_type ptype, struct bpf_prog *prog)
1200 {
1201 struct bpf_prog *replace_prog = NULL;
1202 struct cgroup *cgrp;
1203 int ret;
1204
1205 cgrp = cgroup_get_from_fd(attr->target_fd);
1206 if (IS_ERR(cgrp))
1207 return PTR_ERR(cgrp);
1208
1209 if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1210 (attr->attach_flags & BPF_F_REPLACE)) {
1211 replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1212 if (IS_ERR(replace_prog)) {
1213 cgroup_put(cgrp);
1214 return PTR_ERR(replace_prog);
1215 }
1216 }
1217
1218 ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1219 attr->attach_type, attr->attach_flags);
1220
1221 if (replace_prog)
1222 bpf_prog_put(replace_prog);
1223 cgroup_put(cgrp);
1224 return ret;
1225 }
1226
cgroup_bpf_prog_detach(const union bpf_attr * attr,enum bpf_prog_type ptype)1227 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1228 {
1229 struct bpf_prog *prog;
1230 struct cgroup *cgrp;
1231 int ret;
1232
1233 cgrp = cgroup_get_from_fd(attr->target_fd);
1234 if (IS_ERR(cgrp))
1235 return PTR_ERR(cgrp);
1236
1237 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1238 if (IS_ERR(prog))
1239 prog = NULL;
1240
1241 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
1242 if (prog)
1243 bpf_prog_put(prog);
1244
1245 cgroup_put(cgrp);
1246 return ret;
1247 }
1248
bpf_cgroup_link_release(struct bpf_link * link)1249 static void bpf_cgroup_link_release(struct bpf_link *link)
1250 {
1251 struct bpf_cgroup_link *cg_link =
1252 container_of(link, struct bpf_cgroup_link, link);
1253 struct cgroup *cg;
1254
1255 /* link might have been auto-detached by dying cgroup already,
1256 * in that case our work is done here
1257 */
1258 if (!cg_link->cgroup)
1259 return;
1260
1261 cgroup_lock();
1262
1263 /* re-check cgroup under lock again */
1264 if (!cg_link->cgroup) {
1265 cgroup_unlock();
1266 return;
1267 }
1268
1269 WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1270 cg_link->type));
1271 if (cg_link->type == BPF_LSM_CGROUP)
1272 bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1273
1274 cg = cg_link->cgroup;
1275 cg_link->cgroup = NULL;
1276
1277 cgroup_unlock();
1278
1279 cgroup_put(cg);
1280 }
1281
bpf_cgroup_link_dealloc(struct bpf_link * link)1282 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1283 {
1284 struct bpf_cgroup_link *cg_link =
1285 container_of(link, struct bpf_cgroup_link, link);
1286
1287 kfree(cg_link);
1288 }
1289
bpf_cgroup_link_detach(struct bpf_link * link)1290 static int bpf_cgroup_link_detach(struct bpf_link *link)
1291 {
1292 bpf_cgroup_link_release(link);
1293
1294 return 0;
1295 }
1296
bpf_cgroup_link_show_fdinfo(const struct bpf_link * link,struct seq_file * seq)1297 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1298 struct seq_file *seq)
1299 {
1300 struct bpf_cgroup_link *cg_link =
1301 container_of(link, struct bpf_cgroup_link, link);
1302 u64 cg_id = 0;
1303
1304 cgroup_lock();
1305 if (cg_link->cgroup)
1306 cg_id = cgroup_id(cg_link->cgroup);
1307 cgroup_unlock();
1308
1309 seq_printf(seq,
1310 "cgroup_id:\t%llu\n"
1311 "attach_type:\t%d\n",
1312 cg_id,
1313 cg_link->type);
1314 }
1315
bpf_cgroup_link_fill_link_info(const struct bpf_link * link,struct bpf_link_info * info)1316 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1317 struct bpf_link_info *info)
1318 {
1319 struct bpf_cgroup_link *cg_link =
1320 container_of(link, struct bpf_cgroup_link, link);
1321 u64 cg_id = 0;
1322
1323 cgroup_lock();
1324 if (cg_link->cgroup)
1325 cg_id = cgroup_id(cg_link->cgroup);
1326 cgroup_unlock();
1327
1328 info->cgroup.cgroup_id = cg_id;
1329 info->cgroup.attach_type = cg_link->type;
1330 return 0;
1331 }
1332
1333 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1334 .release = bpf_cgroup_link_release,
1335 .dealloc = bpf_cgroup_link_dealloc,
1336 .detach = bpf_cgroup_link_detach,
1337 .update_prog = cgroup_bpf_replace,
1338 .show_fdinfo = bpf_cgroup_link_show_fdinfo,
1339 .fill_link_info = bpf_cgroup_link_fill_link_info,
1340 };
1341
cgroup_bpf_link_attach(const union bpf_attr * attr,struct bpf_prog * prog)1342 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1343 {
1344 struct bpf_link_primer link_primer;
1345 struct bpf_cgroup_link *link;
1346 struct cgroup *cgrp;
1347 int err;
1348
1349 if (attr->link_create.flags)
1350 return -EINVAL;
1351
1352 cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1353 if (IS_ERR(cgrp))
1354 return PTR_ERR(cgrp);
1355
1356 link = kzalloc(sizeof(*link), GFP_USER);
1357 if (!link) {
1358 err = -ENOMEM;
1359 goto out_put_cgroup;
1360 }
1361 bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1362 prog);
1363 link->cgroup = cgrp;
1364 link->type = attr->link_create.attach_type;
1365
1366 err = bpf_link_prime(&link->link, &link_primer);
1367 if (err) {
1368 kfree(link);
1369 goto out_put_cgroup;
1370 }
1371
1372 err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1373 link->type, BPF_F_ALLOW_MULTI);
1374 if (err) {
1375 bpf_link_cleanup(&link_primer);
1376 goto out_put_cgroup;
1377 }
1378
1379 return bpf_link_settle(&link_primer);
1380
1381 out_put_cgroup:
1382 cgroup_put(cgrp);
1383 return err;
1384 }
1385
cgroup_bpf_prog_query(const union bpf_attr * attr,union bpf_attr __user * uattr)1386 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1387 union bpf_attr __user *uattr)
1388 {
1389 struct cgroup *cgrp;
1390 int ret;
1391
1392 cgrp = cgroup_get_from_fd(attr->query.target_fd);
1393 if (IS_ERR(cgrp))
1394 return PTR_ERR(cgrp);
1395
1396 ret = cgroup_bpf_query(cgrp, attr, uattr);
1397
1398 cgroup_put(cgrp);
1399 return ret;
1400 }
1401
1402 /**
1403 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1404 * @sk: The socket sending or receiving traffic
1405 * @skb: The skb that is being sent or received
1406 * @atype: The type of program to be executed
1407 *
1408 * If no socket is passed, or the socket is not of type INET or INET6,
1409 * this function does nothing and returns 0.
1410 *
1411 * The program type passed in via @type must be suitable for network
1412 * filtering. No further check is performed to assert that.
1413 *
1414 * For egress packets, this function can return:
1415 * NET_XMIT_SUCCESS (0) - continue with packet output
1416 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
1417 * NET_XMIT_CN (2) - continue with packet output and notify TCP
1418 * to call cwr
1419 * -err - drop packet
1420 *
1421 * For ingress packets, this function will return -EPERM if any
1422 * attached program was found and if it returned != 1 during execution.
1423 * Otherwise 0 is returned.
1424 */
__cgroup_bpf_run_filter_skb(struct sock * sk,struct sk_buff * skb,enum cgroup_bpf_attach_type atype)1425 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1426 struct sk_buff *skb,
1427 enum cgroup_bpf_attach_type atype)
1428 {
1429 unsigned int offset = -skb_network_offset(skb);
1430 struct sock *save_sk;
1431 void *saved_data_end;
1432 struct cgroup *cgrp;
1433 int ret;
1434
1435 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1436 return 0;
1437
1438 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1439 save_sk = skb->sk;
1440 skb->sk = sk;
1441 __skb_push(skb, offset);
1442
1443 /* compute pointers for the bpf prog */
1444 bpf_compute_and_save_data_end(skb, &saved_data_end);
1445
1446 if (atype == CGROUP_INET_EGRESS) {
1447 u32 flags = 0;
1448 bool cn;
1449
1450 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1451 __bpf_prog_run_save_cb, 0, &flags);
1452
1453 /* Return values of CGROUP EGRESS BPF programs are:
1454 * 0: drop packet
1455 * 1: keep packet
1456 * 2: drop packet and cn
1457 * 3: keep packet and cn
1458 *
1459 * The returned value is then converted to one of the NET_XMIT
1460 * or an error code that is then interpreted as drop packet
1461 * (and no cn):
1462 * 0: NET_XMIT_SUCCESS skb should be transmitted
1463 * 1: NET_XMIT_DROP skb should be dropped and cn
1464 * 2: NET_XMIT_CN skb should be transmitted and cn
1465 * 3: -err skb should be dropped
1466 */
1467
1468 cn = flags & BPF_RET_SET_CN;
1469 if (ret && !IS_ERR_VALUE((long)ret))
1470 ret = -EFAULT;
1471 if (!ret)
1472 ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1473 else
1474 ret = (cn ? NET_XMIT_DROP : ret);
1475 } else {
1476 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1477 skb, __bpf_prog_run_save_cb, 0,
1478 NULL);
1479 if (ret && !IS_ERR_VALUE((long)ret))
1480 ret = -EFAULT;
1481 }
1482 bpf_restore_data_end(skb, saved_data_end);
1483 __skb_pull(skb, offset);
1484 skb->sk = save_sk;
1485
1486 return ret;
1487 }
1488 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1489
1490 /**
1491 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1492 * @sk: sock structure to manipulate
1493 * @atype: The type of program to be executed
1494 *
1495 * socket is passed is expected to be of type INET or INET6.
1496 *
1497 * The program type passed in via @type must be suitable for sock
1498 * filtering. No further check is performed to assert that.
1499 *
1500 * This function will return %-EPERM if any if an attached program was found
1501 * and if it returned != 1 during execution. In all other cases, 0 is returned.
1502 */
__cgroup_bpf_run_filter_sk(struct sock * sk,enum cgroup_bpf_attach_type atype)1503 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1504 enum cgroup_bpf_attach_type atype)
1505 {
1506 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1507
1508 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1509 NULL);
1510 }
1511 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1512
1513 /**
1514 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1515 * provided by user sockaddr
1516 * @sk: sock struct that will use sockaddr
1517 * @uaddr: sockaddr struct provided by user
1518 * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
1519 * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
1520 * uaddr.
1521 * @atype: The type of program to be executed
1522 * @t_ctx: Pointer to attach type specific context
1523 * @flags: Pointer to u32 which contains higher bits of BPF program
1524 * return value (OR'ed together).
1525 *
1526 * socket is expected to be of type INET, INET6 or UNIX.
1527 *
1528 * This function will return %-EPERM if an attached program is found and
1529 * returned value != 1 during execution. In all other cases, 0 is returned.
1530 */
__cgroup_bpf_run_filter_sock_addr(struct sock * sk,struct sockaddr * uaddr,int * uaddrlen,enum cgroup_bpf_attach_type atype,void * t_ctx,u32 * flags)1531 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1532 struct sockaddr *uaddr,
1533 int *uaddrlen,
1534 enum cgroup_bpf_attach_type atype,
1535 void *t_ctx,
1536 u32 *flags)
1537 {
1538 struct bpf_sock_addr_kern ctx = {
1539 .sk = sk,
1540 .uaddr = uaddr,
1541 .t_ctx = t_ctx,
1542 };
1543 struct sockaddr_storage unspec;
1544 struct cgroup *cgrp;
1545 int ret;
1546
1547 /* Check socket family since not all sockets represent network
1548 * endpoint (e.g. AF_UNIX).
1549 */
1550 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
1551 sk->sk_family != AF_UNIX)
1552 return 0;
1553
1554 if (!ctx.uaddr) {
1555 memset(&unspec, 0, sizeof(unspec));
1556 ctx.uaddr = (struct sockaddr *)&unspec;
1557 ctx.uaddrlen = 0;
1558 } else {
1559 ctx.uaddrlen = *uaddrlen;
1560 }
1561
1562 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1563 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1564 0, flags);
1565
1566 if (!ret && uaddr)
1567 *uaddrlen = ctx.uaddrlen;
1568
1569 return ret;
1570 }
1571 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1572
1573 /**
1574 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1575 * @sk: socket to get cgroup from
1576 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1577 * sk with connection information (IP addresses, etc.) May not contain
1578 * cgroup info if it is a req sock.
1579 * @atype: The type of program to be executed
1580 *
1581 * socket passed is expected to be of type INET or INET6.
1582 *
1583 * The program type passed in via @type must be suitable for sock_ops
1584 * filtering. No further check is performed to assert that.
1585 *
1586 * This function will return %-EPERM if any if an attached program was found
1587 * and if it returned != 1 during execution. In all other cases, 0 is returned.
1588 */
__cgroup_bpf_run_filter_sock_ops(struct sock * sk,struct bpf_sock_ops_kern * sock_ops,enum cgroup_bpf_attach_type atype)1589 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1590 struct bpf_sock_ops_kern *sock_ops,
1591 enum cgroup_bpf_attach_type atype)
1592 {
1593 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1594
1595 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1596 0, NULL);
1597 }
1598 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1599
__cgroup_bpf_check_dev_permission(short dev_type,u32 major,u32 minor,short access,enum cgroup_bpf_attach_type atype)1600 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1601 short access, enum cgroup_bpf_attach_type atype)
1602 {
1603 struct cgroup *cgrp;
1604 struct bpf_cgroup_dev_ctx ctx = {
1605 .access_type = (access << 16) | dev_type,
1606 .major = major,
1607 .minor = minor,
1608 };
1609 int ret;
1610
1611 rcu_read_lock();
1612 cgrp = task_dfl_cgroup(current);
1613 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1614 NULL);
1615 rcu_read_unlock();
1616
1617 return ret;
1618 }
1619
BPF_CALL_2(bpf_get_local_storage,struct bpf_map *,map,u64,flags)1620 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
1621 {
1622 /* flags argument is not used now,
1623 * but provides an ability to extend the API.
1624 * verifier checks that its value is correct.
1625 */
1626 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
1627 struct bpf_cgroup_storage *storage;
1628 struct bpf_cg_run_ctx *ctx;
1629 void *ptr;
1630
1631 /* get current cgroup storage from BPF run context */
1632 ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1633 storage = ctx->prog_item->cgroup_storage[stype];
1634
1635 if (stype == BPF_CGROUP_STORAGE_SHARED)
1636 ptr = &READ_ONCE(storage->buf)->data[0];
1637 else
1638 ptr = this_cpu_ptr(storage->percpu_buf);
1639
1640 return (unsigned long)ptr;
1641 }
1642
1643 const struct bpf_func_proto bpf_get_local_storage_proto = {
1644 .func = bpf_get_local_storage,
1645 .gpl_only = false,
1646 .ret_type = RET_PTR_TO_MAP_VALUE,
1647 .arg1_type = ARG_CONST_MAP_PTR,
1648 .arg2_type = ARG_ANYTHING,
1649 };
1650
BPF_CALL_0(bpf_get_retval)1651 BPF_CALL_0(bpf_get_retval)
1652 {
1653 struct bpf_cg_run_ctx *ctx =
1654 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1655
1656 return ctx->retval;
1657 }
1658
1659 const struct bpf_func_proto bpf_get_retval_proto = {
1660 .func = bpf_get_retval,
1661 .gpl_only = false,
1662 .ret_type = RET_INTEGER,
1663 };
1664
BPF_CALL_1(bpf_set_retval,int,retval)1665 BPF_CALL_1(bpf_set_retval, int, retval)
1666 {
1667 struct bpf_cg_run_ctx *ctx =
1668 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1669
1670 ctx->retval = retval;
1671 return 0;
1672 }
1673
1674 const struct bpf_func_proto bpf_set_retval_proto = {
1675 .func = bpf_set_retval,
1676 .gpl_only = false,
1677 .ret_type = RET_INTEGER,
1678 .arg1_type = ARG_ANYTHING,
1679 };
1680
1681 static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id,const struct bpf_prog * prog)1682 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1683 {
1684 const struct bpf_func_proto *func_proto;
1685
1686 func_proto = cgroup_common_func_proto(func_id, prog);
1687 if (func_proto)
1688 return func_proto;
1689
1690 switch (func_id) {
1691 case BPF_FUNC_perf_event_output:
1692 return &bpf_event_output_data_proto;
1693 default:
1694 return bpf_base_func_proto(func_id, prog);
1695 }
1696 }
1697
cgroup_dev_is_valid_access(int off,int size,enum bpf_access_type type,const struct bpf_prog * prog,struct bpf_insn_access_aux * info)1698 static bool cgroup_dev_is_valid_access(int off, int size,
1699 enum bpf_access_type type,
1700 const struct bpf_prog *prog,
1701 struct bpf_insn_access_aux *info)
1702 {
1703 const int size_default = sizeof(__u32);
1704
1705 if (type == BPF_WRITE)
1706 return false;
1707
1708 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1709 return false;
1710 /* The verifier guarantees that size > 0. */
1711 if (off % size != 0)
1712 return false;
1713
1714 switch (off) {
1715 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1716 bpf_ctx_record_field_size(info, size_default);
1717 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1718 return false;
1719 break;
1720 default:
1721 if (size != size_default)
1722 return false;
1723 }
1724
1725 return true;
1726 }
1727
1728 const struct bpf_prog_ops cg_dev_prog_ops = {
1729 };
1730
1731 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1732 .get_func_proto = cgroup_dev_func_proto,
1733 .is_valid_access = cgroup_dev_is_valid_access,
1734 };
1735
1736 /**
1737 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1738 *
1739 * @head: sysctl table header
1740 * @table: sysctl table
1741 * @write: sysctl is being read (= 0) or written (= 1)
1742 * @buf: pointer to buffer (in and out)
1743 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1744 * result is size of @new_buf if program set new value, initial value
1745 * otherwise
1746 * @ppos: value-result argument: value is position at which read from or write
1747 * to sysctl is happening, result is new position if program overrode it,
1748 * initial value otherwise
1749 * @atype: type of program to be executed
1750 *
1751 * Program is run when sysctl is being accessed, either read or written, and
1752 * can allow or deny such access.
1753 *
1754 * This function will return %-EPERM if an attached program is found and
1755 * returned value != 1 during execution. In all other cases 0 is returned.
1756 */
__cgroup_bpf_run_filter_sysctl(struct ctl_table_header * head,const struct ctl_table * table,int write,char ** buf,size_t * pcount,loff_t * ppos,enum cgroup_bpf_attach_type atype)1757 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1758 const struct ctl_table *table, int write,
1759 char **buf, size_t *pcount, loff_t *ppos,
1760 enum cgroup_bpf_attach_type atype)
1761 {
1762 struct bpf_sysctl_kern ctx = {
1763 .head = head,
1764 .table = table,
1765 .write = write,
1766 .ppos = ppos,
1767 .cur_val = NULL,
1768 .cur_len = PAGE_SIZE,
1769 .new_val = NULL,
1770 .new_len = 0,
1771 .new_updated = 0,
1772 };
1773 struct cgroup *cgrp;
1774 loff_t pos = 0;
1775 int ret;
1776
1777 ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1778 if (!ctx.cur_val ||
1779 table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1780 /* Let BPF program decide how to proceed. */
1781 ctx.cur_len = 0;
1782 }
1783
1784 if (write && *buf && *pcount) {
1785 /* The BPF program should be able to override the new value with a
1786 * buffer bigger than the one provided by user space.
1787 */
1788 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1789 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1790 if (ctx.new_val) {
1791 memcpy(ctx.new_val, *buf, ctx.new_len);
1792 } else {
1793 /* Let BPF program decide how to proceed. */
1794 ctx.new_len = 0;
1795 }
1796 }
1797
1798 rcu_read_lock();
1799 cgrp = task_dfl_cgroup(current);
1800 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1801 NULL);
1802 rcu_read_unlock();
1803
1804 kfree(ctx.cur_val);
1805
1806 if (ret == 1 && ctx.new_updated) {
1807 kfree(*buf);
1808 *buf = ctx.new_val;
1809 *pcount = ctx.new_len;
1810 } else {
1811 kfree(ctx.new_val);
1812 }
1813
1814 return ret;
1815 }
1816
1817 #ifdef CONFIG_NET
1818 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1819 struct bpf_sockopt_buf *buf)
1820 {
1821 if (unlikely(max_optlen < 0))
1822 return -EINVAL;
1823
1824 if (unlikely(max_optlen > PAGE_SIZE)) {
1825 /* We don't expose optvals that are greater than PAGE_SIZE
1826 * to the BPF program.
1827 */
1828 max_optlen = PAGE_SIZE;
1829 }
1830
1831 if (max_optlen <= sizeof(buf->data)) {
1832 /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1833 * bytes, avoid the cost of kzalloc.
1834 */
1835 ctx->optval = buf->data;
1836 ctx->optval_end = ctx->optval + max_optlen;
1837 return max_optlen;
1838 }
1839
1840 ctx->optval = kzalloc(max_optlen, GFP_USER);
1841 if (!ctx->optval)
1842 return -ENOMEM;
1843
1844 ctx->optval_end = ctx->optval + max_optlen;
1845
1846 return max_optlen;
1847 }
1848
1849 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1850 struct bpf_sockopt_buf *buf)
1851 {
1852 if (ctx->optval == buf->data)
1853 return;
1854 kfree(ctx->optval);
1855 }
1856
1857 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1858 struct bpf_sockopt_buf *buf)
1859 {
1860 return ctx->optval != buf->data;
1861 }
1862
1863 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1864 int *optname, sockptr_t optval,
1865 int *optlen, char **kernel_optval)
1866 {
1867 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1868 struct bpf_sockopt_buf buf = {};
1869 struct bpf_sockopt_kern ctx = {
1870 .sk = sk,
1871 .level = *level,
1872 .optname = *optname,
1873 };
1874 int ret, max_optlen;
1875
1876 /* Allocate a bit more than the initial user buffer for the
1877 * BPF program. The canonical use case is overriding
1878 * TCP_CONGESTION(nv) with TCP_CONGESTION(cubic).
1879 */
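/*
 * Illustrative sketch (not part of this file) of that canonical override,
 * roughly as a libbpf-built cgroup/setsockopt program might express it;
 * SEC(), SOL_TCP and TCP_CONGESTION come from the usual program-side
 * headers:
 *
 *	SEC("cgroup/setsockopt")
 *	int override_cc(struct bpf_sockopt *ctx)
 *	{
 *		char cubic[] = "cubic";
 *
 *		if (ctx->level != SOL_TCP || ctx->optname != TCP_CONGESTION)
 *			return 1;
 *		if (ctx->optval + sizeof(cubic) > ctx->optval_end)
 *			return 0;
 *		__builtin_memcpy(ctx->optval, cubic, sizeof(cubic));
 *		ctx->optlen = sizeof(cubic);
 *		return 1;
 *	}
 */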
1880 max_optlen = max_t(int, 16, *optlen);
1881 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1882 if (max_optlen < 0)
1883 return max_optlen;
1884
1885 ctx.optlen = *optlen;
1886
1887 if (copy_from_sockptr(ctx.optval, optval,
1888 min(*optlen, max_optlen))) {
1889 ret = -EFAULT;
1890 goto out;
1891 }
1892
1893 lock_sock(sk);
1894 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1895 &ctx, bpf_prog_run, 0, NULL);
1896 release_sock(sk);
1897
1898 if (ret)
1899 goto out;
1900
1901 if (ctx.optlen == -1) {
1902 /* optlen set to -1, bypass kernel */
1903 ret = 1;
1904 } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1905 /* optlen is out of bounds */
1906 if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
1907 pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
1908 ctx.optlen, max_optlen);
1909 ret = 0;
1910 goto out;
1911 }
1912 ret = -EFAULT;
1913 } else {
1914 /* optlen within bounds, run kernel handler */
1915 ret = 0;
1916
1917 /* export any potential modifications */
1918 *level = ctx.level;
1919 *optname = ctx.optname;
1920
1921 /* optlen == 0 from BPF indicates that we should
1922 * use original userspace data.
1923 */
1924 if (ctx.optlen != 0) {
1925 *optlen = ctx.optlen;
1926 /* We've used bpf_sockopt_kern->buf as an intermediary
1927 * storage, but the BPF program indicates that we need
1928 * to pass this data to the kernel setsockopt handler.
1929 * No way to export on-stack buf, have to allocate a
1930 * new buffer.
1931 */
1932 if (!sockopt_buf_allocated(&ctx, &buf)) {
1933 void *p = kmalloc(ctx.optlen, GFP_USER);
1934
1935 if (!p) {
1936 ret = -ENOMEM;
1937 goto out;
1938 }
1939 memcpy(p, ctx.optval, ctx.optlen);
1940 *kernel_optval = p;
1941 } else {
1942 *kernel_optval = ctx.optval;
1943 }
1944 /* export and don't free sockopt buf */
1945 return 0;
1946 }
1947 }
1948
1949 out:
1950 sockopt_free_buf(&ctx, &buf);
1951 return ret;
1952 }
1953
1954 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1955 int optname, sockptr_t optval,
1956 sockptr_t optlen, int max_optlen,
1957 int retval)
1958 {
1959 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1960 struct bpf_sockopt_buf buf = {};
1961 struct bpf_sockopt_kern ctx = {
1962 .sk = sk,
1963 .level = level,
1964 .optname = optname,
1965 .current_task = current,
1966 };
1967 int orig_optlen;
1968 int ret;
1969
1970 orig_optlen = max_optlen;
1971 ctx.optlen = max_optlen;
1972 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1973 if (max_optlen < 0)
1974 return max_optlen;
1975
1976 if (!retval) {
1977 /* If kernel getsockopt finished successfully,
1978 * copy whatever was returned to the user back
1979 * into our temporary buffer. Set optlen to the
1980 * one that kernel returned as well to let
1981 * BPF programs inspect the value.
1982 */
1983 if (copy_from_sockptr(&ctx.optlen, optlen,
1984 sizeof(ctx.optlen))) {
1985 ret = -EFAULT;
1986 goto out;
1987 }
1988
1989 if (ctx.optlen < 0) {
1990 ret = -EFAULT;
1991 goto out;
1992 }
1993 orig_optlen = ctx.optlen;
1994
1995 if (copy_from_sockptr(ctx.optval, optval,
1996 min(ctx.optlen, max_optlen))) {
1997 ret = -EFAULT;
1998 goto out;
1999 }
2000 }
2001
2002 lock_sock(sk);
2003 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
2004 &ctx, bpf_prog_run, retval, NULL);
2005 release_sock(sk);
2006
2007 if (ret < 0)
2008 goto out;
2009
2010 if (!sockptr_is_null(optval) &&
2011 (ctx.optlen > max_optlen || ctx.optlen < 0)) {
2012 if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
2013 pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
2014 ctx.optlen, max_optlen);
2015 ret = retval;
2016 goto out;
2017 }
2018 ret = -EFAULT;
2019 goto out;
2020 }
2021
2022 if (ctx.optlen != 0) {
2023 if (!sockptr_is_null(optval) &&
2024 copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
2025 ret = -EFAULT;
2026 goto out;
2027 }
2028 if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
2029 ret = -EFAULT;
2030 goto out;
2031 }
2032 }
2033
2034 out:
2035 sockopt_free_buf(&ctx, &buf);
2036 return ret;
2037 }
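/*
 * Illustrative sketch (not part of this file): a cgroup/getsockopt program,
 * roughly as it might be built with libbpf, that inspects and clamps the
 * value the kernel is about to return for SO_SNDBUF. SEC(), SOL_SOCKET and
 * SO_SNDBUF come from the usual program-side headers; returning 1 keeps the
 * (possibly modified) result:
 *
 *	SEC("cgroup/getsockopt")
 *	int clamp_sndbuf(struct bpf_sockopt *ctx)
 *	{
 *		int *val = (int *)ctx->optval;
 *
 *		if (ctx->level != SOL_SOCKET || ctx->optname != SO_SNDBUF)
 *			return 1;
 *		if (ctx->optval + sizeof(*val) > ctx->optval_end)
 *			return 1;
 *		if (ctx->retval == 0 && *val > (1 << 20))
 *			*val = 1 << 20;
 *		return 1;
 *	}
 */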
2038
2039 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
2040 int optname, void *optval,
2041 int *optlen, int retval)
2042 {
2043 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
2044 struct bpf_sockopt_kern ctx = {
2045 .sk = sk,
2046 .level = level,
2047 .optname = optname,
2048 .optlen = *optlen,
2049 .optval = optval,
2050 .optval_end = optval + *optlen,
2051 .current_task = current,
2052 };
2053 int ret;
2054
2055 /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
2056 * user data back into the BPF buffer when retval != 0. This is
2057 * done as an optimization to avoid an extra copy, assuming the
2058 * kernel won't populate the data in case of an error.
2059 * Here we always pass the data and memset() should
2060 * be called if that data shouldn't be "exported".
2061 */
2062
2063 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
2064 &ctx, bpf_prog_run, retval, NULL);
2065 if (ret < 0)
2066 return ret;
2067
2068 if (ctx.optlen > *optlen)
2069 return -EFAULT;
2070
2071 /* BPF programs can shrink the buffer, export the modifications.
2072 */
2073 if (ctx.optlen != 0)
2074 *optlen = ctx.optlen;
2075
2076 return ret;
2077 }
2078 #endif
2079
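/* Copy the path of @dir into *bufp, outermost ancestor first, each component
 * followed by a '/', advancing *bufp and shrinking *lenp as it goes. Returns
 * the number of bytes written, not counting the terminating NUL, or a
 * negative strscpy() error such as -E2BIG if the buffer is too small.
 */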
2080 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
2081 size_t *lenp)
2082 {
2083 ssize_t tmp_ret = 0, ret;
2084
2085 if (dir->header.parent) {
2086 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
2087 if (tmp_ret < 0)
2088 return tmp_ret;
2089 }
2090
2091 ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
2092 if (ret < 0)
2093 return ret;
2094 *bufp += ret;
2095 *lenp -= ret;
2096 ret += tmp_ret;
2097
2098 /* Avoid leading slash. */
2099 if (!ret)
2100 return ret;
2101
2102 tmp_ret = strscpy(*bufp, "/", *lenp);
2103 if (tmp_ret < 0)
2104 return tmp_ret;
2105 *bufp += tmp_ret;
2106 *lenp -= tmp_ret;
2107
2108 return ret + tmp_ret;
2109 }
2110
2111 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
2112 size_t, buf_len, u64, flags)
2113 {
2114 ssize_t tmp_ret = 0, ret;
2115
2116 if (!buf)
2117 return -EINVAL;
2118
2119 if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
2120 if (!ctx->head)
2121 return -EINVAL;
2122 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
2123 if (tmp_ret < 0)
2124 return tmp_ret;
2125 }
2126
2127 ret = strscpy(buf, ctx->table->procname, buf_len);
2128
2129 return ret < 0 ? ret : tmp_ret + ret;
2130 }
2131
2132 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
2133 .func = bpf_sysctl_get_name,
2134 .gpl_only = false,
2135 .ret_type = RET_INTEGER,
2136 .arg1_type = ARG_PTR_TO_CTX,
2137 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
2138 .arg3_type = ARG_CONST_SIZE,
2139 .arg4_type = ARG_ANYTHING,
2140 };
2141
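/* Copy the sysctl value in @src into @dst, zero-filling any remainder of
 * @dst. Returns @src_len if the whole value plus a terminating NUL fits,
 * -E2BIG if the copy had to be truncated (still NUL-terminated), and
 * -EINVAL if @dst is NULL or there is no source value (in which case @dst
 * is zeroed).
 */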
2142 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
2143 size_t src_len)
2144 {
2145 if (!dst)
2146 return -EINVAL;
2147
2148 if (!dst_len)
2149 return -E2BIG;
2150
2151 if (!src || !src_len) {
2152 memset(dst, 0, dst_len);
2153 return -EINVAL;
2154 }
2155
2156 memcpy(dst, src, min(dst_len, src_len));
2157
2158 if (dst_len > src_len) {
2159 memset(dst + src_len, '\0', dst_len - src_len);
2160 return src_len;
2161 }
2162
2163 dst[dst_len - 1] = '\0';
2164
2165 return -E2BIG;
2166 }
2167
2168 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
2169 char *, buf, size_t, buf_len)
2170 {
2171 return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
2172 }
2173
2174 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
2175 .func = bpf_sysctl_get_current_value,
2176 .gpl_only = false,
2177 .ret_type = RET_INTEGER,
2178 .arg1_type = ARG_PTR_TO_CTX,
2179 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2180 .arg3_type = ARG_CONST_SIZE,
2181 };
2182
2183 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
2184 size_t, buf_len)
2185 {
2186 if (!ctx->write) {
2187 if (buf && buf_len)
2188 memset(buf, '\0', buf_len);
2189 return -EINVAL;
2190 }
2191 return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
2192 }
2193
2194 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
2195 .func = bpf_sysctl_get_new_value,
2196 .gpl_only = false,
2197 .ret_type = RET_INTEGER,
2198 .arg1_type = ARG_PTR_TO_CTX,
2199 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
2200 .arg3_type = ARG_CONST_SIZE,
2201 };
2202
2203 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
2204 const char *, buf, size_t, buf_len)
2205 {
2206 if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
2207 return -EINVAL;
2208
2209 if (buf_len > PAGE_SIZE - 1)
2210 return -E2BIG;
2211
2212 memcpy(ctx->new_val, buf, buf_len);
2213 ctx->new_len = buf_len;
2214 ctx->new_updated = 1;
2215
2216 return 0;
2217 }
2218
2219 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2220 .func = bpf_sysctl_set_new_value,
2221 .gpl_only = false,
2222 .ret_type = RET_INTEGER,
2223 .arg1_type = ARG_PTR_TO_CTX,
2224 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
2225 .arg3_type = ARG_CONST_SIZE,
2226 };
2227
2228 static const struct bpf_func_proto *
2229 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2230 {
2231 const struct bpf_func_proto *func_proto;
2232
2233 func_proto = cgroup_common_func_proto(func_id, prog);
2234 if (func_proto)
2235 return func_proto;
2236
2237 switch (func_id) {
2238 case BPF_FUNC_sysctl_get_name:
2239 return &bpf_sysctl_get_name_proto;
2240 case BPF_FUNC_sysctl_get_current_value:
2241 return &bpf_sysctl_get_current_value_proto;
2242 case BPF_FUNC_sysctl_get_new_value:
2243 return &bpf_sysctl_get_new_value_proto;
2244 case BPF_FUNC_sysctl_set_new_value:
2245 return &bpf_sysctl_set_new_value_proto;
2246 case BPF_FUNC_ktime_get_coarse_ns:
2247 return &bpf_ktime_get_coarse_ns_proto;
2248 case BPF_FUNC_perf_event_output:
2249 return &bpf_event_output_data_proto;
2250 default:
2251 return bpf_base_func_proto(func_id, prog);
2252 }
2253 }
2254
2255 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
2256 const struct bpf_prog *prog,
2257 struct bpf_insn_access_aux *info)
2258 {
2259 const int size_default = sizeof(__u32);
2260
2261 if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
2262 return false;
2263
2264 switch (off) {
2265 case bpf_ctx_range(struct bpf_sysctl, write):
2266 if (type != BPF_READ)
2267 return false;
2268 bpf_ctx_record_field_size(info, size_default);
2269 return bpf_ctx_narrow_access_ok(off, size, size_default);
2270 case bpf_ctx_range(struct bpf_sysctl, file_pos):
2271 if (type == BPF_READ) {
2272 bpf_ctx_record_field_size(info, size_default);
2273 return bpf_ctx_narrow_access_ok(off, size, size_default);
2274 } else {
2275 return size == size_default;
2276 }
2277 default:
2278 return false;
2279 }
2280 }
2281
2282 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
2283 const struct bpf_insn *si,
2284 struct bpf_insn *insn_buf,
2285 struct bpf_prog *prog, u32 *target_size)
2286 {
2287 struct bpf_insn *insn = insn_buf;
2288 u32 read_size;
2289
2290 switch (si->off) {
2291 case offsetof(struct bpf_sysctl, write):
2292 *insn++ = BPF_LDX_MEM(
2293 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
2294 bpf_target_off(struct bpf_sysctl_kern, write,
2295 sizeof_field(struct bpf_sysctl_kern,
2296 write),
2297 target_size));
2298 break;
2299 case offsetof(struct bpf_sysctl, file_pos):
2300 /* ppos is a pointer so it should be accessed via indirect
2301 * loads and stores. Also, for stores, an additional temporary
2302 * register is used since neither src_reg nor dst_reg can be
2303 * overridden.
2304 */
2305 if (type == BPF_WRITE) {
2306 int treg = BPF_REG_9;
2307
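/* Pick a scratch register that collides with neither si->src_reg nor
 * si->dst_reg; at most two candidates can be taken, so stepping down
 * from BPF_REG_9 twice is always enough.
 */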
2308 if (si->src_reg == treg || si->dst_reg == treg)
2309 --treg;
2310 if (si->src_reg == treg || si->dst_reg == treg)
2311 --treg;
2312 *insn++ = BPF_STX_MEM(
2313 BPF_DW, si->dst_reg, treg,
2314 offsetof(struct bpf_sysctl_kern, tmp_reg));
2315 *insn++ = BPF_LDX_MEM(
2316 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2317 treg, si->dst_reg,
2318 offsetof(struct bpf_sysctl_kern, ppos));
2319 *insn++ = BPF_RAW_INSN(
2320 BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
2321 treg, si->src_reg,
2322 bpf_ctx_narrow_access_offset(
2323 0, sizeof(u32), sizeof(loff_t)),
2324 si->imm);
2325 *insn++ = BPF_LDX_MEM(
2326 BPF_DW, treg, si->dst_reg,
2327 offsetof(struct bpf_sysctl_kern, tmp_reg));
2328 } else {
2329 *insn++ = BPF_LDX_MEM(
2330 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2331 si->dst_reg, si->src_reg,
2332 offsetof(struct bpf_sysctl_kern, ppos));
2333 read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2334 *insn++ = BPF_LDX_MEM(
2335 BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2336 bpf_ctx_narrow_access_offset(
2337 0, read_size, sizeof(loff_t)));
2338 }
2339 *target_size = sizeof(u32);
2340 break;
2341 }
2342
2343 return insn - insn_buf;
2344 }
2345
2346 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2347 .get_func_proto = sysctl_func_proto,
2348 .is_valid_access = sysctl_is_valid_access,
2349 .convert_ctx_access = sysctl_convert_ctx_access,
2350 };
2351
2352 const struct bpf_prog_ops cg_sysctl_prog_ops = {
2353 };
2354
2355 #ifdef CONFIG_NET
2356 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2357 {
2358 const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2359
2360 return net->net_cookie;
2361 }
2362
2363 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2364 .func = bpf_get_netns_cookie_sockopt,
2365 .gpl_only = false,
2366 .ret_type = RET_INTEGER,
2367 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
2368 };
2369 #endif
2370
2371 static const struct bpf_func_proto *
2372 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2373 {
2374 const struct bpf_func_proto *func_proto;
2375
2376 func_proto = cgroup_common_func_proto(func_id, prog);
2377 if (func_proto)
2378 return func_proto;
2379
2380 switch (func_id) {
2381 #ifdef CONFIG_NET
2382 case BPF_FUNC_get_netns_cookie:
2383 return &bpf_get_netns_cookie_sockopt_proto;
2384 case BPF_FUNC_sk_storage_get:
2385 return &bpf_sk_storage_get_proto;
2386 case BPF_FUNC_sk_storage_delete:
2387 return &bpf_sk_storage_delete_proto;
2388 case BPF_FUNC_setsockopt:
2389 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2390 return &bpf_sk_setsockopt_proto;
2391 return NULL;
2392 case BPF_FUNC_getsockopt:
2393 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2394 return &bpf_sk_getsockopt_proto;
2395 return NULL;
2396 #endif
2397 #ifdef CONFIG_INET
2398 case BPF_FUNC_tcp_sock:
2399 return &bpf_tcp_sock_proto;
2400 #endif
2401 case BPF_FUNC_perf_event_output:
2402 return &bpf_event_output_data_proto;
2403 default:
2404 return bpf_base_func_proto(func_id, prog);
2405 }
2406 }
2407
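/* Context access rules for cgroup/{get,set}sockopt programs: the context is
 * read-only except for optlen, level/optname (setsockopt programs only) and
 * retval (getsockopt programs only, which are also the only ones allowed to
 * read it). optval and optval_end are exposed as packet pointers so that the
 * verifier forces programs to bounds-check optval accesses against
 * optval_end, just like skb data/data_end.
 */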
2408 static bool cg_sockopt_is_valid_access(int off, int size,
2409 enum bpf_access_type type,
2410 const struct bpf_prog *prog,
2411 struct bpf_insn_access_aux *info)
2412 {
2413 const int size_default = sizeof(__u32);
2414
2415 if (off < 0 || off >= sizeof(struct bpf_sockopt))
2416 return false;
2417
2418 if (off % size != 0)
2419 return false;
2420
2421 if (type == BPF_WRITE) {
2422 switch (off) {
2423 case offsetof(struct bpf_sockopt, retval):
2424 if (size != size_default)
2425 return false;
2426 return prog->expected_attach_type ==
2427 BPF_CGROUP_GETSOCKOPT;
2428 case offsetof(struct bpf_sockopt, optname):
2429 fallthrough;
2430 case offsetof(struct bpf_sockopt, level):
2431 if (size != size_default)
2432 return false;
2433 return prog->expected_attach_type ==
2434 BPF_CGROUP_SETSOCKOPT;
2435 case offsetof(struct bpf_sockopt, optlen):
2436 return size == size_default;
2437 default:
2438 return false;
2439 }
2440 }
2441
2442 switch (off) {
2443 case offsetof(struct bpf_sockopt, sk):
2444 if (size != sizeof(__u64))
2445 return false;
2446 info->reg_type = PTR_TO_SOCKET;
2447 break;
2448 case offsetof(struct bpf_sockopt, optval):
2449 if (size != sizeof(__u64))
2450 return false;
2451 info->reg_type = PTR_TO_PACKET;
2452 break;
2453 case offsetof(struct bpf_sockopt, optval_end):
2454 if (size != sizeof(__u64))
2455 return false;
2456 info->reg_type = PTR_TO_PACKET_END;
2457 break;
2458 case offsetof(struct bpf_sockopt, retval):
2459 if (size != size_default)
2460 return false;
2461 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2462 default:
2463 if (size != size_default)
2464 return false;
2465 break;
2466 }
2467 return true;
2468 }
2469
2470 #define CG_SOCKOPT_READ_FIELD(F) \
2471 BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
2472 si->dst_reg, si->src_reg, \
2473 offsetof(struct bpf_sockopt_kern, F))
2474
2475 #define CG_SOCKOPT_WRITE_FIELD(F) \
2476 BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
2477 BPF_MEM | BPF_CLASS(si->code)), \
2478 si->dst_reg, si->src_reg, \
2479 offsetof(struct bpf_sockopt_kern, F), \
2480 si->imm)
2481
2482 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2483 const struct bpf_insn *si,
2484 struct bpf_insn *insn_buf,
2485 struct bpf_prog *prog,
2486 u32 *target_size)
2487 {
2488 struct bpf_insn *insn = insn_buf;
2489
2490 switch (si->off) {
2491 case offsetof(struct bpf_sockopt, sk):
2492 *insn++ = CG_SOCKOPT_READ_FIELD(sk);
2493 break;
2494 case offsetof(struct bpf_sockopt, level):
2495 if (type == BPF_WRITE)
2496 *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
2497 else
2498 *insn++ = CG_SOCKOPT_READ_FIELD(level);
2499 break;
2500 case offsetof(struct bpf_sockopt, optname):
2501 if (type == BPF_WRITE)
2502 *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
2503 else
2504 *insn++ = CG_SOCKOPT_READ_FIELD(optname);
2505 break;
2506 case offsetof(struct bpf_sockopt, optlen):
2507 if (type == BPF_WRITE)
2508 *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
2509 else
2510 *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
2511 break;
2512 case offsetof(struct bpf_sockopt, retval):
2513 BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
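/* The loads below walk current_task->bpf_ctx, which points at the run_ctx
 * member of the struct bpf_cg_run_ctx set up by bpf_prog_run_array_cg().
 * Treating that pointer as the surrounding bpf_cg_run_ctx is only valid
 * because run_ctx is its first member, which the BUILD_BUG_ON above asserts.
 */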
2514
2515 if (type == BPF_WRITE) {
2516 int treg = BPF_REG_9;
2517
2518 if (si->src_reg == treg || si->dst_reg == treg)
2519 --treg;
2520 if (si->src_reg == treg || si->dst_reg == treg)
2521 --treg;
2522 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2523 offsetof(struct bpf_sockopt_kern, tmp_reg));
2524 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2525 treg, si->dst_reg,
2526 offsetof(struct bpf_sockopt_kern, current_task));
2527 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2528 treg, treg,
2529 offsetof(struct task_struct, bpf_ctx));
2530 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
2531 BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2532 treg, si->src_reg,
2533 offsetof(struct bpf_cg_run_ctx, retval),
2534 si->imm);
2535 *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2536 offsetof(struct bpf_sockopt_kern, tmp_reg));
2537 } else {
2538 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2539 si->dst_reg, si->src_reg,
2540 offsetof(struct bpf_sockopt_kern, current_task));
2541 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2542 si->dst_reg, si->dst_reg,
2543 offsetof(struct task_struct, bpf_ctx));
2544 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2545 si->dst_reg, si->dst_reg,
2546 offsetof(struct bpf_cg_run_ctx, retval));
2547 }
2548 break;
2549 case offsetof(struct bpf_sockopt, optval):
2550 *insn++ = CG_SOCKOPT_READ_FIELD(optval);
2551 break;
2552 case offsetof(struct bpf_sockopt, optval_end):
2553 *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
2554 break;
2555 }
2556
2557 return insn - insn_buf;
2558 }
2559
2560 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2561 bool direct_write,
2562 const struct bpf_prog *prog)
2563 {
2564 /* Nothing to do for the sockopt argument. The data is kzalloc'ed.
2565 */
2566 return 0;
2567 }
2568
2569 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2570 .get_func_proto = cg_sockopt_func_proto,
2571 .is_valid_access = cg_sockopt_is_valid_access,
2572 .convert_ctx_access = cg_sockopt_convert_ctx_access,
2573 .gen_prologue = cg_sockopt_get_prologue,
2574 };
2575
2576 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2577 };
2578
2579 /* Common helpers for cgroup hooks. */
2580 const struct bpf_func_proto *
2581 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2582 {
2583 switch (func_id) {
2584 case BPF_FUNC_get_local_storage:
2585 return &bpf_get_local_storage_proto;
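/* The retval helpers below are only offered to attach types whose program
 * return value is propagated to user space as an errno. The hooks excluded
 * in both lists use the return value differently (e.g. as a packet verdict
 * for ingress/egress) or ignore it, so there is no retval to get or set.
 */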
2586 case BPF_FUNC_get_retval:
2587 switch (prog->expected_attach_type) {
2588 case BPF_CGROUP_INET_INGRESS:
2589 case BPF_CGROUP_INET_EGRESS:
2590 case BPF_CGROUP_SOCK_OPS:
2591 case BPF_CGROUP_UDP4_RECVMSG:
2592 case BPF_CGROUP_UDP6_RECVMSG:
2593 case BPF_CGROUP_UNIX_RECVMSG:
2594 case BPF_CGROUP_INET4_GETPEERNAME:
2595 case BPF_CGROUP_INET6_GETPEERNAME:
2596 case BPF_CGROUP_UNIX_GETPEERNAME:
2597 case BPF_CGROUP_INET4_GETSOCKNAME:
2598 case BPF_CGROUP_INET6_GETSOCKNAME:
2599 case BPF_CGROUP_UNIX_GETSOCKNAME:
2600 return NULL;
2601 default:
2602 return &bpf_get_retval_proto;
2603 }
2604 case BPF_FUNC_set_retval:
2605 switch (prog->expected_attach_type) {
2606 case BPF_CGROUP_INET_INGRESS:
2607 case BPF_CGROUP_INET_EGRESS:
2608 case BPF_CGROUP_SOCK_OPS:
2609 case BPF_CGROUP_UDP4_RECVMSG:
2610 case BPF_CGROUP_UDP6_RECVMSG:
2611 case BPF_CGROUP_UNIX_RECVMSG:
2612 case BPF_CGROUP_INET4_GETPEERNAME:
2613 case BPF_CGROUP_INET6_GETPEERNAME:
2614 case BPF_CGROUP_UNIX_GETPEERNAME:
2615 case BPF_CGROUP_INET4_GETSOCKNAME:
2616 case BPF_CGROUP_INET6_GETSOCKNAME:
2617 case BPF_CGROUP_UNIX_GETSOCKNAME:
2618 return NULL;
2619 default:
2620 return &bpf_set_retval_proto;
2621 }
2622 default:
2623 return NULL;
2624 }
2625 }
2626