// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* The devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then, because the datapath does a lookup into the
 * netdev_map array (read-only) from an RCU critical section, we use call_rcu()
 * to wait for an RCU grace period before freeing the old data structures. This
 * ensures the datapath always has a valid copy. However, the datapath does a
 * "flush" operation that pushes any pending packets in the driver outside the
 * RCU critical section. Each bpf_dtab_netdev tracks these pending operations
 * using a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed
 * until this list is empty, indicating all outstanding flush operations have
 * completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that contain
 * a reference to the net device and remove them. This is a two step process:
 * (a) dereference the bpf_dtab_netdev object in netdev_map and (b) check
 * whether its ifindex is the same as that of the net_device being removed.
 * When removing the dev, a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map, we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to
 * be densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion is different.
 */
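/* For reference, a minimal sketch of the datapath user of this map type: an
 * XDP program calling bpf_redirect_map() with a DEVMAP. This is illustrative
 * only and not part of this file; the map name "tx_port", its size, and the
 * fixed key 0 are arbitrary choices.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_DEVMAP);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, sizeof(__u32));
 *		__uint(max_entries, 64);
 *	} tx_port SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_redirect_example(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}
 */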
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#define DEV_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct xdp_dev_bulk_queue {
        struct xdp_frame *q[DEV_MAP_BULK_SIZE];
        struct list_head flush_node;
        struct net_device *dev;
        struct net_device *dev_rx;
        struct bpf_prog *xdp_prog;
        unsigned int count;
};

struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
        struct hlist_node index_hlist;
        struct bpf_prog *xdp_prog;
        struct rcu_head rcu;
        unsigned int idx;
        struct bpf_devmap_val val;
};

struct bpf_dtab {
        struct bpf_map map;
        struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
        struct list_head list;

        /* these are only used for DEVMAP_HASH type maps */
        struct hlist_head *dev_index_head;
        spinlock_t index_lock;
        unsigned int items;
        u32 n_buckets;
};

static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static struct hlist_head *dev_map_create_hash(unsigned int entries,
                                              int numa_node)
{
        int i;
        struct hlist_head *hash;

        hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
        if (hash != NULL)
                for (i = 0; i < entries; i++)
                        INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
                                                    int idx)
{
        return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
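/* Validate the map attributes and allocate the type-specific lookup
 * structure: a power-of-two bucket array of hlist heads for DEVMAP_HASH,
 * or a flat RCU-protected array of netdev pointers for DEVMAP.
 */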
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
        u32 valsize = attr->value_size;

        /* check sanity of attributes. 2 value sizes supported:
         * 4 bytes: ifindex
         * 8 bytes: ifindex + prog fd
         */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
             valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
            attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                return -EINVAL;

        /* Lookup returns a pointer straight to dev->ifindex, so make sure the
         * verifier prevents writes from the BPF side
         */
        attr->map_flags |= BPF_F_RDONLY_PROG;

        bpf_map_init_from_attr(&dtab->map, attr);

        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                /* Hash table size must be a power of 2; roundup_pow_of_two()
                 * can overflow into UB on 32-bit arches, so check that first.
                 */
                if (dtab->map.max_entries > 1UL << 31)
                        return -EINVAL;

                dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);

                dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
                                                           dtab->map.numa_node);
                if (!dtab->dev_index_head)
                        return -ENOMEM;

                spin_lock_init(&dtab->index_lock);
        } else {
                dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
                                                      sizeof(struct bpf_dtab_netdev *),
                                                      dtab->map.numa_node);
                if (!dtab->netdev_map)
                        return -ENOMEM;
        }

        return 0;
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
        struct bpf_dtab *dtab;
        int err;

        dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
        if (!dtab)
                return ERR_PTR(-ENOMEM);

        err = dev_map_init_map(dtab, attr);
        if (err) {
                bpf_map_area_free(dtab);
                return ERR_PTR(err);
        }

        spin_lock(&dev_map_lock);
        list_add_tail_rcu(&dtab->list, &dev_map_list);
        spin_unlock(&dev_map_lock);

        return &dtab->map;
}

static void dev_map_free(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        int i;

        /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (there can be more than one that used this map) were
         * disconnected from events. The following synchronize_rcu() guarantees
         * that both rcu read critical sections complete and waits for
         * preempt-disable regions (NAPI being the relevant context here), so we
         * are certain there will be no further reads against the netdev_map
         * and all flush operations are complete. Flush operations can only be
         * done from NAPI context for this reason.
         */

        spin_lock(&dev_map_lock);
        list_del_rcu(&dtab->list);
        spin_unlock(&dev_map_lock);

        /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
         * during the NAPI callback and cleared after the XDP redirect. There
         * is no explicit RCU read section which protects
         * bpf_redirect_info->map, but local_bh_disable() also marks the
         * beginning of an RCU section. This makes the complete softirq
         * callback RCU protected. Thus, after the following synchronize_rcu()
         * no bpf_redirect_info->map == map assignment remains.
         */
        synchronize_rcu();

        /* Make sure prior __dev_map_entry_free() calls have completed. */
        rcu_barrier();

        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                for (i = 0; i < dtab->n_buckets; i++) {
                        struct bpf_dtab_netdev *dev;
                        struct hlist_head *head;
                        struct hlist_node *next;

                        head = dev_map_index_hash(dtab, i);

                        hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                                hlist_del_rcu(&dev->index_hlist);
                                if (dev->xdp_prog)
                                        bpf_prog_put(dev->xdp_prog);
                                dev_put(dev->dev);
                                kfree(dev);
                        }
                }

                bpf_map_area_free(dtab->dev_index_head);
        } else {
                for (i = 0; i < dtab->map.max_entries; i++) {
                        struct bpf_dtab_netdev *dev;

                        dev = rcu_dereference_raw(dtab->netdev_map[i]);
                        if (!dev)
                                continue;

                        if (dev->xdp_prog)
                                bpf_prog_put(dev->xdp_prog);
                        dev_put(dev->dev);
                        kfree(dev);
                }

                bpf_map_area_free(dtab->netdev_map);
        }

        bpf_map_area_free(dtab);
}
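/* DEVMAP (array) keys are plain indexes: the next key after @key is simply
 * key + 1. An out-of-range key restarts iteration at 0, and the last valid
 * index reports -ENOENT.
 */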
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 index = key ? *(u32 *)key : U32_MAX;
        u32 *next = next_key;

        if (index >= dtab->map.max_entries) {
                *next = 0;
                return 0;
        }

        if (index == dtab->map.max_entries - 1)
                return -ENOENT;
        *next = index + 1;
        return 0;
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct hlist_head *head = dev_map_index_hash(dtab, key);
        struct bpf_dtab_netdev *dev;

        hlist_for_each_entry_rcu(dev, head, index_hlist,
                                 lockdep_is_held(&dtab->index_lock))
                if (dev->idx == key)
                        return dev;

        return NULL;
}

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
                                     void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 idx, *next = next_key;
        struct bpf_dtab_netdev *dev, *next_dev;
        struct hlist_head *head;
        int i = 0;

        if (!key)
                goto find_first;

        idx = *(u32 *)key;

        dev = __dev_map_hash_lookup_elem(map, idx);
        if (!dev)
                goto find_first;

        next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
                                    struct bpf_dtab_netdev, index_hlist);

        if (next_dev) {
                *next = next_dev->idx;
                return 0;
        }

        i = idx & (dtab->n_buckets - 1);
        i++;

find_first:
        for (; i < dtab->n_buckets; i++) {
                head = dev_map_index_hash(dtab, i);

                next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
                                            struct bpf_dtab_netdev,
                                            index_hlist);
                if (next_dev) {
                        *next = next_dev->idx;
                        return 0;
                }
        }

        return -ENOENT;
}

static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
                                struct xdp_frame **frames, int n,
                                struct net_device *dev)
{
        struct xdp_txq_info txq = { .dev = dev };
        struct xdp_buff xdp;
        int i, nframes = 0;

        for (i = 0; i < n; i++) {
                struct xdp_frame *xdpf = frames[i];
                u32 act;
                int err;

                xdp_convert_frame_to_buff(xdpf, &xdp);
                xdp.txq = &txq;

                act = bpf_prog_run_xdp(xdp_prog, &xdp);
                switch (act) {
                case XDP_PASS:
                        err = xdp_update_frame_from_buff(&xdp, xdpf);
                        if (unlikely(err < 0))
                                xdp_return_frame_rx_napi(xdpf);
                        else
                                frames[nframes++] = xdpf;
                        break;
                default:
                        bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
                        fallthrough;
                case XDP_ABORTED:
                        trace_xdp_exception(dev, xdp_prog, act);
                        fallthrough;
                case XDP_DROP:
                        xdp_return_frame_rx_napi(xdpf);
                        break;
                }
        }
        return nframes; /* sent frames count */
}
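/* Transmit the frames queued on a per-CPU bulk queue. If the queue carries a
 * second-level XDP program, run it first and only hand the surviving frames
 * to ndo_xdp_xmit(); any frame the driver does not accept is freed here.
 */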
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
        struct net_device *dev = bq->dev;
        unsigned int cnt = bq->count;
        int sent = 0, err = 0;
        int to_send = cnt;
        int i;

        if (unlikely(!cnt))
                return;

        for (i = 0; i < cnt; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                prefetch(xdpf);
        }

        if (bq->xdp_prog) {
                to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
                if (!to_send)
                        goto out;
        }

        sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
        if (sent < 0) {
                /* If ndo_xdp_xmit fails with an errno, no frames have
                 * been xmit'ed.
                 */
                err = sent;
                sent = 0;
        }

        /* If not all frames have been transmitted, it is our
         * responsibility to free them
         */
        for (i = sent; unlikely(i < to_send); i++)
                xdp_return_frame_rx_napi(bq->q[i]);

out:
        bq->count = 0;
        trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}

/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
 * driver before returning from its napi->poll() routine. See the comment above
 * xdp_do_flush() in filter.c.
 */
void __dev_flush(void)
{
        struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
        struct xdp_dev_bulk_queue *bq, *tmp;

        list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
                bq_xmit_all(bq, XDP_XMIT_FLUSH);
                bq->dev_rx = NULL;
                bq->xdp_prog = NULL;
                __list_del_clearprev(&bq->flush_node);
        }
}

#ifdef CONFIG_DEBUG_NET
bool dev_check_flush(void)
{
        if (list_empty(this_cpu_ptr(&dev_flush_list)))
                return false;
        __dev_flush();
        return true;
}
#endif

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *obj;

        if (key >= map->max_entries)
                return NULL;

        obj = rcu_dereference_check(dtab->netdev_map[key],
                                    rcu_read_lock_bh_held());
        return obj;
}

/* Runs in NAPI, i.e. softirq under local_bh_disable(). Thus percpu variable
 * access is safe, and map elements stick around. See the comment above
 * xdp_do_flush() in filter.c.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
        struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
        struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

        if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
                bq_xmit_all(bq, 0);

        /* Ingress dev_rx will be the same for all xdp_frame's in the
         * bulk_queue, because bq is stored per-CPU and must be flushed
         * from the net_device driver's NAPI handler before it returns.
         *
         * Do the same with xdp_prog and flush_list since these fields
         * are only ever modified together.
         */
        if (!bq->dev_rx) {
                bq->dev_rx = dev_rx;
                bq->xdp_prog = xdp_prog;
                list_add(&bq->flush_node, flush_list);
        }

        bq->q[bq->count++] = xdpf;
}
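/* Common enqueue path for dev_xdp_enqueue() and dev_map_enqueue(): verify the
 * target device can transmit the frame via ndo_xdp_xmit (including
 * multi-buffer support when the frame carries fragments) before queueing it
 * on the per-CPU bulk queue.
 */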
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                                struct net_device *dev_rx,
                                struct bpf_prog *xdp_prog)
{
        int err;

        if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
                return -EOPNOTSUPP;

        if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
                     xdp_frame_has_frags(xdpf)))
                return -EOPNOTSUPP;

        err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
        if (unlikely(err))
                return err;

        bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
        return 0;
}

static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
        struct xdp_txq_info txq = { .dev = dst->dev };
        struct xdp_buff xdp;
        u32 act;

        if (!dst->xdp_prog)
                return XDP_PASS;

        __skb_pull(skb, skb->mac_len);
        xdp.txq = &txq;

        act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
        switch (act) {
        case XDP_PASS:
                __skb_push(skb, skb->mac_len);
                break;
        default:
                bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception(dst->dev, dst->xdp_prog, act);
                fallthrough;
        case XDP_DROP:
                kfree_skb(skb);
                break;
        }

        return act;
}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        struct net_device *dev = dst->dev;

        return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}

static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
        if (!obj)
                return false;

        if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
                return false;

        if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
                     xdp_frame_has_frags(xdpf)))
                return false;

        if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
                return false;

        return true;
}

static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
                                 struct net_device *dev_rx,
                                 struct xdp_frame *xdpf)
{
        struct xdp_frame *nxdpf;

        nxdpf = xdpf_clone(xdpf);
        if (!nxdpf)
                return -ENOMEM;

        bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);

        return 0;
}

static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
{
        while (num_excluded--) {
                if (ifindex == excluded[num_excluded])
                        return true;
        }
        return false;
}

/* Get ifindex of each upper device. 'indexes' must be able to hold at
 * least MAX_NEST_DEV elements.
 * Returns the number of ifindexes added.
 */
static int get_upper_ifindexes(struct net_device *dev, int *indexes)
{
        struct net_device *upper;
        struct list_head *iter;
        int n = 0;

        netdev_for_each_upper_dev_rcu(dev, upper, iter) {
                indexes[n++] = upper->ifindex;
        }
        return n;
}
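/* Broadcast an xdp_frame to every valid device in the map, optionally
 * excluding the ingress device and its upper devices. Only n - 1 clones are
 * made; the original frame is enqueued to the last destination, or returned
 * if the map holds no usable destination.
 */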
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                          struct bpf_map *map, bool exclude_ingress)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dst, *last_dst = NULL;
        int excluded_devices[1+MAX_NEST_DEV];
        struct hlist_head *head;
        int num_excluded = 0;
        unsigned int i;
        int err;

        if (exclude_ingress) {
                num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
                excluded_devices[num_excluded++] = dev_rx->ifindex;
        }

        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (i = 0; i < map->max_entries; i++) {
                        dst = rcu_dereference_check(dtab->netdev_map[i],
                                                    rcu_read_lock_bh_held());
                        if (!is_valid_dst(dst, xdpf))
                                continue;

                        if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
                                continue;

                        /* we only need n-1 clones; last_dst enqueued below */
                        if (!last_dst) {
                                last_dst = dst;
                                continue;
                        }

                        err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
                        if (err)
                                return err;

                        last_dst = dst;
                }
        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
                for (i = 0; i < dtab->n_buckets; i++) {
                        head = dev_map_index_hash(dtab, i);
                        hlist_for_each_entry_rcu(dst, head, index_hlist,
                                                 lockdep_is_held(&dtab->index_lock)) {
                                if (!is_valid_dst(dst, xdpf))
                                        continue;

                                if (is_ifindex_excluded(excluded_devices, num_excluded,
                                                        dst->dev->ifindex))
                                        continue;

                                /* we only need n-1 clones; last_dst enqueued below */
                                if (!last_dst) {
                                        last_dst = dst;
                                        continue;
                                }

                                err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
                                if (err)
                                        return err;

                                last_dst = dst;
                        }
                }
        }

        /* consume the last copy of the frame */
        if (last_dst)
                bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
        else
                xdp_return_frame_rx_napi(xdpf); /* dtab is empty */

        return 0;
}
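/* Generic-XDP (skb) counterpart of dev_map_enqueue(): run the destination's
 * devmap program, if any, and transmit the skb via generic_xdp_tx(). The
 * redirect has already succeeded from the caller's point of view, so a drop
 * by the devmap program still returns 0.
 */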
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             struct bpf_prog *xdp_prog)
{
        int err;

        err = xdp_ok_fwd_dev(dst->dev, skb->len);
        if (unlikely(err))
                return err;

        /* Redirect has already succeeded semantically at this point, so we just
         * return 0 even if packet is dropped. Helper below takes care of
         * freeing skb.
         */
        if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
                return 0;

        skb->dev = dst->dev;
        generic_xdp_tx(skb, xdp_prog);

        return 0;
}

static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
                                  struct sk_buff *skb,
                                  struct bpf_prog *xdp_prog)
{
        struct sk_buff *nskb;
        int err;

        nskb = skb_clone(skb, GFP_ATOMIC);
        if (!nskb)
                return -ENOMEM;

        err = dev_map_generic_redirect(dst, nskb, xdp_prog);
        if (unlikely(err)) {
                consume_skb(nskb);
                return err;
        }

        return 0;
}

int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                           struct bpf_prog *xdp_prog, struct bpf_map *map,
                           bool exclude_ingress)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dst, *last_dst = NULL;
        int excluded_devices[1+MAX_NEST_DEV];
        struct hlist_head *head;
        struct hlist_node *next;
        int num_excluded = 0;
        unsigned int i;
        int err;

        if (exclude_ingress) {
                num_excluded = get_upper_ifindexes(dev, excluded_devices);
                excluded_devices[num_excluded++] = dev->ifindex;
        }

        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (i = 0; i < map->max_entries; i++) {
                        dst = rcu_dereference_check(dtab->netdev_map[i],
                                                    rcu_read_lock_bh_held());
                        if (!dst)
                                continue;

                        if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
                                continue;

                        /* we only need n-1 clones; last_dst enqueued below */
                        if (!last_dst) {
                                last_dst = dst;
                                continue;
                        }

                        err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
                        if (err)
                                return err;

                        last_dst = dst;
                }
        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
                for (i = 0; i < dtab->n_buckets; i++) {
                        head = dev_map_index_hash(dtab, i);
                        hlist_for_each_entry_safe(dst, next, head, index_hlist) {
                                if (is_ifindex_excluded(excluded_devices, num_excluded,
                                                        dst->dev->ifindex))
                                        continue;

                                /* we only need n-1 clones; last_dst enqueued below */
                                if (!last_dst) {
                                        last_dst = dst;
                                        continue;
                                }

                                err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
                                if (err)
                                        return err;

                                last_dst = dst;
                        }
                }
        }

        /* consume the first skb and return */
        if (last_dst)
                return dev_map_generic_redirect(last_dst, skb, xdp_prog);

        /* dtab is empty */
        consume_skb(skb);
        return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);

        return obj ? &obj->val : NULL;
}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
                                                                 *(u32 *)key);
        return obj ? &obj->val : NULL;
}
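/* RCU callback used when an entry is replaced, deleted or evicted by the
 * unregister notifier: drop the entry's program and net_device references and
 * free it once no datapath user can still see it.
 */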
static void __dev_map_entry_free(struct rcu_head *rcu)
{
        struct bpf_dtab_netdev *dev;

        dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
        if (dev->xdp_prog)
                bpf_prog_put(dev->xdp_prog);
        dev_put(dev->dev);
        kfree(dev);
}

static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *old_dev;
        int k = *(u32 *)key;

        if (k >= map->max_entries)
                return -EINVAL;

        old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
        if (old_dev) {
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
                atomic_dec((atomic_t *)&dtab->items);
        }
        return 0;
}

static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *old_dev;
        int k = *(u32 *)key;
        unsigned long flags;
        int ret = -ENOENT;

        spin_lock_irqsave(&dtab->index_lock, flags);

        old_dev = __dev_map_hash_lookup_elem(map, k);
        if (old_dev) {
                dtab->items--;
                hlist_del_init_rcu(&old_dev->index_hlist);
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
                ret = 0;
        }
        spin_unlock_irqrestore(&dtab->index_lock, flags);

        return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                    struct bpf_dtab *dtab,
                                                    struct bpf_devmap_val *val,
                                                    unsigned int idx)
{
        struct bpf_prog *prog = NULL;
        struct bpf_dtab_netdev *dev;

        dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
                                   GFP_NOWAIT | __GFP_NOWARN,
                                   dtab->map.numa_node);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        dev->dev = dev_get_by_index(net, val->ifindex);
        if (!dev->dev)
                goto err_out;

        if (val->bpf_prog.fd > 0) {
                prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
                                             BPF_PROG_TYPE_XDP, false);
                if (IS_ERR(prog))
                        goto err_put_dev;
                if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
                    !bpf_prog_map_compatible(&dtab->map, prog))
                        goto err_put_prog;
        }

        dev->idx = idx;
        if (prog) {
                dev->xdp_prog = prog;
                dev->val.bpf_prog.id = prog->aux->id;
        } else {
                dev->xdp_prog = NULL;
                dev->val.bpf_prog.id = 0;
        }
        dev->val.ifindex = val->ifindex;

        return dev;
err_put_prog:
        bpf_prog_put(prog);
err_put_dev:
        dev_put(dev->dev);
err_out:
        kfree(dev);
        return ERR_PTR(-EINVAL);
}
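/* Update path for the DEVMAP (array) type: build a new bpf_dtab_netdev from
 * the supplied value (or NULL when ifindex is 0) and atomically swap it into
 * the slot with xchg(); the old entry, if any, is freed after an RCU grace
 * period. BPF_NOEXIST cannot be supported on an array, so it is rejected.
 */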
static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
                                  void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        struct bpf_devmap_val val = {};
        u32 i = *(u32 *)key;

        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
        if (unlikely(i >= dtab->map.max_entries))
                return -E2BIG;
        if (unlikely(map_flags == BPF_NOEXIST))
                return -EEXIST;

        /* already verified value_size <= sizeof val */
        memcpy(&val, value, map->value_size);

        if (!val.ifindex) {
                dev = NULL;
                /* can not specify fd if ifindex is 0 */
                if (val.bpf_prog.fd > 0)
                        return -EINVAL;
        } else {
                dev = __dev_map_alloc_node(net, dtab, &val, i);
                if (IS_ERR(dev))
                        return PTR_ERR(dev);
        }

        /* Use call_rcu() here to ensure that RCU critical sections have
         * completed. Remember that the driver side flush operation will
         * happen before the net device is removed.
         */
        old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
        else
                atomic_inc((atomic_t *)&dtab->items);

        return 0;
}

static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                                u64 map_flags)
{
        return __dev_map_update_elem(current->nsproxy->net_ns,
                                     map, key, value, map_flags);
}

static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
                                       void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        struct bpf_devmap_val val = {};
        u32 idx = *(u32 *)key;
        unsigned long flags;
        int err = -EEXIST;

        /* already verified value_size <= sizeof val */
        memcpy(&val, value, map->value_size);

        if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
                return -EINVAL;

        spin_lock_irqsave(&dtab->index_lock, flags);

        old_dev = __dev_map_hash_lookup_elem(map, idx);
        if (old_dev && (map_flags & BPF_NOEXIST))
                goto out_err;

        dev = __dev_map_alloc_node(net, dtab, &val, idx);
        if (IS_ERR(dev)) {
                err = PTR_ERR(dev);
                goto out_err;
        }

        if (old_dev) {
                hlist_del_rcu(&old_dev->index_hlist);
        } else {
                if (dtab->items >= dtab->map.max_entries) {
                        spin_unlock_irqrestore(&dtab->index_lock, flags);
                        call_rcu(&dev->rcu, __dev_map_entry_free);
                        return -E2BIG;
                }
                dtab->items++;
        }

        hlist_add_head_rcu(&dev->index_hlist,
                           dev_map_index_hash(dtab, idx));
        spin_unlock_irqrestore(&dtab->index_lock, flags);

        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        return 0;

out_err:
        spin_unlock_irqrestore(&dtab->index_lock, flags);
        return err;
}

static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
                                     u64 map_flags)
{
        return __dev_map_hash_update_elem(current->nsproxy->net_ns,
                                          map, key, value, map_flags);
}

static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
        return __bpf_xdp_redirect_map(map, ifindex, flags,
                                      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
                                      __dev_map_lookup_elem);
}

static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
        return __bpf_xdp_redirect_map(map, ifindex, flags,
                                      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
                                      __dev_map_hash_lookup_elem);
}
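/* Rough memory accounting for bpf_map_info: the bpf_dtab itself, the
 * type-specific lookup structure, plus one bpf_dtab_netdev per live entry
 * (tracked in dtab->items).
 */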
static u64 dev_map_mem_usage(const struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u64 usage = sizeof(struct bpf_dtab);

        if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
                usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
        else
                usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
        usage += atomic_read((atomic_t *)&dtab->items) *
                         (u64)sizeof(struct bpf_dtab_netdev);
        return usage;
}

BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_get_next_key,
        .map_lookup_elem = dev_map_lookup_elem,
        .map_update_elem = dev_map_update_elem,
        .map_delete_elem = dev_map_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_mem_usage = dev_map_mem_usage,
        .map_btf_id = &dev_map_btf_ids[0],
        .map_redirect = dev_map_redirect,
};

const struct bpf_map_ops dev_map_hash_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_hash_get_next_key,
        .map_lookup_elem = dev_map_hash_lookup_elem,
        .map_update_elem = dev_map_hash_update_elem,
        .map_delete_elem = dev_map_hash_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_mem_usage = dev_map_mem_usage,
        .map_btf_id = &dev_map_btf_ids[0],
        .map_redirect = dev_hash_map_redirect,
};

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
                                       struct net_device *netdev)
{
        unsigned long flags;
        u32 i;

        spin_lock_irqsave(&dtab->index_lock, flags);
        for (i = 0; i < dtab->n_buckets; i++) {
                struct bpf_dtab_netdev *dev;
                struct hlist_head *head;
                struct hlist_node *next;

                head = dev_map_index_hash(dtab, i);

                hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                        if (netdev != dev->dev)
                                continue;

                        dtab->items--;
                        hlist_del_rcu(&dev->index_hlist);
                        call_rcu(&dev->rcu, __dev_map_entry_free);
                }
        }
        spin_unlock_irqrestore(&dtab->index_lock, flags);
}

static int dev_map_notification(struct notifier_block *notifier,
                                ulong event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct bpf_dtab *dtab;
        int i, cpu;

        switch (event) {
        case NETDEV_REGISTER:
                if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
                        break;

                /* will be freed in free_netdev() */
                netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
                if (!netdev->xdp_bulkq)
                        return NOTIFY_BAD;

                for_each_possible_cpu(cpu)
                        per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
                break;
        case NETDEV_UNREGISTER:
                /* This rcu_read_lock/unlock pair is needed because
                 * dev_map_list is an RCU list AND to ensure a delete
                 * operation does not free a netdev_map entry while we
                 * are comparing it against the netdev being unregistered.
                 */
                rcu_read_lock();
                list_for_each_entry_rcu(dtab, &dev_map_list, list) {
                        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                                dev_map_hash_remove_netdev(dtab, netdev);
                                continue;
                        }

                        for (i = 0; i < dtab->map.max_entries; i++) {
                                struct bpf_dtab_netdev *dev, *odev;

                                dev = rcu_dereference(dtab->netdev_map[i]);
                                if (!dev || netdev != dev->dev)
                                        continue;
                                odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
                                if (dev == odev) {
                                        call_rcu(&dev->rcu,
                                                 __dev_map_entry_free);
                                        atomic_dec((atomic_t *)&dtab->items);
                                }
                        }
                }
                rcu_read_unlock();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
        .notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
        int cpu;

        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                     offsetof(struct _bpf_dtab_netdev, dev));
        register_netdevice_notifier(&dev_map_notifier);

        for_each_possible_cpu(cpu)
                INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
        return 0;
}

subsys_initcall(dev_map_init);