// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */

/**
 * DOC: cpu map
 * The 'cpumap' is primarily used as a backend map for the XDP BPF helper
 * call bpf_redirect_map() and the XDP_REDIRECT action, like 'devmap'.
 *
 * Unlike devmap, which redirects XDP frames out to another NIC device,
 * this map type redirects raw XDP frames to another CPU. The remote
 * CPU will do the SKB allocation and call the normal network stack.
 */
/*
 * This is a scalability and isolation mechanism that allows separating
 * the early driver network XDP layer from the rest of the netstack, and
 * assigning dedicated CPUs for this stage. This basically allows for
 * 10G wirespeed pre-filtering via bpf.
 *
 * An illustrative usage sketch of the BPF program side is included as a
 * comment after cpu_map_alloc() below.
 */
#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
#include <net/hotdata.h>

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#include <linux/netdevice.h>	/* netif_receive_skb_list */
#include <linux/etherdevice.h>	/* eth_type_trans */

/* General idea: XDP packets that get XDP-redirected to another CPU are
 * stored/queued for at most one driver ->poll() call. It is guaranteed
 * that queueing the frame and the flush operation happen on the same
 * CPU. Thus, the cpu_map_flush operation can deduce via this_cpu_ptr()
 * which queue in bpf_cpu_map_entry contains packets.
 */

#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
struct bpf_cpu_map_entry;
struct bpf_cpu_map;

struct xdp_bulk_queue {
	void *q[CPU_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct bpf_cpu_map_entry *obj;
	unsigned int count;
};

/* Struct for every remote "destination" CPU in map */
struct bpf_cpu_map_entry {
	u32 cpu;    /* kthread CPU and map index */
	int map_id; /* Back reference to map */

	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
	struct xdp_bulk_queue __percpu *bulkq;

	/* Queue with potential multi-producers, and single-consumer kthread */
	struct ptr_ring *queue;
	struct task_struct *kthread;

	struct bpf_cpumap_val value;
	struct bpf_prog *prog;

	struct completion kthread_running;
	struct rcu_work free_work;
};

struct bpf_cpu_map {
	struct bpf_map map;
	/* Below members specific for map type */
	struct bpf_cpu_map_entry __rcu **cpu_map;
};

static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);

static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_cpu_map *cmap;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
	     value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
	    attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	/* Pre-limit array size based on NR_CPUS, not final CPU check */
	if (attr->max_entries > NR_CPUS)
		return ERR_PTR(-E2BIG);

	cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
	if (!cmap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&cmap->map, attr);

	/* Alloc array for possible remote "destination" CPUs */
	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
					   sizeof(struct bpf_cpu_map_entry *),
					   cmap->map.numa_node);
	if (!cmap->cpu_map) {
		bpf_map_area_free(cmap);
		return ERR_PTR(-ENOMEM);
	}

	return &cmap->map;
}
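
/* Usage sketch (not part of this file): a minimal XDP program on the BPF side
 * that steers packets through a cpumap. This is only a hedged illustration,
 * assuming libbpf-style map definitions and <bpf/bpf_helpers.h>; the map name,
 * max_entries and the fixed destination CPU are made up for the example.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_CPUMAP);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, sizeof(struct bpf_cpumap_val));
 *		__uint(max_entries, 16);
 *	} cpu_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int steer_to_cpu(struct xdp_md *ctx)
 *	{
 *		__u32 target_cpu = 2;	// illustrative fixed destination
 *
 *		// With flags == 0 a failed lookup results in XDP_ABORTED; the
 *		// low bits of flags can encode a fallback action instead.
 *		return bpf_redirect_map(&cpu_map, target_cpu, 0);
 *	}
 */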

static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
	/* The tear-down procedure should have made sure that the queue is
	 * empty. See __cpu_map_entry_replace() and the kthread_stop() call
	 * in __cpu_map_entry_free(). Catch any broken behaviour gracefully
	 * and warn once.
	 */
	void *ptr;

	while ((ptr = ptr_ring_consume(ring))) {
		WARN_ON_ONCE(1);
		if (unlikely(__ptr_test_bit(0, &ptr))) {
			__ptr_clear_bit(0, &ptr);
			kfree_skb(ptr);
			continue;
		}
		xdp_return_frame(ptr);
	}
}

static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
				     struct list_head *listp,
				     struct xdp_cpumap_stats *stats)
{
	struct sk_buff *skb, *tmp;
	struct xdp_buff xdp;
	u32 act;
	int err;

	list_for_each_entry_safe(skb, tmp, listp, list) {
		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_REDIRECT:
			skb_list_del_init(skb);
			err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
						      rcpu->prog);
			if (unlikely(err)) {
				kfree_skb(skb);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			return;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(skb->dev, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			skb_list_del_init(skb);
			kfree_skb(skb);
			stats->drop++;
			return;
		}
	}
}

static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
				    void **frames, int n,
				    struct xdp_cpumap_stats *stats)
{
	struct xdp_rxq_info rxq = {};
	struct xdp_buff xdp;
	int i, nframes = 0;

	xdp_set_return_frame_no_direct();
	xdp.rxq = &rxq;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		u32 act;
		int err;

		rxq.dev = xdpf->dev_rx;
		rxq.mem = xdpf->mem;
		/* TODO: report queue_index to xdp_rxq_info */

		xdp_convert_frame_to_buff(xdpf, &xdp);

		act = bpf_prog_run_xdp(rcpu->prog, &xdp);
		switch (act) {
		case XDP_PASS:
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (err < 0) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				frames[nframes++] = xdpf;
				stats->pass++;
			}
			break;
		case XDP_REDIRECT:
			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
					      rcpu->prog);
			if (unlikely(err)) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			break;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			xdp_return_frame(xdpf);
			stats->drop++;
			break;
		}
	}

	xdp_clear_return_frame_no_direct();

	return nframes;
}

#define CPUMAP_BATCH 8

static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
				int xdp_n, struct xdp_cpumap_stats *stats,
				struct list_head *list)
{
	int nframes;

	if (!rcpu->prog)
		return xdp_n;

	rcu_read_lock_bh();

	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);

	if (stats->redirect)
		xdp_do_flush();

	if (unlikely(!list_empty(list)))
		cpu_map_bpf_prog_run_skb(rcpu, list, stats);

	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */

	return nframes;
}
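
/* Sketch of the optional second XDP program that cpu_map_bpf_prog_run() runs
 * on the destination CPU. It is loaded through bpf_cpumap_val.bpf_prog.fd
 * (see __cpu_map_load_bpf_program() below) and must be BPF_PROG_TYPE_XDP with
 * expected attach type BPF_XDP_CPUMAP. The section name follows current
 * libbpf conventions and the body is illustrative only.
 *
 *	SEC("xdp/cpumap")
 *	int cpumap_post_filter(struct xdp_md *ctx)
 *	{
 *		// Runs on the remote kthread CPU for every queued frame.
 *		// XDP_PASS: the frame is turned into an skb and fed to the
 *		//           network stack on this CPU.
 *		// XDP_REDIRECT: e.g. forward to a devmap for transmission.
 *		// XDP_DROP: the frame is returned/freed here.
 *		return XDP_PASS;
 *	}
 */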

static int cpu_map_kthread_run(void *data)
{
	struct bpf_cpu_map_entry *rcpu = data;

	complete(&rcpu->kthread_running);
	set_current_state(TASK_INTERRUPTIBLE);

	/* When the kthread is given the stop order, the rcpu has already been
	 * disconnected from the map, so no new packets can enter. Remaining
	 * in-flight per-CPU stored packets are flushed to this queue. Keep
	 * honoring the kthread_stop signal, but only exit once the queue is
	 * empty.
	 */
	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
		struct xdp_cpumap_stats stats = {}; /* zero stats */
		unsigned int kmem_alloc_drops = 0, sched = 0;
		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
		int i, n, m, nframes, xdp_n;
		void *frames[CPUMAP_BATCH];
		void *skbs[CPUMAP_BATCH];
		LIST_HEAD(list);

		/* Release the CPU: sleep if the queue is empty, otherwise just
		 * offer a reschedule point.
		 */
		if (__ptr_ring_empty(rcpu->queue)) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Recheck to avoid lost wake-up */
			if (__ptr_ring_empty(rcpu->queue)) {
				schedule();
				sched = 1;
			} else {
				__set_current_state(TASK_RUNNING);
			}
		} else {
			sched = cond_resched();
		}

		/*
		 * The bpf_cpu_map_entry has a single consumer, this CPU-pinned
		 * kthread. Lockless access to the ptr_ring consume side is
		 * valid because the queue is never resized.
		 */
		n = __ptr_ring_consume_batched(rcpu->queue, frames,
					       CPUMAP_BATCH);
		for (i = 0, xdp_n = 0; i < n; i++) {
			void *f = frames[i];
			struct page *page;

			/* Pointers with bit 0 set are sk_buffs enqueued by
			 * cpu_map_generic_redirect(); everything else is an
			 * xdp_frame. Split them into the skb list and the
			 * frames[] array accordingly.
			 */
			if (unlikely(__ptr_test_bit(0, &f))) {
				struct sk_buff *skb = f;

				__ptr_clear_bit(0, &skb);
				list_add_tail(&skb->list, &list);
				continue;
			}

			frames[xdp_n++] = f;
			page = virt_to_page(f);

			/* Bring the struct page memory area to the current
			 * CPU. It is read by build_skb_around() via
			 * page_is_pfmemalloc(), and written to by
			 * page_frag_free() when the frame is freed.
			 */
			prefetchw(page);
		}

		/* Support running another XDP prog on this CPU */
		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
		if (nframes) {
			m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						  gfp, nframes, skbs);
			if (unlikely(m == 0)) {
				for (i = 0; i < nframes; i++)
					skbs[i] = NULL; /* effect: xdp_return_frame */
				kmem_alloc_drops += nframes;
			}
		}

		local_bh_disable();
		for (i = 0; i < nframes; i++) {
			struct xdp_frame *xdpf = frames[i];
			struct sk_buff *skb = skbs[i];

			skb = __xdp_build_skb_from_frame(xdpf, skb,
							 xdpf->dev_rx);
			if (!skb) {
				xdp_return_frame(xdpf);
				continue;
			}

			list_add_tail(&skb->list, &list);
		}
		netif_receive_skb_list(&list);

		/* Feedback loop via tracepoint */
		trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
					 sched, &stats);

		local_bh_enable(); /* resched point, may call do_softirq() */
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
				      struct bpf_map *map, int fd)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
	    !bpf_prog_map_compatible(map, prog)) {
		bpf_prog_put(prog);
		return -EINVAL;
	}

	rcpu->value.bpf_prog.id = prog->aux->id;
	rcpu->prog = prog;

	return 0;
}

static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
		      u32 cpu)
{
	int numa, err, i, fd = value->bpf_prog.fd;
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct bpf_cpu_map_entry *rcpu;
	struct xdp_bulk_queue *bq;

	/* Have map->numa_node, but choose the node of the redirect target CPU */
	numa = cpu_to_node(cpu);

	rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
	if (!rcpu)
		return NULL;

	/* Alloc percpu bulkq */
	rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
					   sizeof(void *), gfp);
	if (!rcpu->bulkq)
		goto free_rcu;

	for_each_possible_cpu(i) {
		bq = per_cpu_ptr(rcpu->bulkq, i);
		bq->obj = rcpu;
	}

	/* Alloc queue */
	rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
					   numa);
	if (!rcpu->queue)
		goto free_bulkq;

	err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
	if (err)
		goto free_queue;

	rcpu->cpu = cpu;
	rcpu->map_id = map->id;
	rcpu->value.qsize = value->qsize;

	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
		goto free_ptr_ring;

	/* Setup kthread */
	init_completion(&rcpu->kthread_running);
	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
					       "cpumap/%d/map:%d", cpu,
					       map->id);
	if (IS_ERR(rcpu->kthread))
		goto free_prog;

	/* Make sure the kthread runs on a single CPU */
	kthread_bind(rcpu->kthread, cpu);
	wake_up_process(rcpu->kthread);

	/* Make sure the kthread has started running, so kthread_stop() will
	 * not stop it prematurely, and all pending frames or skbs will be
	 * handled by the kthread before kthread_stop() returns.
	 */
	wait_for_completion(&rcpu->kthread_running);

	return rcpu;

free_prog:
	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
free_ptr_ring:
	ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
	kfree(rcpu->queue);
free_bulkq:
	free_percpu(rcpu->bulkq);
free_rcu:
	kfree(rcpu);
	return NULL;
}

static void __cpu_map_entry_free(struct work_struct *work)
{
	struct bpf_cpu_map_entry *rcpu;

	/* This cpu_map_entry has been disconnected from the map and one
	 * RCU grace period has elapsed. Thus, XDP cannot queue any
	 * new packets and cannot change/set flush_needed that can
	 * find this entry.
	 */
	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);

	/* kthread_stop will wake_up_process and wait for it to complete.
	 * cpu_map_kthread_run() makes sure the pointer ring is empty
	 * before exiting.
	 */
	kthread_stop(rcpu->kthread);

	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
	/* The queue should be empty at this point */
	__cpu_map_ring_cleanup(rcpu->queue);
	ptr_ring_cleanup(rcpu->queue, NULL);
	kfree(rcpu->queue);
	free_percpu(rcpu->bulkq);
	kfree(rcpu);
}

/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
 * entry is no longer in use before freeing. We use queue_rcu_work() to call
 * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
 * period. This means that (a) all pending enqueue and flush operations have
 * completed (because of the RCU callback), and (b) we are in a workqueue
 * context where we can stop the kthread and wait for it to exit before freeing
 * everything.
 */
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
	struct bpf_cpu_map_entry *old_rcpu;

	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
	if (old_rcpu) {
		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
		queue_rcu_work(system_wq, &old_rcpu->free_work);
	}
}

static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 key_cpu = *(u32 *)key;

	if (key_cpu >= map->max_entries)
		return -EINVAL;

	/* notice caller map_delete_elem() uses rcu_read_lock() */
	__cpu_map_entry_replace(cmap, key_cpu, NULL);
	return 0;
}

static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpumap_val cpumap_value = {};
	struct bpf_cpu_map_entry *rcpu;
	/* Array index key corresponds to the CPU number */
	u32 key_cpu = *(u32 *)key;

	memcpy(&cpumap_value, value, map->value_size);

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(key_cpu >= cmap->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;
	if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
		return -EOVERFLOW;

	/* Make sure CPU is a valid possible cpu */
	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
		return -ENODEV;

	if (cpumap_value.qsize == 0) {
		rcpu = NULL; /* Same as deleting */
	} else {
		/* Updating qsize causes re-allocation of the bpf_cpu_map_entry */
		rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
		if (!rcpu)
			return -ENOMEM;
	}
	rcu_read_lock();
	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
	rcu_read_unlock();
	return 0;
}
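
/* Userspace sketch of driving the update path above with libbpf, assuming
 * <bpf/bpf.h>. The queue size, CPU index and max_entries are illustrative;
 * prog_fd stands for an already-loaded BPF_PROG_TYPE_XDP program with
 * expected attach type BPF_XDP_CPUMAP (a value containing only qsize is also
 * accepted, see cpu_map_alloc()).
 *
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_CPUMAP, "cpu_map",
 *				    sizeof(__u32),
 *				    sizeof(struct bpf_cpumap_val), 16, NULL);
 *	struct bpf_cpumap_val val = {
 *		.qsize = 2048,		// ptr_ring slots, must be <= 16384
 *		.bpf_prog.fd = prog_fd,	// optional; fd <= 0 means no program
 *	};
 *	__u32 cpu = 2;			// key == destination CPU number
 *
 *	bpf_map_update_elem(map_fd, &cpu, &val, 0);	// BPF_ANY
 *	...
 *	val.qsize = 0;					// same as delete
 *	bpf_map_update_elem(map_fd, &cpu, &val, 0);
 */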

static void cpu_map_free(struct bpf_map *map)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the bpf programs (there can be more than one that used this map)
	 * were disconnected from events. Wait for outstanding critical
	 * sections in these programs to complete. synchronize_rcu() below not
	 * only guarantees no further "XDP/bpf-side" reads against
	 * bpf_cpu_map->cpu_map, but also ensures pending flush operations
	 * (if any) are completed.
	 */
	synchronize_rcu();

	/* The only possible user of bpf_cpu_map_entry is
	 * cpu_map_kthread_run().
	 */
	for (i = 0; i < cmap->map.max_entries; i++) {
		struct bpf_cpu_map_entry *rcpu;

		rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
		if (!rcpu)
			continue;

		/* Stop kthread and cleanup entry directly */
		__cpu_map_entry_free(&rcpu->free_work.work);
	}
	bpf_map_area_free(cmap->cpu_map);
	bpf_map_area_free(cmap);
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpu_map_entry *rcpu;

	if (key >= map->max_entries)
		return NULL;

	rcpu = rcu_dereference_check(cmap->cpu_map[key],
				     rcu_read_lock_bh_held());
	return rcpu;
}

static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map_entry *rcpu =
		__cpu_map_lookup_elem(map, *(u32 *)key);

	return rcpu ? &rcpu->value : NULL;
}

static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= cmap->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == cmap->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
	return __bpf_xdp_redirect_map(map, index, flags, 0,
				      __cpu_map_lookup_elem);
}

static u64 cpu_map_mem_usage(const struct bpf_map *map)
{
	u64 usage = sizeof(struct bpf_cpu_map);

	/* Currently the dynamically allocated elements are not counted */
	usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
	return usage;
}

BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
	.map_meta_equal		= bpf_map_meta_equal,
	.map_alloc		= cpu_map_alloc,
	.map_free		= cpu_map_free,
	.map_delete_elem	= cpu_map_delete_elem,
	.map_update_elem	= cpu_map_update_elem,
	.map_lookup_elem	= cpu_map_lookup_elem,
	.map_get_next_key	= cpu_map_get_next_key,
	.map_check_btf		= map_check_no_btf,
	.map_mem_usage		= cpu_map_mem_usage,
	.map_btf_id		= &cpu_map_btf_ids[0],
	.map_redirect		= cpu_map_redirect,
};

static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
	struct bpf_cpu_map_entry *rcpu = bq->obj;
	unsigned int processed = 0, drops = 0;
	const int to_cpu = rcpu->cpu;
	struct ptr_ring *q;
	int i;

	if (unlikely(!bq->count))
		return;

	q = rcpu->queue;
	spin_lock(&q->producer_lock);

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];
		int err;

		err = __ptr_ring_produce(q, xdpf);
		if (err) {
			drops++;
			xdp_return_frame_rx_napi(xdpf);
		}
		processed++;
	}
	bq->count = 0;
	spin_unlock(&q->producer_lock);

	__list_del_clearprev(&bq->flush_node);

	/* Feedback loop via tracepoints */
	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
		bq_flush_to_queue(bq);

	/* Notice, the xdp_buff/page MUST be queued here, long enough for the
	 * driver code invoking us to finish, due to driver (e.g. ixgbe)
	 * recycle tricks based on page-refcnt.
	 *
	 * Thus, the incoming xdp_frame is always queued here (else we race
	 * with another CPU on page-refcnt and the remaining driver code).
	 * Queue time is very short, as the driver will invoke the flush
	 * operation when completing its napi->poll call.
	 */
	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);
}
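
/* Producer-side overview (see the "General idea" comment at the top of this
 * file): native XDP frames enter through cpu_map_enqueue() below and are
 * bulked on the per-CPU bulkq; generic-XDP skbs enter through
 * cpu_map_generic_redirect() with pointer bit 0 set as a tag. The bulkq is
 * drained into the entry's ptr_ring by __cpu_map_flush(), invoked from
 * xdp_do_flush() at the end of the same NAPI poll that enqueued the frames,
 * which then wakes the consumer kthread.
 */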

int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	/* Info needed when constructing SKB on remote CPU */
	xdpf->dev_rx = dev_rx;

	bq_enqueue(rcpu, xdpf);
	return 0;
}

int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
			     struct sk_buff *skb)
{
	int ret;

	__skb_pull(skb, skb->mac_len);
	skb_set_redirected(skb, false);
	__ptr_set_bit(0, &skb);

	ret = ptr_ring_produce(rcpu->queue, skb);
	if (ret < 0)
		goto trace;

	wake_up_process(rcpu->kthread);
trace:
	trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
	return ret;
}

void __cpu_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_flush_to_queue(bq);

		/* If already running, costs spin_lock_irqsave + smp_mb */
		wake_up_process(bq->obj->kthread);
	}
}

#ifdef CONFIG_DEBUG_NET
bool cpu_map_check_flush(void)
{
	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
		return false;
	__cpu_map_flush();
	return true;
}
#endif

static int __init cpu_map_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
	return 0;
}

subsys_initcall(cpu_map_init);