// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */

/**
 * DOC: cpu map
 * The 'cpumap' is primarily used as a backend map for the XDP BPF helper
 * call bpf_redirect_map() and the XDP_REDIRECT action, like 'devmap'.
 *
 * Unlike devmap, which redirects XDP frames out to another NIC device,
 * this map type redirects raw XDP frames to another CPU. The remote
 * CPU will do the SKB allocation and call the normal network stack.
 */
/*
 * This is a scalability and isolation mechanism that allows
 * separating the early driver network XDP layer from the rest of the
 * netstack, and assigning dedicated CPUs for this stage. This
 * basically allows for 10G wirespeed pre-filtering via bpf.
 */
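
/* A minimal BPF-side usage sketch (illustrative only, not part of this file):
 * an XDP program that redirects packets into a BPF_MAP_TYPE_CPUMAP via
 * bpf_redirect_map(). The map name, chosen CPU and fallback action below are
 * assumptions following common libbpf conventions.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_CPUMAP);
 *		__uint(max_entries, 64);
 *		__type(key, __u32);
 *		__type(value, struct bpf_cpumap_val);
 *	} cpu_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_redirect_to_cpu(struct xdp_md *ctx)
 *	{
 *		__u32 cpu = 2;	// destination CPU; must hold a populated entry
 *
 *		// Lower bits of flags select the action returned on failure
 *		return bpf_redirect_map(&cpu_map, cpu, XDP_PASS);
 *	}
 *
 * Frames arriving on the chosen CPU are then handled by the kthread
 * implemented below, which builds SKBs and feeds the normal network stack.
 */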

#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
#include <net/hotdata.h>

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#include <linux/netdevice.h>   /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */

/* General idea: XDP packets getting XDP-redirected to another CPU will be
 * stored/queued for at most one driver ->poll() call. It is guaranteed that
 * queueing the frame and the flush operation happen on the same CPU. Thus,
 * the cpu_map_flush operation can deduce via this_cpu_ptr() which queue in
 * bpf_cpu_map_entry contains packets.
 */

#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
struct bpf_cpu_map_entry;
struct bpf_cpu_map;

struct xdp_bulk_queue {
	void *q[CPU_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct bpf_cpu_map_entry *obj;
	unsigned int count;
};

/* Struct for every remote "destination" CPU in map */
struct bpf_cpu_map_entry {
	u32 cpu;    /* kthread CPU and map index */
	int map_id; /* Back reference to map */

	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
	struct xdp_bulk_queue __percpu *bulkq;

	/* Queue with potential multi-producers, and single-consumer kthread */
	struct ptr_ring *queue;
	struct task_struct *kthread;

	struct bpf_cpumap_val value;
	struct bpf_prog *prog;

	struct completion kthread_running;
	struct rcu_work free_work;
};

struct bpf_cpu_map {
	struct bpf_map map;
	/* Below members specific for map type */
	struct bpf_cpu_map_entry __rcu **cpu_map;
};

static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);

static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_cpu_map *cmap;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
	     value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
	    attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	/* Pre-limit array size based on NR_CPUS, not final CPU check */
	if (attr->max_entries > NR_CPUS)
		return ERR_PTR(-E2BIG);

	cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
	if (!cmap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&cmap->map, attr);

	/* Alloc array for possible remote "destination" CPUs */
	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
					   sizeof(struct bpf_cpu_map_entry *),
					   cmap->map.numa_node);
	if (!cmap->cpu_map) {
		bpf_map_area_free(cmap);
		return ERR_PTR(-ENOMEM);
	}

	return &cmap->map;
}

static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
	/* The tear-down procedure should have made sure that the queue is
	 * empty. See __cpu_map_entry_replace() and the work-queue
	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
	 * gracefully and warn once.
	 */
	void *ptr;

	while ((ptr = ptr_ring_consume(ring))) {
		WARN_ON_ONCE(1);
		if (unlikely(__ptr_test_bit(0, &ptr))) {
			__ptr_clear_bit(0, &ptr);
			kfree_skb(ptr);
			continue;
		}
		xdp_return_frame(ptr);
	}
}

static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
				     struct list_head *listp,
				     struct xdp_cpumap_stats *stats)
{
	struct sk_buff *skb, *tmp;
	struct xdp_buff xdp;
	u32 act;
	int err;

	list_for_each_entry_safe(skb, tmp, listp, list) {
		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_REDIRECT:
			skb_list_del_init(skb);
			err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
						      rcpu->prog);
			if (unlikely(err)) {
				kfree_skb(skb);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			return;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(skb->dev, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			skb_list_del_init(skb);
			kfree_skb(skb);
			stats->drop++;
			return;
		}
	}
}

static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
				    void **frames, int n,
				    struct xdp_cpumap_stats *stats)
{
	struct xdp_rxq_info rxq = {};
	struct xdp_buff xdp;
	int i, nframes = 0;

	xdp_set_return_frame_no_direct();
	xdp.rxq = &rxq;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		u32 act;
		int err;

		rxq.dev = xdpf->dev_rx;
		rxq.mem = xdpf->mem;
		/* TODO: report queue_index to xdp_rxq_info */

		xdp_convert_frame_to_buff(xdpf, &xdp);

		act = bpf_prog_run_xdp(rcpu->prog, &xdp);
		switch (act) {
		case XDP_PASS:
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (err < 0) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				frames[nframes++] = xdpf;
				stats->pass++;
			}
			break;
		case XDP_REDIRECT:
			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
					      rcpu->prog);
			if (unlikely(err)) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			break;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			xdp_return_frame(xdpf);
			stats->drop++;
			break;
		}
	}

	xdp_clear_return_frame_no_direct();

	return nframes;
}

#define CPUMAP_BATCH 8

static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
				int xdp_n, struct xdp_cpumap_stats *stats,
				struct list_head *list)
{
	int nframes;

	if (!rcpu->prog)
		return xdp_n;

	rcu_read_lock_bh();

	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);

	if (stats->redirect)
		xdp_do_flush();

	if (unlikely(!list_empty(list)))
		cpu_map_bpf_prog_run_skb(rcpu, list, stats);

	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */

	return nframes;
}

static int cpu_map_kthread_run(void *data)
{
	struct bpf_cpu_map_entry *rcpu = data;
	unsigned long last_qs = jiffies;

	complete(&rcpu->kthread_running);
	set_current_state(TASK_INTERRUPTIBLE);

	/* When the kthread is told to stop, the rcpu has already been
	 * disconnected from the map, thus no new packets can enter.
	 * Remaining in-flight per-CPU stored packets are flushed to this
	 * queue. Honor the kthread_stop signal, but keep running until the
	 * queue is empty.
	 */
	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
		struct xdp_cpumap_stats stats = {}; /* zero stats */
		unsigned int kmem_alloc_drops = 0, sched = 0;
		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
		int i, n, m, nframes, xdp_n;
		void *frames[CPUMAP_BATCH];
		void *skbs[CPUMAP_BATCH];
		LIST_HEAD(list);

		/* Release CPU reschedule checks */
		if (__ptr_ring_empty(rcpu->queue)) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Recheck to avoid lost wake-up */
			if (__ptr_ring_empty(rcpu->queue)) {
				schedule();
				sched = 1;
				last_qs = jiffies;
			} else {
				__set_current_state(TASK_RUNNING);
			}
		} else {
			rcu_softirq_qs_periodic(last_qs);
			sched = cond_resched();
		}

		/*
		 * The bpf_cpu_map_entry is a single consumer, with this
		 * kthread pinned to its CPU. Lockless access to the ptr_ring
		 * consume side is valid, as no resizing of the queue is
		 * allowed.
		 */
		n = __ptr_ring_consume_batched(rcpu->queue, frames,
					       CPUMAP_BATCH);
		for (i = 0, xdp_n = 0; i < n; i++) {
			void *f = frames[i];
			struct page *page;

			if (unlikely(__ptr_test_bit(0, &f))) {
				struct sk_buff *skb = f;

				__ptr_clear_bit(0, &skb);
				list_add_tail(&skb->list, &list);
				continue;
			}

			frames[xdp_n++] = f;
			page = virt_to_page(f);

			/* Bring the struct page memory area to the current
			 * CPU. It is read by build_skb_around() via
			 * page_is_pfmemalloc(), and written by the
			 * page_frag_free() call when the frame is freed.
			 */
			prefetchw(page);
		}

		/* Support running another XDP prog on this CPU */
		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
		if (nframes) {
			m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						  gfp, nframes, skbs);
			if (unlikely(m == 0)) {
				for (i = 0; i < nframes; i++)
					skbs[i] = NULL; /* effect: xdp_return_frame */
				kmem_alloc_drops += nframes;
			}
		}

		local_bh_disable();
		for (i = 0; i < nframes; i++) {
			struct xdp_frame *xdpf = frames[i];
			struct sk_buff *skb = skbs[i];

			skb = __xdp_build_skb_from_frame(xdpf, skb,
							 xdpf->dev_rx);
			if (!skb) {
				xdp_return_frame(xdpf);
				continue;
			}

			list_add_tail(&skb->list, &list);
		}
		netif_receive_skb_list(&list);

		/* Feedback loop via tracepoint */
		trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
					 sched, &stats);

		local_bh_enable(); /* resched point, may call do_softirq() */
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
				      struct bpf_map *map, int fd)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
	    !bpf_prog_map_compatible(map, prog)) {
		bpf_prog_put(prog);
		return -EINVAL;
	}

	rcpu->value.bpf_prog.id = prog->aux->id;
	rcpu->prog = prog;

	return 0;
}

static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
		      u32 cpu)
{
	int numa, err, i, fd = value->bpf_prog.fd;
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct bpf_cpu_map_entry *rcpu;
	struct xdp_bulk_queue *bq;

	/* Have map->numa_node, but choose node of redirect target CPU */
	numa = cpu_to_node(cpu);

	rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
	if (!rcpu)
		return NULL;

	/* Alloc percpu bulkq */
	rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
					   sizeof(void *), gfp);
	if (!rcpu->bulkq)
		goto free_rcu;

	for_each_possible_cpu(i) {
		bq = per_cpu_ptr(rcpu->bulkq, i);
		bq->obj = rcpu;
	}

	/* Alloc queue */
	rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
					   numa);
	if (!rcpu->queue)
		goto free_bulkq;

	err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
	if (err)
		goto free_queue;

	rcpu->cpu = cpu;
	rcpu->map_id = map->id;
	rcpu->value.qsize = value->qsize;

	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
		goto free_ptr_ring;

	/* Setup kthread */
	init_completion(&rcpu->kthread_running);
	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
					       "cpumap/%d/map:%d", cpu,
					       map->id);
	if (IS_ERR(rcpu->kthread))
		goto free_prog;

	/* Make sure kthread runs on a single CPU */
	kthread_bind(rcpu->kthread, cpu);
	wake_up_process(rcpu->kthread);

	/* Make sure kthread has been running, so kthread_stop() will not
	 * stop the kthread prematurely and all pending frames or skbs
	 * will be handled by the kthread before kthread_stop() returns.
	 */
	wait_for_completion(&rcpu->kthread_running);

	return rcpu;

free_prog:
	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
free_ptr_ring:
	ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
	kfree(rcpu->queue);
free_bulkq:
	free_percpu(rcpu->bulkq);
free_rcu:
	kfree(rcpu);
	return NULL;
}

static void __cpu_map_entry_free(struct work_struct *work)
{
	struct bpf_cpu_map_entry *rcpu;

	/* This cpu_map_entry has been disconnected from the map and one
	 * RCU grace period has elapsed. Thus, XDP cannot queue any
	 * new packets and cannot change/set flush_needed that can
	 * find this entry.
	 */
	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);

	/* kthread_stop will wake_up_process and wait for it to complete.
	 * cpu_map_kthread_run() makes sure the pointer ring is empty
	 * before exiting.
	 */
	kthread_stop(rcpu->kthread);

	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
	/* The queue should be empty at this point */
	__cpu_map_ring_cleanup(rcpu->queue);
	ptr_ring_cleanup(rcpu->queue, NULL);
	kfree(rcpu->queue);
	free_percpu(rcpu->bulkq);
	kfree(rcpu);
}

/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
 * entry is no longer in use before freeing. We use queue_rcu_work() to call
 * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
 * period. This means that (a) all pending enqueue and flush operations have
 * completed (because of the RCU callback), and (b) we are in a workqueue
 * context where we can stop the kthread and wait for it to exit before freeing
 * everything.
 */
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
	struct bpf_cpu_map_entry *old_rcpu;

	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
	if (old_rcpu) {
		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
		queue_rcu_work(system_wq, &old_rcpu->free_work);
	}
}

static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 key_cpu = *(u32 *)key;

	if (key_cpu >= map->max_entries)
		return -EINVAL;

	/* notice caller map_delete_elem() uses rcu_read_lock() */
	__cpu_map_entry_replace(cmap, key_cpu, NULL);
	return 0;
}

static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpumap_val cpumap_value = {};
	struct bpf_cpu_map_entry *rcpu;
	/* Array index key corresponds to CPU number */
	u32 key_cpu = *(u32 *)key;

	memcpy(&cpumap_value, value, map->value_size);

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(key_cpu >= cmap->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;
	if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
		return -EOVERFLOW;

	/* Make sure CPU is a valid possible cpu */
	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
		return -ENODEV;

	if (cpumap_value.qsize == 0) {
		rcpu = NULL; /* Same as deleting */
	} else {
		/* Updating qsize causes re-allocation of the bpf_cpu_map_entry */
		rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
		if (!rcpu)
			return -ENOMEM;
	}
	rcu_read_lock();
	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
	rcu_read_unlock();
	return 0;
}
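
/* A hedged user-space sketch of the update path above (illustrative only, not
 * part of this file): the value written from user space is a struct
 * bpf_cpumap_val whose qsize becomes the per-CPU ptr_ring size (0 behaves like
 * a delete, values above 16384 are rejected), and whose bpf_prog.fd, if > 0,
 * must refer to an XDP program loaded with expected_attach_type
 * BPF_XDP_CPUMAP. Variable names and the qsize value are assumptions.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf.h>
 *
 *	int add_cpumap_entry(int cpumap_fd, __u32 cpu, int cpumap_prog_fd)
 *	{
 *		struct bpf_cpumap_val val = {
 *			.qsize = 2048,			// ptr_ring entries for this CPU
 *			.bpf_prog.fd = cpumap_prog_fd,	// optional; <= 0 means no prog
 *		};
 *
 *		// key is the destination CPU; it must be a possible CPU and
 *		// below the map's max_entries
 *		return bpf_map_update_elem(cpumap_fd, &cpu, &val, BPF_ANY);
 *	}
 */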

static void cpu_map_free(struct bpf_map *map)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the bpf programs (there can be more than one that used this map)
	 * were disconnected from events. Wait for outstanding critical
	 * sections in these programs to complete. synchronize_rcu() below not
	 * only guarantees no further "XDP/bpf-side" reads against
	 * bpf_cpu_map->cpu_map, but also ensures that pending flush operations
	 * (if any) are completed.
	 */
	synchronize_rcu();

	/* The only possible user of bpf_cpu_map_entry is
	 * cpu_map_kthread_run().
	 */
	for (i = 0; i < cmap->map.max_entries; i++) {
		struct bpf_cpu_map_entry *rcpu;

		rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
		if (!rcpu)
			continue;

		/* Stop kthread and cleanup entry directly */
		__cpu_map_entry_free(&rcpu->free_work.work);
	}
	bpf_map_area_free(cmap->cpu_map);
	bpf_map_area_free(cmap);
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpu_map_entry *rcpu;

	if (key >= map->max_entries)
		return NULL;

	rcpu = rcu_dereference_check(cmap->cpu_map[key],
				     rcu_read_lock_bh_held());
	return rcpu;
}

static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map_entry *rcpu =
		__cpu_map_lookup_elem(map, *(u32 *)key);

	return rcpu ? &rcpu->value : NULL;
}

static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= cmap->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == cmap->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
	return __bpf_xdp_redirect_map(map, index, flags, 0,
				      __cpu_map_lookup_elem);
}

static u64 cpu_map_mem_usage(const struct bpf_map *map)
{
	u64 usage = sizeof(struct bpf_cpu_map);

	/* Currently the dynamically allocated elements are not counted */
	usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
	return usage;
}

BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
	.map_meta_equal		= bpf_map_meta_equal,
	.map_alloc		= cpu_map_alloc,
	.map_free		= cpu_map_free,
	.map_delete_elem	= cpu_map_delete_elem,
	.map_update_elem	= cpu_map_update_elem,
	.map_lookup_elem	= cpu_map_lookup_elem,
	.map_get_next_key	= cpu_map_get_next_key,
	.map_check_btf		= map_check_no_btf,
	.map_mem_usage		= cpu_map_mem_usage,
	.map_btf_id		= &cpu_map_btf_ids[0],
	.map_redirect		= cpu_map_redirect,
};

static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
	struct bpf_cpu_map_entry *rcpu = bq->obj;
	unsigned int processed = 0, drops = 0;
	const int to_cpu = rcpu->cpu;
	struct ptr_ring *q;
	int i;

	if (unlikely(!bq->count))
		return;

	q = rcpu->queue;
	spin_lock(&q->producer_lock);

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];
		int err;

		err = __ptr_ring_produce(q, xdpf);
		if (err) {
			drops++;
			xdp_return_frame_rx_napi(xdpf);
		}
		processed++;
	}
	bq->count = 0;
	spin_unlock(&q->producer_lock);

	__list_del_clearprev(&bq->flush_node);

	/* Feedback loop via tracepoints */
	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
		bq_flush_to_queue(bq);

	/* Notice, the xdp_buff/page MUST be queued here, long enough for the
	 * driver code that invoked us to have finished, due to driver
	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
	 *
	 * Thus, the incoming xdp_frame is always queued here (else we race
	 * with another CPU on page-refcnt and the remaining driver code).
	 * Queue time is very short, as the driver will invoke the flush
	 * operation when completing the napi->poll call.
	 */
	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);
}

int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	/* Info needed when constructing SKB on remote CPU */
	xdpf->dev_rx = dev_rx;

	bq_enqueue(rcpu, xdpf);
	return 0;
}

int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
			     struct sk_buff *skb)
{
	int ret;

	__skb_pull(skb, skb->mac_len);
	skb_set_redirected(skb, false);
	__ptr_set_bit(0, &skb);

	ret = ptr_ring_produce(rcpu->queue, skb);
	if (ret < 0)
		goto trace;

	wake_up_process(rcpu->kthread);
trace:
	trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
	return ret;
}

void __cpu_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_flush_to_queue(bq);

		/* If already running, costs spin_lock_irqsave + smp_mb */
		wake_up_process(bq->obj->kthread);
	}
}

#ifdef CONFIG_DEBUG_NET
bool cpu_map_check_flush(void)
{
	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
		return false;
	__cpu_map_flush();
	return true;
}
#endif

static int __init cpu_map_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
	return 0;
}

subsys_initcall(cpu_map_init);