1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io 3 */ 4 5 /* Devmaps primary use is as a backend map for XDP BPF helper call 6 * bpf_redirect_map(). Because XDP is mostly concerned with performance we 7 * spent some effort to ensure the datapath with redirect maps does not use 8 * any locking. This is a quick note on the details. 9 * 10 * We have three possible paths to get into the devmap control plane bpf 11 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall 12 * will invoke an update, delete, or lookup operation. To ensure updates and 13 * deletes appear atomic from the datapath side xchg() is used to modify the 14 * netdev_map array. Then because the datapath does a lookup into the netdev_map 15 * array (read-only) from an RCU critical section we use call_rcu() to wait for 16 * an rcu grace period before free'ing the old data structures. This ensures the 17 * datapath always has a valid copy. However, the datapath does a "flush" 18 * operation that pushes any pending packets in the driver outside the RCU 19 * critical section. Each bpf_dtab_netdev tracks these pending operations using 20 * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed 21 * until all bits are cleared indicating outstanding flush operations have 22 * completed. 23 * 24 * BPF syscalls may race with BPF program calls on any of the update, delete 25 * or lookup operations. As noted above the xchg() operation also keep the 26 * netdev_map consistent in this case. From the devmap side BPF programs 27 * calling into these operations are the same as multiple user space threads 28 * making system calls. 29 * 30 * Finally, any of the above may race with a netdev_unregister notifier. The 31 * unregister notifier must search for net devices in the map structure that 32 * contain a reference to the net device and remove them. This is a two step 33 * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) 34 * check to see if the ifindex is the same as the net_device being removed. 35 * When removing the dev a cmpxchg() is used to ensure the correct dev is 36 * removed, in the case of a concurrent update or delete operation it is 37 * possible that the initially referenced dev is no longer in the map. As the 38 * notifier hook walks the map we know that new dev references can not be 39 * added by the user because core infrastructure ensures dev_get_by_index() 40 * calls will fail at this point. 41 */ 42 #include <linux/bpf.h> 43 #include <net/xdp.h> 44 #include <linux/filter.h> 45 #include <trace/events/xdp.h> 46 47 #define DEV_CREATE_FLAG_MASK \ 48 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 49 50 #define DEV_MAP_BULK_SIZE 16 51 struct xdp_bulk_queue { 52 struct xdp_frame *q[DEV_MAP_BULK_SIZE]; 53 struct net_device *dev_rx; 54 unsigned int count; 55 }; 56 57 struct bpf_dtab_netdev { 58 struct net_device *dev; /* must be first member, due to tracepoint */ 59 struct bpf_dtab *dtab; 60 unsigned int bit; 61 struct xdp_bulk_queue __percpu *bulkq; 62 struct rcu_head rcu; 63 }; 64 65 struct bpf_dtab { 66 struct bpf_map map; 67 struct bpf_dtab_netdev **netdev_map; 68 unsigned long __percpu *flush_needed; 69 struct list_head list; 70 }; 71 72 static DEFINE_SPINLOCK(dev_map_lock); 73 static LIST_HEAD(dev_map_list); 74 75 static u64 dev_map_bitmap_size(const union bpf_attr *attr) 76 { 77 return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); 78 } 79 80 static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 81 { 82 struct bpf_dtab *dtab; 83 int err = -EINVAL; 84 u64 cost; 85 86 if (!capable(CAP_NET_ADMIN)) 87 return ERR_PTR(-EPERM); 88 89 /* check sanity of attributes */ 90 if (attr->max_entries == 0 || attr->key_size != 4 || 91 attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) 92 return ERR_PTR(-EINVAL); 93 94 dtab = kzalloc(sizeof(*dtab), GFP_USER); 95 if (!dtab) 96 return ERR_PTR(-ENOMEM); 97 98 bpf_map_init_from_attr(&dtab->map, attr); 99 100 /* make sure page count doesn't overflow */ 101 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 102 cost += dev_map_bitmap_size(attr) * num_possible_cpus(); 103 104 /* if map size is larger than memlock limit, reject it */ 105 err = bpf_map_charge_init(&dtab->map.memory, cost); 106 if (err) 107 goto free_dtab; 108 109 err = -ENOMEM; 110 111 /* A per cpu bitfield with a bit per possible net device */ 112 dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), 113 __alignof__(unsigned long), 114 GFP_KERNEL | __GFP_NOWARN); 115 if (!dtab->flush_needed) 116 goto free_charge; 117 118 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * 119 sizeof(struct bpf_dtab_netdev *), 120 dtab->map.numa_node); 121 if (!dtab->netdev_map) 122 goto free_charge; 123 124 spin_lock(&dev_map_lock); 125 list_add_tail_rcu(&dtab->list, &dev_map_list); 126 spin_unlock(&dev_map_lock); 127 128 return &dtab->map; 129 free_charge: 130 bpf_map_charge_finish(&dtab->map.memory); 131 free_dtab: 132 free_percpu(dtab->flush_needed); 133 kfree(dtab); 134 return ERR_PTR(err); 135 } 136 137 static void dev_map_free(struct bpf_map *map) 138 { 139 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 140 int i, cpu; 141 142 /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, 143 * so the programs (can be more than one that used this map) were 144 * disconnected from events. Wait for outstanding critical sections in 145 * these programs to complete. The rcu critical section only guarantees 146 * no further reads against netdev_map. It does __not__ ensure pending 147 * flush operations (if any) are complete. 148 */ 149 150 spin_lock(&dev_map_lock); 151 list_del_rcu(&dtab->list); 152 spin_unlock(&dev_map_lock); 153 154 bpf_clear_redirect_map(map); 155 synchronize_rcu(); 156 157 /* Make sure prior __dev_map_entry_free() have completed. */ 158 rcu_barrier(); 159 160 /* To ensure all pending flush operations have completed wait for flush 161 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. 162 * Because the above synchronize_rcu() ensures the map is disconnected 163 * from the program we can assume no new bits will be set. 164 */ 165 for_each_online_cpu(cpu) { 166 unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); 167 168 while (!bitmap_empty(bitmap, dtab->map.max_entries)) 169 cond_resched(); 170 } 171 172 for (i = 0; i < dtab->map.max_entries; i++) { 173 struct bpf_dtab_netdev *dev; 174 175 dev = dtab->netdev_map[i]; 176 if (!dev) 177 continue; 178 179 free_percpu(dev->bulkq); 180 dev_put(dev->dev); 181 kfree(dev); 182 } 183 184 free_percpu(dtab->flush_needed); 185 bpf_map_area_free(dtab->netdev_map); 186 kfree(dtab); 187 } 188 189 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 190 { 191 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 192 u32 index = key ? *(u32 *)key : U32_MAX; 193 u32 *next = next_key; 194 195 if (index >= dtab->map.max_entries) { 196 *next = 0; 197 return 0; 198 } 199 200 if (index == dtab->map.max_entries - 1) 201 return -ENOENT; 202 *next = index + 1; 203 return 0; 204 } 205 206 void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) 207 { 208 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 209 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); 210 211 __set_bit(bit, bitmap); 212 } 213 214 static int bq_xmit_all(struct bpf_dtab_netdev *obj, 215 struct xdp_bulk_queue *bq, u32 flags, 216 bool in_napi_ctx) 217 { 218 struct net_device *dev = obj->dev; 219 int sent = 0, drops = 0, err = 0; 220 int i; 221 222 if (unlikely(!bq->count)) 223 return 0; 224 225 for (i = 0; i < bq->count; i++) { 226 struct xdp_frame *xdpf = bq->q[i]; 227 228 prefetch(xdpf); 229 } 230 231 sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); 232 if (sent < 0) { 233 err = sent; 234 sent = 0; 235 goto error; 236 } 237 drops = bq->count - sent; 238 out: 239 bq->count = 0; 240 241 trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, 242 sent, drops, bq->dev_rx, dev, err); 243 bq->dev_rx = NULL; 244 return 0; 245 error: 246 /* If ndo_xdp_xmit fails with an errno, no frames have been 247 * xmit'ed and it's our responsibility to them free all. 248 */ 249 for (i = 0; i < bq->count; i++) { 250 struct xdp_frame *xdpf = bq->q[i]; 251 252 /* RX path under NAPI protection, can return frames faster */ 253 if (likely(in_napi_ctx)) 254 xdp_return_frame_rx_napi(xdpf); 255 else 256 xdp_return_frame(xdpf); 257 drops++; 258 } 259 goto out; 260 } 261 262 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled 263 * from the driver before returning from its napi->poll() routine. The poll() 264 * routine is called either from busy_poll context or net_rx_action signaled 265 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the 266 * net device can be torn down. On devmap tear down we ensure the ctx bitmap 267 * is zeroed before completing to ensure all flush operations have completed. 268 */ 269 void __dev_map_flush(struct bpf_map *map) 270 { 271 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 272 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); 273 u32 bit; 274 275 rcu_read_lock(); 276 for_each_set_bit(bit, bitmap, map->max_entries) { 277 struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); 278 struct xdp_bulk_queue *bq; 279 280 /* This is possible if the dev entry is removed by user space 281 * between xdp redirect and flush op. 282 */ 283 if (unlikely(!dev)) 284 continue; 285 286 bq = this_cpu_ptr(dev->bulkq); 287 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); 288 289 __clear_bit(bit, bitmap); 290 } 291 rcu_read_unlock(); 292 } 293 294 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or 295 * update happens in parallel here a dev_put wont happen until after reading the 296 * ifindex. 297 */ 298 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) 299 { 300 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 301 struct bpf_dtab_netdev *obj; 302 303 if (key >= map->max_entries) 304 return NULL; 305 306 obj = READ_ONCE(dtab->netdev_map[key]); 307 return obj; 308 } 309 310 /* Runs under RCU-read-side, plus in softirq under NAPI protection. 311 * Thus, safe percpu variable access. 312 */ 313 static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, 314 struct net_device *dev_rx) 315 316 { 317 struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); 318 319 if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) 320 bq_xmit_all(obj, bq, 0, true); 321 322 /* Ingress dev_rx will be the same for all xdp_frame's in 323 * bulk_queue, because bq stored per-CPU and must be flushed 324 * from net_device drivers NAPI func end. 325 */ 326 if (!bq->dev_rx) 327 bq->dev_rx = dev_rx; 328 329 bq->q[bq->count++] = xdpf; 330 return 0; 331 } 332 333 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, 334 struct net_device *dev_rx) 335 { 336 struct net_device *dev = dst->dev; 337 struct xdp_frame *xdpf; 338 int err; 339 340 if (!dev->netdev_ops->ndo_xdp_xmit) 341 return -EOPNOTSUPP; 342 343 err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); 344 if (unlikely(err)) 345 return err; 346 347 xdpf = convert_to_xdp_frame(xdp); 348 if (unlikely(!xdpf)) 349 return -EOVERFLOW; 350 351 return bq_enqueue(dst, xdpf, dev_rx); 352 } 353 354 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, 355 struct bpf_prog *xdp_prog) 356 { 357 int err; 358 359 err = xdp_ok_fwd_dev(dst->dev, skb->len); 360 if (unlikely(err)) 361 return err; 362 skb->dev = dst->dev; 363 generic_xdp_tx(skb, xdp_prog); 364 365 return 0; 366 } 367 368 static void *dev_map_lookup_elem(struct bpf_map *map, void *key) 369 { 370 struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); 371 struct net_device *dev = obj ? obj->dev : NULL; 372 373 return dev ? &dev->ifindex : NULL; 374 } 375 376 static void dev_map_flush_old(struct bpf_dtab_netdev *dev) 377 { 378 if (dev->dev->netdev_ops->ndo_xdp_xmit) { 379 struct xdp_bulk_queue *bq; 380 unsigned long *bitmap; 381 382 int cpu; 383 384 rcu_read_lock(); 385 for_each_online_cpu(cpu) { 386 bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); 387 __clear_bit(dev->bit, bitmap); 388 389 bq = per_cpu_ptr(dev->bulkq, cpu); 390 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); 391 } 392 rcu_read_unlock(); 393 } 394 } 395 396 static void __dev_map_entry_free(struct rcu_head *rcu) 397 { 398 struct bpf_dtab_netdev *dev; 399 400 dev = container_of(rcu, struct bpf_dtab_netdev, rcu); 401 dev_map_flush_old(dev); 402 free_percpu(dev->bulkq); 403 dev_put(dev->dev); 404 kfree(dev); 405 } 406 407 static int dev_map_delete_elem(struct bpf_map *map, void *key) 408 { 409 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 410 struct bpf_dtab_netdev *old_dev; 411 int k = *(u32 *)key; 412 413 if (k >= map->max_entries) 414 return -EINVAL; 415 416 /* Use call_rcu() here to ensure any rcu critical sections have 417 * completed, but this does not guarantee a flush has happened 418 * yet. Because driver side rcu_read_lock/unlock only protects the 419 * running XDP program. However, for pending flush operations the 420 * dev and ctx are stored in another per cpu map. And additionally, 421 * the driver tear down ensures all soft irqs are complete before 422 * removing the net device in the case of dev_put equals zero. 423 */ 424 old_dev = xchg(&dtab->netdev_map[k], NULL); 425 if (old_dev) 426 call_rcu(&old_dev->rcu, __dev_map_entry_free); 427 return 0; 428 } 429 430 static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, 431 u64 map_flags) 432 { 433 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 434 struct net *net = current->nsproxy->net_ns; 435 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 436 struct bpf_dtab_netdev *dev, *old_dev; 437 u32 i = *(u32 *)key; 438 u32 ifindex = *(u32 *)value; 439 440 if (unlikely(map_flags > BPF_EXIST)) 441 return -EINVAL; 442 if (unlikely(i >= dtab->map.max_entries)) 443 return -E2BIG; 444 if (unlikely(map_flags == BPF_NOEXIST)) 445 return -EEXIST; 446 447 if (!ifindex) { 448 dev = NULL; 449 } else { 450 dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); 451 if (!dev) 452 return -ENOMEM; 453 454 dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), 455 sizeof(void *), gfp); 456 if (!dev->bulkq) { 457 kfree(dev); 458 return -ENOMEM; 459 } 460 461 dev->dev = dev_get_by_index(net, ifindex); 462 if (!dev->dev) { 463 free_percpu(dev->bulkq); 464 kfree(dev); 465 return -EINVAL; 466 } 467 468 dev->bit = i; 469 dev->dtab = dtab; 470 } 471 472 /* Use call_rcu() here to ensure rcu critical sections have completed 473 * Remembering the driver side flush operation will happen before the 474 * net device is removed. 475 */ 476 old_dev = xchg(&dtab->netdev_map[i], dev); 477 if (old_dev) 478 call_rcu(&old_dev->rcu, __dev_map_entry_free); 479 480 return 0; 481 } 482 483 const struct bpf_map_ops dev_map_ops = { 484 .map_alloc = dev_map_alloc, 485 .map_free = dev_map_free, 486 .map_get_next_key = dev_map_get_next_key, 487 .map_lookup_elem = dev_map_lookup_elem, 488 .map_update_elem = dev_map_update_elem, 489 .map_delete_elem = dev_map_delete_elem, 490 .map_check_btf = map_check_no_btf, 491 }; 492 493 static int dev_map_notification(struct notifier_block *notifier, 494 ulong event, void *ptr) 495 { 496 struct net_device *netdev = netdev_notifier_info_to_dev(ptr); 497 struct bpf_dtab *dtab; 498 int i; 499 500 switch (event) { 501 case NETDEV_UNREGISTER: 502 /* This rcu_read_lock/unlock pair is needed because 503 * dev_map_list is an RCU list AND to ensure a delete 504 * operation does not free a netdev_map entry while we 505 * are comparing it against the netdev being unregistered. 506 */ 507 rcu_read_lock(); 508 list_for_each_entry_rcu(dtab, &dev_map_list, list) { 509 for (i = 0; i < dtab->map.max_entries; i++) { 510 struct bpf_dtab_netdev *dev, *odev; 511 512 dev = READ_ONCE(dtab->netdev_map[i]); 513 if (!dev || netdev != dev->dev) 514 continue; 515 odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); 516 if (dev == odev) 517 call_rcu(&dev->rcu, 518 __dev_map_entry_free); 519 } 520 } 521 rcu_read_unlock(); 522 break; 523 default: 524 break; 525 } 526 return NOTIFY_OK; 527 } 528 529 static struct notifier_block dev_map_notifier = { 530 .notifier_call = dev_map_notification, 531 }; 532 533 static int __init dev_map_init(void) 534 { 535 /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ 536 BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != 537 offsetof(struct _bpf_dtab_netdev, dev)); 538 register_netdevice_notifier(&dev_map_notifier); 539 return 0; 540 } 541 542 subsys_initcall(dev_map_init); 543