1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Devmem TCP 4 * 5 * Authors: Mina Almasry <almasrymina@google.com> 6 * Willem de Bruijn <willemdebruijn.kernel@gmail.com> 7 * Kaiyuan Zhang <kaiyuanz@google.com 8 */ 9 10 #include <linux/dma-buf.h> 11 #include <linux/genalloc.h> 12 #include <linux/mm.h> 13 #include <linux/netdevice.h> 14 #include <linux/types.h> 15 #include <net/netdev_queues.h> 16 #include <net/netdev_rx_queue.h> 17 #include <net/page_pool/helpers.h> 18 #include <net/page_pool/memory_provider.h> 19 #include <net/sock.h> 20 #include <net/tcp.h> 21 #include <trace/events/page_pool.h> 22 23 #include "devmem.h" 24 #include "mp_dmabuf_devmem.h" 25 #include "page_pool_priv.h" 26 27 /* Device memory support */ 28 29 static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); 30 31 static const struct memory_provider_ops dmabuf_devmem_ops; 32 33 bool net_is_devmem_iov(struct net_iov *niov) 34 { 35 return niov->type == NET_IOV_DMABUF; 36 } 37 38 static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, 39 struct gen_pool_chunk *chunk, 40 void *not_used) 41 { 42 struct dmabuf_genpool_chunk_owner *owner = chunk->owner; 43 44 kvfree(owner->area.niovs); 45 kfree(owner); 46 } 47 48 static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) 49 { 50 struct dmabuf_genpool_chunk_owner *owner; 51 52 owner = net_devmem_iov_to_chunk_owner(niov); 53 return owner->base_dma_addr + 54 ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); 55 } 56 57 static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) 58 { 59 struct net_devmem_dmabuf_binding *binding = 60 container_of(ref, struct net_devmem_dmabuf_binding, ref); 61 62 INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); 63 schedule_work(&binding->unbind_w); 64 } 65 66 void __net_devmem_dmabuf_binding_free(struct work_struct *wq) 67 { 68 struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); 69 70 size_t size, avail; 71 72 gen_pool_for_each_chunk(binding->chunk_pool, 73 net_devmem_dmabuf_free_chunk_owner, NULL); 74 75 size = gen_pool_size(binding->chunk_pool); 76 avail = gen_pool_avail(binding->chunk_pool); 77 78 if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu", 79 size, avail)) 80 gen_pool_destroy(binding->chunk_pool); 81 82 dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, 83 binding->direction); 84 dma_buf_detach(binding->dmabuf, binding->attachment); 85 dma_buf_put(binding->dmabuf); 86 xa_destroy(&binding->bound_rxqs); 87 percpu_ref_exit(&binding->ref); 88 kvfree(binding->tx_vec); 89 kfree(binding); 90 } 91 92 struct net_iov * 93 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) 94 { 95 struct dmabuf_genpool_chunk_owner *owner; 96 unsigned long dma_addr; 97 struct net_iov *niov; 98 ssize_t offset; 99 ssize_t index; 100 101 dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, 102 (void **)&owner); 103 if (!dma_addr) 104 return NULL; 105 106 offset = dma_addr - owner->base_dma_addr; 107 index = offset / PAGE_SIZE; 108 niov = &owner->area.niovs[index]; 109 110 niov->desc.pp_magic = 0; 111 niov->desc.pp = NULL; 112 atomic_long_set(&niov->desc.pp_ref_count, 0); 113 114 return niov; 115 } 116 117 void net_devmem_free_dmabuf(struct net_iov *niov) 118 { 119 struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); 120 unsigned long dma_addr = net_devmem_get_dma_addr(niov); 121 122 if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, 123 PAGE_SIZE))) 124 return; 125 126 gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); 127 } 128 129 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) 130 { 131 struct netdev_rx_queue *rxq; 132 unsigned long xa_idx; 133 unsigned int rxq_idx; 134 135 xa_erase(&net_devmem_dmabuf_bindings, binding->id); 136 137 /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the 138 * erase. 139 */ 140 synchronize_net(); 141 142 if (binding->list.next) 143 list_del(&binding->list); 144 145 xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { 146 const struct pp_memory_provider_params mp_params = { 147 .mp_priv = binding, 148 .mp_ops = &dmabuf_devmem_ops, 149 }; 150 151 rxq_idx = get_netdev_rx_queue_index(rxq); 152 153 __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); 154 } 155 156 percpu_ref_kill(&binding->ref); 157 } 158 159 int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, 160 struct net_devmem_dmabuf_binding *binding, 161 struct netlink_ext_ack *extack) 162 { 163 struct pp_memory_provider_params mp_params = { 164 .mp_priv = binding, 165 .mp_ops = &dmabuf_devmem_ops, 166 }; 167 struct netdev_rx_queue *rxq; 168 u32 xa_idx; 169 int err; 170 171 err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); 172 if (err) 173 return err; 174 175 rxq = __netif_get_rx_queue(dev, rxq_idx); 176 err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, 177 GFP_KERNEL); 178 if (err) 179 goto err_close_rxq; 180 181 return 0; 182 183 err_close_rxq: 184 __net_mp_close_rxq(dev, rxq_idx, &mp_params); 185 return err; 186 } 187 188 struct net_devmem_dmabuf_binding * 189 net_devmem_bind_dmabuf(struct net_device *dev, 190 struct device *dma_dev, 191 enum dma_data_direction direction, 192 unsigned int dmabuf_fd, struct netdev_nl_sock *priv, 193 struct netlink_ext_ack *extack) 194 { 195 struct net_devmem_dmabuf_binding *binding; 196 static u32 id_alloc_next; 197 struct scatterlist *sg; 198 struct dma_buf *dmabuf; 199 unsigned int sg_idx, i; 200 unsigned long virtual; 201 int err; 202 203 if (!dma_dev) { 204 NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); 205 return ERR_PTR(-EOPNOTSUPP); 206 } 207 208 dmabuf = dma_buf_get(dmabuf_fd); 209 if (IS_ERR(dmabuf)) 210 return ERR_CAST(dmabuf); 211 212 binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, 213 dev_to_node(&dev->dev)); 214 if (!binding) { 215 err = -ENOMEM; 216 goto err_put_dmabuf; 217 } 218 219 binding->dev = dev; 220 xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); 221 222 err = percpu_ref_init(&binding->ref, 223 net_devmem_dmabuf_binding_release, 224 0, GFP_KERNEL); 225 if (err < 0) 226 goto err_free_binding; 227 228 mutex_init(&binding->lock); 229 230 binding->dmabuf = dmabuf; 231 binding->direction = direction; 232 233 binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); 234 if (IS_ERR(binding->attachment)) { 235 err = PTR_ERR(binding->attachment); 236 NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); 237 goto err_exit_ref; 238 } 239 240 binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, 241 direction); 242 if (IS_ERR(binding->sgt)) { 243 err = PTR_ERR(binding->sgt); 244 NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); 245 goto err_detach; 246 } 247 248 if (direction == DMA_TO_DEVICE) { 249 binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, 250 sizeof(struct net_iov *), 251 GFP_KERNEL); 252 if (!binding->tx_vec) { 253 err = -ENOMEM; 254 goto err_unmap; 255 } 256 } 257 258 /* For simplicity we expect to make PAGE_SIZE allocations, but the 259 * binding can be much more flexible than that. We may be able to 260 * allocate MTU sized chunks here. Leave that for future work... 261 */ 262 binding->chunk_pool = gen_pool_create(PAGE_SHIFT, 263 dev_to_node(&dev->dev)); 264 if (!binding->chunk_pool) { 265 err = -ENOMEM; 266 goto err_tx_vec; 267 } 268 269 virtual = 0; 270 for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { 271 dma_addr_t dma_addr = sg_dma_address(sg); 272 struct dmabuf_genpool_chunk_owner *owner; 273 size_t len = sg_dma_len(sg); 274 struct net_iov *niov; 275 276 owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, 277 dev_to_node(&dev->dev)); 278 if (!owner) { 279 err = -ENOMEM; 280 goto err_free_chunks; 281 } 282 283 owner->area.base_virtual = virtual; 284 owner->base_dma_addr = dma_addr; 285 owner->area.num_niovs = len / PAGE_SIZE; 286 owner->binding = binding; 287 288 err = gen_pool_add_owner(binding->chunk_pool, dma_addr, 289 dma_addr, len, dev_to_node(&dev->dev), 290 owner); 291 if (err) { 292 kfree(owner); 293 err = -EINVAL; 294 goto err_free_chunks; 295 } 296 297 owner->area.niovs = kvmalloc_array(owner->area.num_niovs, 298 sizeof(*owner->area.niovs), 299 GFP_KERNEL); 300 if (!owner->area.niovs) { 301 err = -ENOMEM; 302 goto err_free_chunks; 303 } 304 305 for (i = 0; i < owner->area.num_niovs; i++) { 306 niov = &owner->area.niovs[i]; 307 niov->type = NET_IOV_DMABUF; 308 niov->owner = &owner->area; 309 page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), 310 net_devmem_get_dma_addr(niov)); 311 if (direction == DMA_TO_DEVICE) 312 binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; 313 } 314 315 virtual += len; 316 } 317 318 err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, 319 binding, xa_limit_32b, &id_alloc_next, 320 GFP_KERNEL); 321 if (err < 0) 322 goto err_free_chunks; 323 324 list_add(&binding->list, &priv->bindings); 325 326 return binding; 327 328 err_free_chunks: 329 gen_pool_for_each_chunk(binding->chunk_pool, 330 net_devmem_dmabuf_free_chunk_owner, NULL); 331 gen_pool_destroy(binding->chunk_pool); 332 err_tx_vec: 333 kvfree(binding->tx_vec); 334 err_unmap: 335 dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, 336 direction); 337 err_detach: 338 dma_buf_detach(dmabuf, binding->attachment); 339 err_exit_ref: 340 percpu_ref_exit(&binding->ref); 341 err_free_binding: 342 kfree(binding); 343 err_put_dmabuf: 344 dma_buf_put(dmabuf); 345 return ERR_PTR(err); 346 } 347 348 struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) 349 { 350 struct net_devmem_dmabuf_binding *binding; 351 352 rcu_read_lock(); 353 binding = xa_load(&net_devmem_dmabuf_bindings, id); 354 if (binding) { 355 if (!net_devmem_dmabuf_binding_get(binding)) 356 binding = NULL; 357 } 358 rcu_read_unlock(); 359 360 return binding; 361 } 362 363 void net_devmem_get_net_iov(struct net_iov *niov) 364 { 365 net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); 366 } 367 368 void net_devmem_put_net_iov(struct net_iov *niov) 369 { 370 net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); 371 } 372 373 struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, 374 unsigned int dmabuf_id) 375 { 376 struct net_devmem_dmabuf_binding *binding; 377 struct net_device *dst_dev; 378 struct dst_entry *dst; 379 int err = 0; 380 381 binding = net_devmem_lookup_dmabuf(dmabuf_id); 382 if (!binding || !binding->tx_vec) { 383 err = -EINVAL; 384 goto out_err; 385 } 386 387 rcu_read_lock(); 388 dst = __sk_dst_get(sk); 389 /* If dst is NULL (route expired), attempt to rebuild it. */ 390 if (unlikely(!dst)) { 391 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { 392 err = -EHOSTUNREACH; 393 goto out_unlock; 394 } 395 dst = __sk_dst_get(sk); 396 if (unlikely(!dst)) { 397 err = -ENODEV; 398 goto out_unlock; 399 } 400 } 401 402 /* The dma-addrs in this binding are only reachable to the corresponding 403 * net_device. 404 */ 405 dst_dev = dst_dev_rcu(dst); 406 if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { 407 err = -ENODEV; 408 goto out_unlock; 409 } 410 411 rcu_read_unlock(); 412 return binding; 413 414 out_unlock: 415 rcu_read_unlock(); 416 out_err: 417 if (binding) 418 net_devmem_dmabuf_binding_put(binding); 419 420 return ERR_PTR(err); 421 } 422 423 struct net_iov * 424 net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, 425 size_t virt_addr, size_t *off, size_t *size) 426 { 427 if (virt_addr >= binding->dmabuf->size) 428 return NULL; 429 430 *off = virt_addr % PAGE_SIZE; 431 *size = PAGE_SIZE - *off; 432 433 return binding->tx_vec[virt_addr / PAGE_SIZE]; 434 } 435 436 /*** "Dmabuf devmem memory provider" ***/ 437 438 int mp_dmabuf_devmem_init(struct page_pool *pool) 439 { 440 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 441 442 if (!binding) 443 return -EINVAL; 444 445 /* dma-buf dma addresses do not need and should not be used with 446 * dma_sync_for_cpu/device. Force disable dma_sync. 447 */ 448 pool->dma_sync = false; 449 pool->dma_sync_for_cpu = false; 450 451 if (pool->p.order != 0) 452 return -E2BIG; 453 454 net_devmem_dmabuf_binding_get(binding); 455 return 0; 456 } 457 458 netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) 459 { 460 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 461 struct net_iov *niov; 462 netmem_ref netmem; 463 464 niov = net_devmem_alloc_dmabuf(binding); 465 if (!niov) 466 return 0; 467 468 netmem = net_iov_to_netmem(niov); 469 470 page_pool_set_pp_info(pool, netmem); 471 472 pool->pages_state_hold_cnt++; 473 trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); 474 return netmem; 475 } 476 477 void mp_dmabuf_devmem_destroy(struct page_pool *pool) 478 { 479 struct net_devmem_dmabuf_binding *binding = pool->mp_priv; 480 481 net_devmem_dmabuf_binding_put(binding); 482 } 483 484 bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) 485 { 486 long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); 487 488 if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 489 return false; 490 491 if (WARN_ON_ONCE(refcount != 1)) 492 return false; 493 494 page_pool_clear_pp_info(netmem); 495 496 net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); 497 498 /* We don't want the page pool put_page()ing our net_iovs. */ 499 return false; 500 } 501 502 static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, 503 struct netdev_rx_queue *rxq) 504 { 505 const struct net_devmem_dmabuf_binding *binding = mp_priv; 506 int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; 507 508 return nla_put_u32(rsp, type, binding->id); 509 } 510 511 static void mp_dmabuf_devmem_uninstall(void *mp_priv, 512 struct netdev_rx_queue *rxq) 513 { 514 struct net_devmem_dmabuf_binding *binding = mp_priv; 515 struct netdev_rx_queue *bound_rxq; 516 unsigned long xa_idx; 517 518 xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { 519 if (bound_rxq == rxq) { 520 xa_erase(&binding->bound_rxqs, xa_idx); 521 if (xa_empty(&binding->bound_rxqs)) { 522 mutex_lock(&binding->lock); 523 binding->dev = NULL; 524 mutex_unlock(&binding->lock); 525 } 526 break; 527 } 528 } 529 } 530 531 static const struct memory_provider_ops dmabuf_devmem_ops = { 532 .init = mp_dmabuf_devmem_init, 533 .destroy = mp_dmabuf_devmem_destroy, 534 .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, 535 .release_netmem = mp_dmabuf_devmem_release_page, 536 .nl_fill = mp_dmabuf_devmem_nl_fill, 537 .uninstall = mp_dmabuf_devmem_uninstall, 538 }; 539