// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Devmem TCP
 *
 *	Authors:	Mina Almasry <almasrymina@google.com>
 *			Willem de Bruijn <willemdebruijn.kernel@gmail.com>
 *			Kaiyuan Zhang <kaiyuanz@google.com>
 */

#include <linux/dma-buf.h>
#include <linux/ethtool_netlink.h>
#include <linux/genalloc.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/types.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <trace/events/page_pool.h>

#include "devmem.h"
#include "mp_dmabuf_devmem.h"
#include "page_pool_priv.h"

/* Device memory support */

/* Protected by rtnl_lock() */
static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);

static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
					       struct gen_pool_chunk *chunk,
					       void *not_used)
{
	struct dmabuf_genpool_chunk_owner *owner = chunk->owner;

	kvfree(owner->niovs);
	kfree(owner);
}

static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
{
	struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);

	return owner->base_dma_addr +
	       ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}

void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
{
	size_t size, avail;

	gen_pool_for_each_chunk(binding->chunk_pool,
				net_devmem_dmabuf_free_chunk_owner, NULL);

	size = gen_pool_size(binding->chunk_pool);
	avail = gen_pool_avail(binding->chunk_pool);

	if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
		  size, avail))
		gen_pool_destroy(binding->chunk_pool);

	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
					  DMA_FROM_DEVICE);
	dma_buf_detach(binding->dmabuf, binding->attachment);
	dma_buf_put(binding->dmabuf);
	xa_destroy(&binding->bound_rxqs);
	kfree(binding);
}

struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
	struct dmabuf_genpool_chunk_owner *owner;
	unsigned long dma_addr;
	struct net_iov *niov;
	ssize_t offset;
	ssize_t index;

	dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
					(void **)&owner);
	if (!dma_addr)
		return NULL;

	offset = dma_addr - owner->base_dma_addr;
	index = offset / PAGE_SIZE;
	niov = &owner->niovs[index];

	niov->pp_magic = 0;
	niov->pp = NULL;
	atomic_long_set(&niov->pp_ref_count, 0);

	return niov;
}

void net_devmem_free_dmabuf(struct net_iov *niov)
{
	struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov);
	unsigned long dma_addr = net_devmem_get_dma_addr(niov);

	if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
				       PAGE_SIZE)))
		return;

	gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
}

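/* Tear down a binding: unlink it from its owner's list (list_del() is skipped
 * if the binding was never added to one), clear and restart every RX queue it
 * was bound to, drop it from the global binding xarray, and release the
 * reference taken at bind time.
 */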
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
	struct netdev_rx_queue *rxq;
	unsigned long xa_idx;
	unsigned int rxq_idx;
	int err;

	if (binding->list.next)
		list_del(&binding->list);

	xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
		WARN_ON(rxq->mp_params.mp_priv != binding);

		rxq->mp_params.mp_priv = NULL;

		rxq_idx = get_netdev_rx_queue_index(rxq);

		err = netdev_rx_queue_restart(binding->dev, rxq_idx);
		WARN_ON(err && err != -ENETDOWN);
	}

	xa_erase(&net_devmem_dmabuf_bindings, binding->id);

	net_devmem_dmabuf_binding_put(binding);
}

int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
				    struct net_devmem_dmabuf_binding *binding,
				    struct netlink_ext_ack *extack)
{
	struct netdev_rx_queue *rxq;
	u32 xa_idx;
	int err;

	if (rxq_idx >= dev->real_num_rx_queues) {
		NL_SET_ERR_MSG(extack, "rx queue index out of range");
		return -ERANGE;
	}

	if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
		NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
		return -EINVAL;
	}

	if (dev->cfg->hds_thresh) {
		NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
		return -EINVAL;
	}

	rxq = __netif_get_rx_queue(dev, rxq_idx);
	if (rxq->mp_params.mp_priv) {
		NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
		return -EEXIST;
	}

#ifdef CONFIG_XDP_SOCKETS
	if (rxq->pool) {
		NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
		return -EBUSY;
	}
#endif

	err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
		       GFP_KERNEL);
	if (err)
		return err;

	rxq->mp_params.mp_priv = binding;

	err = netdev_rx_queue_restart(dev, rxq_idx);
	if (err)
		goto err_xa_erase;

	return 0;

err_xa_erase:
	rxq->mp_params.mp_priv = NULL;
	xa_erase(&binding->bound_rxqs, xa_idx);

	return err;
}

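/* Create a binding for the dma-buf referred to by @dmabuf_fd: attach it to
 * @dev's parent device and map it for DMA, then carve the resulting
 * scatterlist into a genpool of device addresses with one net_iov per
 * PAGE_SIZE piece of the buffer. The binding is returned with a single
 * reference held.
 */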
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
		       struct netlink_ext_ack *extack)
{
	struct net_devmem_dmabuf_binding *binding;
	static u32 id_alloc_next;
	struct scatterlist *sg;
	struct dma_buf *dmabuf;
	unsigned int sg_idx, i;
	unsigned long virtual;
	int err;

	dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(dmabuf))
		return ERR_CAST(dmabuf);

	binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
			       dev_to_node(&dev->dev));
	if (!binding) {
		err = -ENOMEM;
		goto err_put_dmabuf;
	}

	binding->dev = dev;

	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
			      binding, xa_limit_32b, &id_alloc_next,
			      GFP_KERNEL);
	if (err < 0)
		goto err_free_binding;

	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);

	refcount_set(&binding->ref, 1);

	binding->dmabuf = dmabuf;

	binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
	if (IS_ERR(binding->attachment)) {
		err = PTR_ERR(binding->attachment);
		NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
		goto err_free_id;
	}

	binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
						       DMA_FROM_DEVICE);
	if (IS_ERR(binding->sgt)) {
		err = PTR_ERR(binding->sgt);
		NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
		goto err_detach;
	}

	/* For simplicity we expect to make PAGE_SIZE allocations, but the
	 * binding can be much more flexible than that. We may be able to
	 * allocate MTU sized chunks here. Leave that for future work...
	 */
	binding->chunk_pool =
		gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
	if (!binding->chunk_pool) {
		err = -ENOMEM;
		goto err_unmap;
	}

	virtual = 0;
	for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
		dma_addr_t dma_addr = sg_dma_address(sg);
		struct dmabuf_genpool_chunk_owner *owner;
		size_t len = sg_dma_len(sg);
		struct net_iov *niov;

		owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
				     dev_to_node(&dev->dev));
		if (!owner) {
			err = -ENOMEM;
			goto err_free_chunks;
		}

		owner->base_virtual = virtual;
		owner->base_dma_addr = dma_addr;
		owner->num_niovs = len / PAGE_SIZE;
		owner->binding = binding;

		err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
					 dma_addr, len, dev_to_node(&dev->dev),
					 owner);
		if (err) {
			kfree(owner);
			err = -EINVAL;
			goto err_free_chunks;
		}

		owner->niovs = kvmalloc_array(owner->num_niovs,
					      sizeof(*owner->niovs),
					      GFP_KERNEL);
		if (!owner->niovs) {
			err = -ENOMEM;
			goto err_free_chunks;
		}

		for (i = 0; i < owner->num_niovs; i++) {
			niov = &owner->niovs[i];
			niov->owner = owner;
			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
						      net_devmem_get_dma_addr(niov));
		}

		virtual += len;
	}

	return binding;

err_free_chunks:
	gen_pool_for_each_chunk(binding->chunk_pool,
				net_devmem_dmabuf_free_chunk_owner, NULL);
	gen_pool_destroy(binding->chunk_pool);
err_unmap:
	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
					  DMA_FROM_DEVICE);
err_detach:
	dma_buf_detach(dmabuf, binding->attachment);
err_free_id:
	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
	kfree(binding);
err_put_dmabuf:
	dma_buf_put(dmabuf);
	return ERR_PTR(err);
}

void dev_dmabuf_uninstall(struct net_device *dev)
{
	struct net_devmem_dmabuf_binding *binding;
	struct netdev_rx_queue *rxq;
	unsigned long xa_idx;
	unsigned int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		binding = dev->_rx[i].mp_params.mp_priv;
		if (!binding)
			continue;

		xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
			if (rxq == &dev->_rx[i]) {
				xa_erase(&binding->bound_rxqs, xa_idx);
				break;
			}
	}
}

/*** "Dmabuf devmem memory provider" ***/

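/* Memory provider init hook, run when a page_pool is created for an RX queue
 * that has a dmabuf binding installed (pool->mp_priv). Force-disable DMA
 * syncing, reject higher-order pools, and hold a binding reference for the
 * pool's lifetime; the reference is dropped in mp_dmabuf_devmem_destroy().
 */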
int mp_dmabuf_devmem_init(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	if (!binding)
		return -EINVAL;

	/* dma-buf dma addresses do not need and should not be used with
	 * dma_sync_for_cpu/device. Force disable dma_sync.
	 */
	pool->dma_sync = false;
	pool->dma_sync_for_cpu = false;

	if (pool->p.order != 0)
		return -E2BIG;

	net_devmem_dmabuf_binding_get(binding);
	return 0;
}

netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
	struct net_iov *niov;
	netmem_ref netmem;

	niov = net_devmem_alloc_dmabuf(binding);
	if (!niov)
		return 0;

	netmem = net_iov_to_netmem(niov);

	page_pool_set_pp_info(pool, netmem);

	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
	return netmem;
}

void mp_dmabuf_devmem_destroy(struct page_pool *pool)
{
	struct net_devmem_dmabuf_binding *binding = pool->mp_priv;

	net_devmem_dmabuf_binding_put(binding);
}

bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
{
	long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	if (WARN_ON_ONCE(refcount != 1))
		return false;

	page_pool_clear_pp_info(netmem);

	net_devmem_free_dmabuf(netmem_to_net_iov(netmem));

	/* We don't want the page pool put_page()ing our net_iovs. */
	return false;
}