1 /**************************************************************************** 2 * Driver for Solarflare Solarstorm network controllers and boards 3 * Copyright 2005-2006 Fen Systems Ltd. 4 * Copyright 2005-2010 Solarflare Communications Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 as published 8 * by the Free Software Foundation, incorporated herein by reference. 9 */ 10 11 #include <linux/pci.h> 12 #include <linux/tcp.h> 13 #include <linux/ip.h> 14 #include <linux/in.h> 15 #include <linux/ipv6.h> 16 #include <linux/slab.h> 17 #include <net/ipv6.h> 18 #include <linux/if_ether.h> 19 #include <linux/highmem.h> 20 #include "net_driver.h" 21 #include "efx.h" 22 #include "nic.h" 23 #include "workarounds.h" 24 25 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue, 26 struct efx_tx_buffer *buffer, 27 unsigned int *pkts_compl, 28 unsigned int *bytes_compl) 29 { 30 if (buffer->unmap_len) { 31 struct device *dma_dev = &tx_queue->efx->pci_dev->dev; 32 dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len - 33 buffer->unmap_len); 34 if (buffer->flags & EFX_TX_BUF_MAP_SINGLE) 35 dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len, 36 DMA_TO_DEVICE); 37 else 38 dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len, 39 DMA_TO_DEVICE); 40 buffer->unmap_len = 0; 41 } 42 43 if (buffer->flags & EFX_TX_BUF_SKB) { 44 (*pkts_compl)++; 45 (*bytes_compl) += buffer->skb->len; 46 dev_kfree_skb_any((struct sk_buff *) buffer->skb); 47 netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev, 48 "TX queue %d transmission id %x complete\n", 49 tx_queue->queue, tx_queue->read_count); 50 } else if (buffer->flags & EFX_TX_BUF_HEAP) { 51 kfree(buffer->heap_buf); 52 } 53 54 buffer->len = 0; 55 buffer->flags = 0; 56 } 57 58 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, 59 struct sk_buff *skb); 60 61 static inline unsigned 62 efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr) 63 { 64 /* Depending on the NIC revision, we can use descriptor 65 * lengths up to 8K or 8K-1. However, since PCI Express 66 * devices must split read requests at 4K boundaries, there is 67 * little benefit from using descriptors that cross those 68 * boundaries and we keep things simple by not doing so. 69 */ 70 unsigned len = (~dma_addr & (EFX_PAGE_SIZE - 1)) + 1; 71 72 /* Work around hardware bug for unaligned buffers. */ 73 if (EFX_WORKAROUND_5391(efx) && (dma_addr & 0xf)) 74 len = min_t(unsigned, len, 512 - (dma_addr & 0xf)); 75 76 return len; 77 } 78 79 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) 80 { 81 /* Header and payload descriptor for each output segment, plus 82 * one for every input fragment boundary within a segment 83 */ 84 unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS; 85 86 /* Possibly one more per segment for the alignment workaround */ 87 if (EFX_WORKAROUND_5391(efx)) 88 max_descs += EFX_TSO_MAX_SEGS; 89 90 /* Possibly more for PCIe page boundaries within input fragments */ 91 if (PAGE_SIZE > EFX_PAGE_SIZE) 92 max_descs += max_t(unsigned int, MAX_SKB_FRAGS, 93 DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); 94 95 return max_descs; 96 } 97 98 /* Get partner of a TX queue, seen as part of the same net core queue */ 99 static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue) 100 { 101 if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) 102 return tx_queue - EFX_TXQ_TYPE_OFFLOAD; 103 else 104 return tx_queue + EFX_TXQ_TYPE_OFFLOAD; 105 } 106 107 static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1) 108 { 109 /* We need to consider both queues that the net core sees as one */ 110 struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1); 111 struct efx_nic *efx = txq1->efx; 112 unsigned int fill_level; 113 114 fill_level = max(txq1->insert_count - txq1->old_read_count, 115 txq2->insert_count - txq2->old_read_count); 116 if (likely(fill_level < efx->txq_stop_thresh)) 117 return; 118 119 /* We used the stale old_read_count above, which gives us a 120 * pessimistic estimate of the fill level (which may even 121 * validly be >= efx->txq_entries). Now try again using 122 * read_count (more likely to be a cache miss). 123 * 124 * If we read read_count and then conditionally stop the 125 * queue, it is possible for the completion path to race with 126 * us and complete all outstanding descriptors in the middle, 127 * after which there will be no more completions to wake it. 128 * Therefore we stop the queue first, then read read_count 129 * (with a memory barrier to ensure the ordering), then 130 * restart the queue if the fill level turns out to be low 131 * enough. 132 */ 133 netif_tx_stop_queue(txq1->core_txq); 134 smp_mb(); 135 txq1->old_read_count = ACCESS_ONCE(txq1->read_count); 136 txq2->old_read_count = ACCESS_ONCE(txq2->read_count); 137 138 fill_level = max(txq1->insert_count - txq1->old_read_count, 139 txq2->insert_count - txq2->old_read_count); 140 EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries); 141 if (likely(fill_level < efx->txq_stop_thresh)) { 142 smp_mb(); 143 if (likely(!efx->loopback_selftest)) 144 netif_tx_start_queue(txq1->core_txq); 145 } 146 } 147 148 /* 149 * Add a socket buffer to a TX queue 150 * 151 * This maps all fragments of a socket buffer for DMA and adds them to 152 * the TX queue. The queue's insert pointer will be incremented by 153 * the number of fragments in the socket buffer. 154 * 155 * If any DMA mapping fails, any mapped fragments will be unmapped, 156 * the queue's insert pointer will be restored to its original value. 157 * 158 * This function is split out from efx_hard_start_xmit to allow the 159 * loopback test to direct packets via specific TX queues. 160 * 161 * Returns NETDEV_TX_OK. 162 * You must hold netif_tx_lock() to call this function. 163 */ 164 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb) 165 { 166 struct efx_nic *efx = tx_queue->efx; 167 struct device *dma_dev = &efx->pci_dev->dev; 168 struct efx_tx_buffer *buffer; 169 skb_frag_t *fragment; 170 unsigned int len, unmap_len = 0, insert_ptr; 171 dma_addr_t dma_addr, unmap_addr = 0; 172 unsigned int dma_len; 173 unsigned short dma_flags; 174 int i = 0; 175 176 EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count); 177 178 if (skb_shinfo(skb)->gso_size) 179 return efx_enqueue_skb_tso(tx_queue, skb); 180 181 /* Get size of the initial fragment */ 182 len = skb_headlen(skb); 183 184 /* Pad if necessary */ 185 if (EFX_WORKAROUND_15592(efx) && skb->len <= 32) { 186 EFX_BUG_ON_PARANOID(skb->data_len); 187 len = 32 + 1; 188 if (skb_pad(skb, len - skb->len)) 189 return NETDEV_TX_OK; 190 } 191 192 /* Map for DMA. Use dma_map_single rather than dma_map_page 193 * since this is more efficient on machines with sparse 194 * memory. 195 */ 196 dma_flags = EFX_TX_BUF_MAP_SINGLE; 197 dma_addr = dma_map_single(dma_dev, skb->data, len, PCI_DMA_TODEVICE); 198 199 /* Process all fragments */ 200 while (1) { 201 if (unlikely(dma_mapping_error(dma_dev, dma_addr))) 202 goto dma_err; 203 204 /* Store fields for marking in the per-fragment final 205 * descriptor */ 206 unmap_len = len; 207 unmap_addr = dma_addr; 208 209 /* Add to TX queue, splitting across DMA boundaries */ 210 do { 211 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 212 buffer = &tx_queue->buffer[insert_ptr]; 213 EFX_BUG_ON_PARANOID(buffer->flags); 214 EFX_BUG_ON_PARANOID(buffer->len); 215 EFX_BUG_ON_PARANOID(buffer->unmap_len); 216 217 dma_len = efx_max_tx_len(efx, dma_addr); 218 if (likely(dma_len >= len)) 219 dma_len = len; 220 221 /* Fill out per descriptor fields */ 222 buffer->len = dma_len; 223 buffer->dma_addr = dma_addr; 224 buffer->flags = EFX_TX_BUF_CONT; 225 len -= dma_len; 226 dma_addr += dma_len; 227 ++tx_queue->insert_count; 228 } while (len); 229 230 /* Transfer ownership of the unmapping to the final buffer */ 231 buffer->flags = EFX_TX_BUF_CONT | dma_flags; 232 buffer->unmap_len = unmap_len; 233 unmap_len = 0; 234 235 /* Get address and size of next fragment */ 236 if (i >= skb_shinfo(skb)->nr_frags) 237 break; 238 fragment = &skb_shinfo(skb)->frags[i]; 239 len = skb_frag_size(fragment); 240 i++; 241 /* Map for DMA */ 242 dma_flags = 0; 243 dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len, 244 DMA_TO_DEVICE); 245 } 246 247 /* Transfer ownership of the skb to the final buffer */ 248 buffer->skb = skb; 249 buffer->flags = EFX_TX_BUF_SKB | dma_flags; 250 251 netdev_tx_sent_queue(tx_queue->core_txq, skb->len); 252 253 /* Pass off to hardware */ 254 efx_nic_push_buffers(tx_queue); 255 256 efx_tx_maybe_stop_queue(tx_queue); 257 258 return NETDEV_TX_OK; 259 260 dma_err: 261 netif_err(efx, tx_err, efx->net_dev, 262 " TX queue %d could not map skb with %d bytes %d " 263 "fragments for DMA\n", tx_queue->queue, skb->len, 264 skb_shinfo(skb)->nr_frags + 1); 265 266 /* Mark the packet as transmitted, and free the SKB ourselves */ 267 dev_kfree_skb_any(skb); 268 269 /* Work backwards until we hit the original insert pointer value */ 270 while (tx_queue->insert_count != tx_queue->write_count) { 271 unsigned int pkts_compl = 0, bytes_compl = 0; 272 --tx_queue->insert_count; 273 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 274 buffer = &tx_queue->buffer[insert_ptr]; 275 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); 276 } 277 278 /* Free the fragment we were mid-way through pushing */ 279 if (unmap_len) { 280 if (dma_flags & EFX_TX_BUF_MAP_SINGLE) 281 dma_unmap_single(dma_dev, unmap_addr, unmap_len, 282 DMA_TO_DEVICE); 283 else 284 dma_unmap_page(dma_dev, unmap_addr, unmap_len, 285 DMA_TO_DEVICE); 286 } 287 288 return NETDEV_TX_OK; 289 } 290 291 /* Remove packets from the TX queue 292 * 293 * This removes packets from the TX queue, up to and including the 294 * specified index. 295 */ 296 static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue, 297 unsigned int index, 298 unsigned int *pkts_compl, 299 unsigned int *bytes_compl) 300 { 301 struct efx_nic *efx = tx_queue->efx; 302 unsigned int stop_index, read_ptr; 303 304 stop_index = (index + 1) & tx_queue->ptr_mask; 305 read_ptr = tx_queue->read_count & tx_queue->ptr_mask; 306 307 while (read_ptr != stop_index) { 308 struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr]; 309 if (unlikely(buffer->len == 0)) { 310 netif_err(efx, tx_err, efx->net_dev, 311 "TX queue %d spurious TX completion id %x\n", 312 tx_queue->queue, read_ptr); 313 efx_schedule_reset(efx, RESET_TYPE_TX_SKIP); 314 return; 315 } 316 317 efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl); 318 319 ++tx_queue->read_count; 320 read_ptr = tx_queue->read_count & tx_queue->ptr_mask; 321 } 322 } 323 324 /* Initiate a packet transmission. We use one channel per CPU 325 * (sharing when we have more CPUs than channels). On Falcon, the TX 326 * completion events will be directed back to the CPU that transmitted 327 * the packet, which should be cache-efficient. 328 * 329 * Context: non-blocking. 330 * Note that returning anything other than NETDEV_TX_OK will cause the 331 * OS to free the skb. 332 */ 333 netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, 334 struct net_device *net_dev) 335 { 336 struct efx_nic *efx = netdev_priv(net_dev); 337 struct efx_tx_queue *tx_queue; 338 unsigned index, type; 339 340 EFX_WARN_ON_PARANOID(!netif_device_present(net_dev)); 341 342 /* PTP "event" packet */ 343 if (unlikely(efx_xmit_with_hwtstamp(skb)) && 344 unlikely(efx_ptp_is_ptp_tx(efx, skb))) { 345 return efx_ptp_tx(efx, skb); 346 } 347 348 index = skb_get_queue_mapping(skb); 349 type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0; 350 if (index >= efx->n_tx_channels) { 351 index -= efx->n_tx_channels; 352 type |= EFX_TXQ_TYPE_HIGHPRI; 353 } 354 tx_queue = efx_get_tx_queue(efx, index, type); 355 356 return efx_enqueue_skb(tx_queue, skb); 357 } 358 359 void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue) 360 { 361 struct efx_nic *efx = tx_queue->efx; 362 363 /* Must be inverse of queue lookup in efx_hard_start_xmit() */ 364 tx_queue->core_txq = 365 netdev_get_tx_queue(efx->net_dev, 366 tx_queue->queue / EFX_TXQ_TYPES + 367 ((tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI) ? 368 efx->n_tx_channels : 0)); 369 } 370 371 int efx_setup_tc(struct net_device *net_dev, u8 num_tc) 372 { 373 struct efx_nic *efx = netdev_priv(net_dev); 374 struct efx_channel *channel; 375 struct efx_tx_queue *tx_queue; 376 unsigned tc; 377 int rc; 378 379 if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC) 380 return -EINVAL; 381 382 if (num_tc == net_dev->num_tc) 383 return 0; 384 385 for (tc = 0; tc < num_tc; tc++) { 386 net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels; 387 net_dev->tc_to_txq[tc].count = efx->n_tx_channels; 388 } 389 390 if (num_tc > net_dev->num_tc) { 391 /* Initialise high-priority queues as necessary */ 392 efx_for_each_channel(channel, efx) { 393 efx_for_each_possible_channel_tx_queue(tx_queue, 394 channel) { 395 if (!(tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI)) 396 continue; 397 if (!tx_queue->buffer) { 398 rc = efx_probe_tx_queue(tx_queue); 399 if (rc) 400 return rc; 401 } 402 if (!tx_queue->initialised) 403 efx_init_tx_queue(tx_queue); 404 efx_init_tx_queue_core_txq(tx_queue); 405 } 406 } 407 } else { 408 /* Reduce number of classes before number of queues */ 409 net_dev->num_tc = num_tc; 410 } 411 412 rc = netif_set_real_num_tx_queues(net_dev, 413 max_t(int, num_tc, 1) * 414 efx->n_tx_channels); 415 if (rc) 416 return rc; 417 418 /* Do not destroy high-priority queues when they become 419 * unused. We would have to flush them first, and it is 420 * fairly difficult to flush a subset of TX queues. Leave 421 * it to efx_fini_channels(). 422 */ 423 424 net_dev->num_tc = num_tc; 425 return 0; 426 } 427 428 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index) 429 { 430 unsigned fill_level; 431 struct efx_nic *efx = tx_queue->efx; 432 struct efx_tx_queue *txq2; 433 unsigned int pkts_compl = 0, bytes_compl = 0; 434 435 EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask); 436 437 efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl); 438 netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl); 439 440 /* See if we need to restart the netif queue. This memory 441 * barrier ensures that we write read_count (inside 442 * efx_dequeue_buffers()) before reading the queue status. 443 */ 444 smp_mb(); 445 if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) && 446 likely(efx->port_enabled) && 447 likely(netif_device_present(efx->net_dev))) { 448 txq2 = efx_tx_queue_partner(tx_queue); 449 fill_level = max(tx_queue->insert_count - tx_queue->read_count, 450 txq2->insert_count - txq2->read_count); 451 if (fill_level <= efx->txq_wake_thresh) 452 netif_tx_wake_queue(tx_queue->core_txq); 453 } 454 455 /* Check whether the hardware queue is now empty */ 456 if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) { 457 tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count); 458 if (tx_queue->read_count == tx_queue->old_write_count) { 459 smp_mb(); 460 tx_queue->empty_read_count = 461 tx_queue->read_count | EFX_EMPTY_COUNT_VALID; 462 } 463 } 464 } 465 466 /* Size of page-based TSO header buffers. Larger blocks must be 467 * allocated from the heap. 468 */ 469 #define TSOH_STD_SIZE 128 470 #define TSOH_PER_PAGE (PAGE_SIZE / TSOH_STD_SIZE) 471 472 /* At most half the descriptors in the queue at any time will refer to 473 * a TSO header buffer, since they must always be followed by a 474 * payload descriptor referring to an skb. 475 */ 476 static unsigned int efx_tsoh_page_count(struct efx_tx_queue *tx_queue) 477 { 478 return DIV_ROUND_UP(tx_queue->ptr_mask + 1, 2 * TSOH_PER_PAGE); 479 } 480 481 int efx_probe_tx_queue(struct efx_tx_queue *tx_queue) 482 { 483 struct efx_nic *efx = tx_queue->efx; 484 unsigned int entries; 485 int rc; 486 487 /* Create the smallest power-of-two aligned ring */ 488 entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE); 489 EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE); 490 tx_queue->ptr_mask = entries - 1; 491 492 netif_dbg(efx, probe, efx->net_dev, 493 "creating TX queue %d size %#x mask %#x\n", 494 tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask); 495 496 /* Allocate software ring */ 497 tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer), 498 GFP_KERNEL); 499 if (!tx_queue->buffer) 500 return -ENOMEM; 501 502 if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) { 503 tx_queue->tsoh_page = 504 kcalloc(efx_tsoh_page_count(tx_queue), 505 sizeof(tx_queue->tsoh_page[0]), GFP_KERNEL); 506 if (!tx_queue->tsoh_page) { 507 rc = -ENOMEM; 508 goto fail1; 509 } 510 } 511 512 /* Allocate hardware ring */ 513 rc = efx_nic_probe_tx(tx_queue); 514 if (rc) 515 goto fail2; 516 517 return 0; 518 519 fail2: 520 kfree(tx_queue->tsoh_page); 521 tx_queue->tsoh_page = NULL; 522 fail1: 523 kfree(tx_queue->buffer); 524 tx_queue->buffer = NULL; 525 return rc; 526 } 527 528 void efx_init_tx_queue(struct efx_tx_queue *tx_queue) 529 { 530 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 531 "initialising TX queue %d\n", tx_queue->queue); 532 533 tx_queue->insert_count = 0; 534 tx_queue->write_count = 0; 535 tx_queue->old_write_count = 0; 536 tx_queue->read_count = 0; 537 tx_queue->old_read_count = 0; 538 tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID; 539 540 /* Set up TX descriptor ring */ 541 efx_nic_init_tx(tx_queue); 542 543 tx_queue->initialised = true; 544 } 545 546 void efx_release_tx_buffers(struct efx_tx_queue *tx_queue) 547 { 548 struct efx_tx_buffer *buffer; 549 550 if (!tx_queue->buffer) 551 return; 552 553 /* Free any buffers left in the ring */ 554 while (tx_queue->read_count != tx_queue->write_count) { 555 unsigned int pkts_compl = 0, bytes_compl = 0; 556 buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask]; 557 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl); 558 559 ++tx_queue->read_count; 560 } 561 netdev_tx_reset_queue(tx_queue->core_txq); 562 } 563 564 void efx_fini_tx_queue(struct efx_tx_queue *tx_queue) 565 { 566 if (!tx_queue->initialised) 567 return; 568 569 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 570 "shutting down TX queue %d\n", tx_queue->queue); 571 572 tx_queue->initialised = false; 573 574 /* Flush TX queue, remove descriptor ring */ 575 efx_nic_fini_tx(tx_queue); 576 577 efx_release_tx_buffers(tx_queue); 578 } 579 580 void efx_remove_tx_queue(struct efx_tx_queue *tx_queue) 581 { 582 int i; 583 584 if (!tx_queue->buffer) 585 return; 586 587 netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev, 588 "destroying TX queue %d\n", tx_queue->queue); 589 efx_nic_remove_tx(tx_queue); 590 591 if (tx_queue->tsoh_page) { 592 for (i = 0; i < efx_tsoh_page_count(tx_queue); i++) 593 efx_nic_free_buffer(tx_queue->efx, 594 &tx_queue->tsoh_page[i]); 595 kfree(tx_queue->tsoh_page); 596 tx_queue->tsoh_page = NULL; 597 } 598 599 kfree(tx_queue->buffer); 600 tx_queue->buffer = NULL; 601 } 602 603 604 /* Efx TCP segmentation acceleration. 605 * 606 * Why? Because by doing it here in the driver we can go significantly 607 * faster than the GSO. 608 * 609 * Requires TX checksum offload support. 610 */ 611 612 /* Number of bytes inserted at the start of a TSO header buffer, 613 * similar to NET_IP_ALIGN. 614 */ 615 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 616 #define TSOH_OFFSET 0 617 #else 618 #define TSOH_OFFSET NET_IP_ALIGN 619 #endif 620 621 #define PTR_DIFF(p1, p2) ((u8 *)(p1) - (u8 *)(p2)) 622 623 /** 624 * struct tso_state - TSO state for an SKB 625 * @out_len: Remaining length in current segment 626 * @seqnum: Current sequence number 627 * @ipv4_id: Current IPv4 ID, host endian 628 * @packet_space: Remaining space in current packet 629 * @dma_addr: DMA address of current position 630 * @in_len: Remaining length in current SKB fragment 631 * @unmap_len: Length of SKB fragment 632 * @unmap_addr: DMA address of SKB fragment 633 * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0 634 * @protocol: Network protocol (after any VLAN header) 635 * @ip_off: Offset of IP header 636 * @tcp_off: Offset of TCP header 637 * @header_len: Number of bytes of header 638 * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload 639 * 640 * The state used during segmentation. It is put into this data structure 641 * just to make it easy to pass into inline functions. 642 */ 643 struct tso_state { 644 /* Output position */ 645 unsigned out_len; 646 unsigned seqnum; 647 unsigned ipv4_id; 648 unsigned packet_space; 649 650 /* Input position */ 651 dma_addr_t dma_addr; 652 unsigned in_len; 653 unsigned unmap_len; 654 dma_addr_t unmap_addr; 655 unsigned short dma_flags; 656 657 __be16 protocol; 658 unsigned int ip_off; 659 unsigned int tcp_off; 660 unsigned header_len; 661 unsigned int ip_base_len; 662 }; 663 664 665 /* 666 * Verify that our various assumptions about sk_buffs and the conditions 667 * under which TSO will be attempted hold true. Return the protocol number. 668 */ 669 static __be16 efx_tso_check_protocol(struct sk_buff *skb) 670 { 671 __be16 protocol = skb->protocol; 672 673 EFX_BUG_ON_PARANOID(((struct ethhdr *)skb->data)->h_proto != 674 protocol); 675 if (protocol == htons(ETH_P_8021Q)) { 676 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 677 protocol = veh->h_vlan_encapsulated_proto; 678 } 679 680 if (protocol == htons(ETH_P_IP)) { 681 EFX_BUG_ON_PARANOID(ip_hdr(skb)->protocol != IPPROTO_TCP); 682 } else { 683 EFX_BUG_ON_PARANOID(protocol != htons(ETH_P_IPV6)); 684 EFX_BUG_ON_PARANOID(ipv6_hdr(skb)->nexthdr != NEXTHDR_TCP); 685 } 686 EFX_BUG_ON_PARANOID((PTR_DIFF(tcp_hdr(skb), skb->data) 687 + (tcp_hdr(skb)->doff << 2u)) > 688 skb_headlen(skb)); 689 690 return protocol; 691 } 692 693 static u8 *efx_tsoh_get_buffer(struct efx_tx_queue *tx_queue, 694 struct efx_tx_buffer *buffer, unsigned int len) 695 { 696 u8 *result; 697 698 EFX_BUG_ON_PARANOID(buffer->len); 699 EFX_BUG_ON_PARANOID(buffer->flags); 700 EFX_BUG_ON_PARANOID(buffer->unmap_len); 701 702 if (likely(len <= TSOH_STD_SIZE - TSOH_OFFSET)) { 703 unsigned index = 704 (tx_queue->insert_count & tx_queue->ptr_mask) / 2; 705 struct efx_buffer *page_buf = 706 &tx_queue->tsoh_page[index / TSOH_PER_PAGE]; 707 unsigned offset = 708 TSOH_STD_SIZE * (index % TSOH_PER_PAGE) + TSOH_OFFSET; 709 710 if (unlikely(!page_buf->addr) && 711 efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE)) 712 return NULL; 713 714 result = (u8 *)page_buf->addr + offset; 715 buffer->dma_addr = page_buf->dma_addr + offset; 716 buffer->flags = EFX_TX_BUF_CONT; 717 } else { 718 tx_queue->tso_long_headers++; 719 720 buffer->heap_buf = kmalloc(TSOH_OFFSET + len, GFP_ATOMIC); 721 if (unlikely(!buffer->heap_buf)) 722 return NULL; 723 result = (u8 *)buffer->heap_buf + TSOH_OFFSET; 724 buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_HEAP; 725 } 726 727 buffer->len = len; 728 729 return result; 730 } 731 732 /** 733 * efx_tx_queue_insert - push descriptors onto the TX queue 734 * @tx_queue: Efx TX queue 735 * @dma_addr: DMA address of fragment 736 * @len: Length of fragment 737 * @final_buffer: The final buffer inserted into the queue 738 * 739 * Push descriptors onto the TX queue. 740 */ 741 static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue, 742 dma_addr_t dma_addr, unsigned len, 743 struct efx_tx_buffer **final_buffer) 744 { 745 struct efx_tx_buffer *buffer; 746 struct efx_nic *efx = tx_queue->efx; 747 unsigned dma_len, insert_ptr; 748 749 EFX_BUG_ON_PARANOID(len <= 0); 750 751 while (1) { 752 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask; 753 buffer = &tx_queue->buffer[insert_ptr]; 754 ++tx_queue->insert_count; 755 756 EFX_BUG_ON_PARANOID(tx_queue->insert_count - 757 tx_queue->read_count >= 758 efx->txq_entries); 759 760 EFX_BUG_ON_PARANOID(buffer->len); 761 EFX_BUG_ON_PARANOID(buffer->unmap_len); 762 EFX_BUG_ON_PARANOID(buffer->flags); 763 764 buffer->dma_addr = dma_addr; 765 766 dma_len = efx_max_tx_len(efx, dma_addr); 767 768 /* If there is enough space to send then do so */ 769 if (dma_len >= len) 770 break; 771 772 buffer->len = dma_len; 773 buffer->flags = EFX_TX_BUF_CONT; 774 dma_addr += dma_len; 775 len -= dma_len; 776 } 777 778 EFX_BUG_ON_PARANOID(!len); 779 buffer->len = len; 780 *final_buffer = buffer; 781 } 782 783 784 /* 785 * Put a TSO header into the TX queue. 786 * 787 * This is special-cased because we know that it is small enough to fit in 788 * a single fragment, and we know it doesn't cross a page boundary. It 789 * also allows us to not worry about end-of-packet etc. 790 */ 791 static int efx_tso_put_header(struct efx_tx_queue *tx_queue, 792 struct efx_tx_buffer *buffer, u8 *header) 793 { 794 if (unlikely(buffer->flags & EFX_TX_BUF_HEAP)) { 795 buffer->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev, 796 header, buffer->len, 797 DMA_TO_DEVICE); 798 if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev, 799 buffer->dma_addr))) { 800 kfree(buffer->heap_buf); 801 buffer->len = 0; 802 buffer->flags = 0; 803 return -ENOMEM; 804 } 805 buffer->unmap_len = buffer->len; 806 buffer->flags |= EFX_TX_BUF_MAP_SINGLE; 807 } 808 809 ++tx_queue->insert_count; 810 return 0; 811 } 812 813 814 /* Remove buffers put into a tx_queue. None of the buffers must have 815 * an skb attached. 816 */ 817 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue) 818 { 819 struct efx_tx_buffer *buffer; 820 821 /* Work backwards until we hit the original insert pointer value */ 822 while (tx_queue->insert_count != tx_queue->write_count) { 823 --tx_queue->insert_count; 824 buffer = &tx_queue->buffer[tx_queue->insert_count & 825 tx_queue->ptr_mask]; 826 efx_dequeue_buffer(tx_queue, buffer, NULL, NULL); 827 } 828 } 829 830 831 /* Parse the SKB header and initialise state. */ 832 static void tso_start(struct tso_state *st, const struct sk_buff *skb) 833 { 834 st->ip_off = skb_network_header(skb) - skb->data; 835 st->tcp_off = skb_transport_header(skb) - skb->data; 836 st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u); 837 if (st->protocol == htons(ETH_P_IP)) { 838 st->ip_base_len = st->header_len - st->ip_off; 839 st->ipv4_id = ntohs(ip_hdr(skb)->id); 840 } else { 841 st->ip_base_len = st->header_len - st->tcp_off; 842 st->ipv4_id = 0; 843 } 844 st->seqnum = ntohl(tcp_hdr(skb)->seq); 845 846 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg); 847 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn); 848 EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst); 849 850 st->out_len = skb->len - st->header_len; 851 st->unmap_len = 0; 852 st->dma_flags = 0; 853 } 854 855 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx, 856 skb_frag_t *frag) 857 { 858 st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0, 859 skb_frag_size(frag), DMA_TO_DEVICE); 860 if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { 861 st->dma_flags = 0; 862 st->unmap_len = skb_frag_size(frag); 863 st->in_len = skb_frag_size(frag); 864 st->dma_addr = st->unmap_addr; 865 return 0; 866 } 867 return -ENOMEM; 868 } 869 870 static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx, 871 const struct sk_buff *skb) 872 { 873 int hl = st->header_len; 874 int len = skb_headlen(skb) - hl; 875 876 st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl, 877 len, DMA_TO_DEVICE); 878 if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) { 879 st->dma_flags = EFX_TX_BUF_MAP_SINGLE; 880 st->unmap_len = len; 881 st->in_len = len; 882 st->dma_addr = st->unmap_addr; 883 return 0; 884 } 885 return -ENOMEM; 886 } 887 888 889 /** 890 * tso_fill_packet_with_fragment - form descriptors for the current fragment 891 * @tx_queue: Efx TX queue 892 * @skb: Socket buffer 893 * @st: TSO state 894 * 895 * Form descriptors for the current fragment, until we reach the end 896 * of fragment or end-of-packet. 897 */ 898 static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue, 899 const struct sk_buff *skb, 900 struct tso_state *st) 901 { 902 struct efx_tx_buffer *buffer; 903 int n; 904 905 if (st->in_len == 0) 906 return; 907 if (st->packet_space == 0) 908 return; 909 910 EFX_BUG_ON_PARANOID(st->in_len <= 0); 911 EFX_BUG_ON_PARANOID(st->packet_space <= 0); 912 913 n = min(st->in_len, st->packet_space); 914 915 st->packet_space -= n; 916 st->out_len -= n; 917 st->in_len -= n; 918 919 efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer); 920 921 if (st->out_len == 0) { 922 /* Transfer ownership of the skb */ 923 buffer->skb = skb; 924 buffer->flags = EFX_TX_BUF_SKB; 925 } else if (st->packet_space != 0) { 926 buffer->flags = EFX_TX_BUF_CONT; 927 } 928 929 if (st->in_len == 0) { 930 /* Transfer ownership of the DMA mapping */ 931 buffer->unmap_len = st->unmap_len; 932 buffer->flags |= st->dma_flags; 933 st->unmap_len = 0; 934 } 935 936 st->dma_addr += n; 937 } 938 939 940 /** 941 * tso_start_new_packet - generate a new header and prepare for the new packet 942 * @tx_queue: Efx TX queue 943 * @skb: Socket buffer 944 * @st: TSO state 945 * 946 * Generate a new header and prepare for the new packet. Return 0 on 947 * success, or -%ENOMEM if failed to alloc header. 948 */ 949 static int tso_start_new_packet(struct efx_tx_queue *tx_queue, 950 const struct sk_buff *skb, 951 struct tso_state *st) 952 { 953 struct efx_tx_buffer *buffer = 954 &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask]; 955 struct tcphdr *tsoh_th; 956 unsigned ip_length; 957 u8 *header; 958 int rc; 959 960 /* Allocate and insert a DMA-mapped header buffer. */ 961 header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len); 962 if (!header) 963 return -ENOMEM; 964 965 tsoh_th = (struct tcphdr *)(header + st->tcp_off); 966 967 /* Copy and update the headers. */ 968 memcpy(header, skb->data, st->header_len); 969 970 tsoh_th->seq = htonl(st->seqnum); 971 st->seqnum += skb_shinfo(skb)->gso_size; 972 if (st->out_len > skb_shinfo(skb)->gso_size) { 973 /* This packet will not finish the TSO burst. */ 974 st->packet_space = skb_shinfo(skb)->gso_size; 975 tsoh_th->fin = 0; 976 tsoh_th->psh = 0; 977 } else { 978 /* This packet will be the last in the TSO burst. */ 979 st->packet_space = st->out_len; 980 tsoh_th->fin = tcp_hdr(skb)->fin; 981 tsoh_th->psh = tcp_hdr(skb)->psh; 982 } 983 ip_length = st->ip_base_len + st->packet_space; 984 985 if (st->protocol == htons(ETH_P_IP)) { 986 struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off); 987 988 tsoh_iph->tot_len = htons(ip_length); 989 990 /* Linux leaves suitable gaps in the IP ID space for us to fill. */ 991 tsoh_iph->id = htons(st->ipv4_id); 992 st->ipv4_id++; 993 } else { 994 struct ipv6hdr *tsoh_iph = 995 (struct ipv6hdr *)(header + st->ip_off); 996 997 tsoh_iph->payload_len = htons(ip_length); 998 } 999 1000 rc = efx_tso_put_header(tx_queue, buffer, header); 1001 if (unlikely(rc)) 1002 return rc; 1003 1004 ++tx_queue->tso_packets; 1005 1006 return 0; 1007 } 1008 1009 1010 /** 1011 * efx_enqueue_skb_tso - segment and transmit a TSO socket buffer 1012 * @tx_queue: Efx TX queue 1013 * @skb: Socket buffer 1014 * 1015 * Context: You must hold netif_tx_lock() to call this function. 1016 * 1017 * Add socket buffer @skb to @tx_queue, doing TSO or return != 0 if 1018 * @skb was not enqueued. In all cases @skb is consumed. Return 1019 * %NETDEV_TX_OK. 1020 */ 1021 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue, 1022 struct sk_buff *skb) 1023 { 1024 struct efx_nic *efx = tx_queue->efx; 1025 int frag_i, rc; 1026 struct tso_state state; 1027 1028 /* Find the packet protocol and sanity-check it */ 1029 state.protocol = efx_tso_check_protocol(skb); 1030 1031 EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count); 1032 1033 tso_start(&state, skb); 1034 1035 /* Assume that skb header area contains exactly the headers, and 1036 * all payload is in the frag list. 1037 */ 1038 if (skb_headlen(skb) == state.header_len) { 1039 /* Grab the first payload fragment. */ 1040 EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1); 1041 frag_i = 0; 1042 rc = tso_get_fragment(&state, efx, 1043 skb_shinfo(skb)->frags + frag_i); 1044 if (rc) 1045 goto mem_err; 1046 } else { 1047 rc = tso_get_head_fragment(&state, efx, skb); 1048 if (rc) 1049 goto mem_err; 1050 frag_i = -1; 1051 } 1052 1053 if (tso_start_new_packet(tx_queue, skb, &state) < 0) 1054 goto mem_err; 1055 1056 while (1) { 1057 tso_fill_packet_with_fragment(tx_queue, skb, &state); 1058 1059 /* Move onto the next fragment? */ 1060 if (state.in_len == 0) { 1061 if (++frag_i >= skb_shinfo(skb)->nr_frags) 1062 /* End of payload reached. */ 1063 break; 1064 rc = tso_get_fragment(&state, efx, 1065 skb_shinfo(skb)->frags + frag_i); 1066 if (rc) 1067 goto mem_err; 1068 } 1069 1070 /* Start at new packet? */ 1071 if (state.packet_space == 0 && 1072 tso_start_new_packet(tx_queue, skb, &state) < 0) 1073 goto mem_err; 1074 } 1075 1076 netdev_tx_sent_queue(tx_queue->core_txq, skb->len); 1077 1078 /* Pass off to hardware */ 1079 efx_nic_push_buffers(tx_queue); 1080 1081 efx_tx_maybe_stop_queue(tx_queue); 1082 1083 tx_queue->tso_bursts++; 1084 return NETDEV_TX_OK; 1085 1086 mem_err: 1087 netif_err(efx, tx_err, efx->net_dev, 1088 "Out of memory for TSO headers, or DMA mapping error\n"); 1089 dev_kfree_skb_any(skb); 1090 1091 /* Free the DMA mapping we were in the process of writing out */ 1092 if (state.unmap_len) { 1093 if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE) 1094 dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr, 1095 state.unmap_len, DMA_TO_DEVICE); 1096 else 1097 dma_unmap_page(&efx->pci_dev->dev, state.unmap_addr, 1098 state.unmap_len, DMA_TO_DEVICE); 1099 } 1100 1101 efx_enqueue_unwind(tx_queue); 1102 return NETDEV_TX_OK; 1103 } 1104