xref: /linux/drivers/net/ethernet/sfc/tx.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /****************************************************************************
2  * Driver for Solarflare Solarstorm network controllers and boards
3  * Copyright 2005-2006 Fen Systems Ltd.
4  * Copyright 2005-2010 Solarflare Communications Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 as published
8  * by the Free Software Foundation, incorporated herein by reference.
9  */
10 
11 #include <linux/pci.h>
12 #include <linux/tcp.h>
13 #include <linux/ip.h>
14 #include <linux/in.h>
15 #include <linux/ipv6.h>
16 #include <linux/slab.h>
17 #include <net/ipv6.h>
18 #include <linux/if_ether.h>
19 #include <linux/highmem.h>
20 #include "net_driver.h"
21 #include "efx.h"
22 #include "nic.h"
23 #include "workarounds.h"
24 
25 /*
26  * TX descriptor ring full threshold
27  *
28  * The tx_queue descriptor ring fill-level must fall below this value
29  * before we restart the netif queue
30  */
31 #define EFX_TXQ_THRESHOLD(_efx) ((_efx)->txq_entries / 2u)
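/* Worked example: with a 1024-entry TX ring this evaluates to 512, so a
 * stopped queue is only woken once fewer than 512 descriptors remain in
 * use (see efx_xmit_done() below).
 */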
32 
33 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
34 			       struct efx_tx_buffer *buffer)
35 {
36 	if (buffer->unmap_len) {
37 		struct pci_dev *pci_dev = tx_queue->efx->pci_dev;
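		/* A fragment may have been split over several descriptors;
		 * this buffer is the final one, so step back over the whole
		 * mapped length to recover the original DMA address.
		 */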
38 		dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len -
39 					 buffer->unmap_len);
40 		if (buffer->unmap_single)
41 			pci_unmap_single(pci_dev, unmap_addr, buffer->unmap_len,
42 					 PCI_DMA_TODEVICE);
43 		else
44 			pci_unmap_page(pci_dev, unmap_addr, buffer->unmap_len,
45 				       PCI_DMA_TODEVICE);
46 		buffer->unmap_len = 0;
47 		buffer->unmap_single = false;
48 	}
49 
50 	if (buffer->skb) {
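		/* buffer->skb is stored with a const qualifier; cast it
		 * away so the skb can be freed.
		 */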
51 		dev_kfree_skb_any((struct sk_buff *) buffer->skb);
52 		buffer->skb = NULL;
53 		netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
54 			   "TX queue %d transmission id %x complete\n",
55 			   tx_queue->queue, tx_queue->read_count);
56 	}
57 }
58 
59 /**
60  * struct efx_tso_header - a DMA mapped buffer for packet headers
61  * @next: Linked list of free TSO headers.
62  *	The list is protected by the TX queue lock.
63  * @unmap_len: Length to unmap for an oversize buffer, or 0.
64  * @dma_addr: The DMA address of the header below.
65  *
66  * This controls the memory used for a TSO header.  Use TSOH_BUFFER()
67  * to find the packet header data.  Use TSOH_SIZE() to calculate the
68  * total size required for a given packet header length.  TSO headers
69  * in the free list are exactly %TSOH_STD_SIZE bytes in size.
70  */
71 struct efx_tso_header {
72 	union {
73 		struct efx_tso_header *next;
74 		size_t unmap_len;
75 	};
76 	dma_addr_t dma_addr;
77 };
78 
79 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
80 			       struct sk_buff *skb);
81 static void efx_fini_tso(struct efx_tx_queue *tx_queue);
82 static void efx_tsoh_heap_free(struct efx_tx_queue *tx_queue,
83 			       struct efx_tso_header *tsoh);
84 
85 static void efx_tsoh_free(struct efx_tx_queue *tx_queue,
86 			  struct efx_tx_buffer *buffer)
87 {
88 	if (buffer->tsoh) {
89 		if (likely(!buffer->tsoh->unmap_len)) {
90 			buffer->tsoh->next = tx_queue->tso_headers_free;
91 			tx_queue->tso_headers_free = buffer->tsoh;
92 		} else {
93 			efx_tsoh_heap_free(tx_queue, buffer->tsoh);
94 		}
95 		buffer->tsoh = NULL;
96 	}
97 }
98 
99 
100 static inline unsigned
101 efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
102 {
103 	/* Depending on the NIC revision, we can use descriptor
104 	 * lengths up to 8K or 8K-1.  However, since PCI Express
105 	 * devices must split read requests at 4K boundaries, there is
106 	 * little benefit from using descriptors that cross those
107 	 * boundaries and we keep things simple by not doing so.
108 	 */
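	/* (~dma_addr & 0xfff) + 1 is equivalent to
	 * 0x1000 - (dma_addr & 0xfff), i.e. the number of bytes remaining
	 * before the next 4K boundary.
	 */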
109 	unsigned len = (~dma_addr & 0xfff) + 1;
110 
111 	/* Work around hardware bug for unaligned buffers. */
112 	if (EFX_WORKAROUND_5391(efx) && (dma_addr & 0xf))
113 		len = min_t(unsigned, len, 512 - (dma_addr & 0xf));
114 
115 	return len;
116 }
117 
118 /*
119  * Add a socket buffer to a TX queue
120  *
121  * This maps all fragments of a socket buffer for DMA and adds them to
122  * the TX queue.  The queue's insert pointer will be incremented by
123  * the number of fragments in the socket buffer.
124  *
125  * If any DMA mapping fails, any mapped fragments will be unmapped,
126  * and the queue's insert pointer will be restored to its original value.
127  *
128  * This function is split out from efx_hard_start_xmit to allow the
129  * loopback test to direct packets via specific TX queues.
130  *
131  * Returns NETDEV_TX_OK or NETDEV_TX_BUSY
132  * You must hold netif_tx_lock() to call this function.
133  */
134 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
135 {
136 	struct efx_nic *efx = tx_queue->efx;
137 	struct pci_dev *pci_dev = efx->pci_dev;
138 	struct efx_tx_buffer *buffer;
139 	skb_frag_t *fragment;
140 	unsigned int len, unmap_len = 0, fill_level, insert_ptr;
141 	dma_addr_t dma_addr, unmap_addr = 0;
142 	unsigned int dma_len;
143 	bool unmap_single;
144 	int q_space, i = 0;
145 	netdev_tx_t rc = NETDEV_TX_OK;
146 
147 	EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
148 
149 	if (skb_shinfo(skb)->gso_size)
150 		return efx_enqueue_skb_tso(tx_queue, skb);
151 
152 	/* Get size of the initial fragment */
153 	len = skb_headlen(skb);
154 
155 	/* Pad if necessary */
156 	if (EFX_WORKAROUND_15592(efx) && skb->len <= 32) {
157 		EFX_BUG_ON_PARANOID(skb->data_len);
158 		len = 32 + 1;
159 		if (skb_pad(skb, len - skb->len))
160 			return NETDEV_TX_OK;
161 	}
162 
163 	fill_level = tx_queue->insert_count - tx_queue->old_read_count;
164 	q_space = efx->txq_entries - 1 - fill_level;
165 
166 	/* Map for DMA.  Use pci_map_single rather than pci_map_page
167 	 * since this is more efficient on machines with sparse
168 	 * memory.
169 	 */
170 	unmap_single = true;
171 	dma_addr = pci_map_single(pci_dev, skb->data, len, PCI_DMA_TODEVICE);
172 
173 	/* Process all fragments */
174 	while (1) {
175 		if (unlikely(pci_dma_mapping_error(pci_dev, dma_addr)))
176 			goto pci_err;
177 
178 		/* Store fields for marking in the per-fragment final
179 		 * descriptor */
180 		unmap_len = len;
181 		unmap_addr = dma_addr;
182 
183 		/* Add to TX queue, splitting across DMA boundaries */
184 		do {
185 			if (unlikely(q_space-- <= 0)) {
186 				/* It might be that completions have
187 				 * happened since the xmit path last
188 				 * checked.  Update the xmit path's
189 				 * copy of read_count.
190 				 */
191 				netif_tx_stop_queue(tx_queue->core_txq);
192 				/* This memory barrier protects the
193 				 * change of queue state from the access
194 				 * of read_count. */
195 				smp_mb();
196 				tx_queue->old_read_count =
197 					ACCESS_ONCE(tx_queue->read_count);
198 				fill_level = (tx_queue->insert_count
199 					      - tx_queue->old_read_count);
200 				q_space = efx->txq_entries - 1 - fill_level;
201 				if (unlikely(q_space-- <= 0)) {
202 					rc = NETDEV_TX_BUSY;
203 					goto unwind;
204 				}
205 				smp_mb();
206 				if (likely(!efx->loopback_selftest))
207 					netif_tx_start_queue(
208 						tx_queue->core_txq);
209 			}
210 
211 			insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
212 			buffer = &tx_queue->buffer[insert_ptr];
213 			efx_tsoh_free(tx_queue, buffer);
214 			EFX_BUG_ON_PARANOID(buffer->tsoh);
215 			EFX_BUG_ON_PARANOID(buffer->skb);
216 			EFX_BUG_ON_PARANOID(buffer->len);
217 			EFX_BUG_ON_PARANOID(!buffer->continuation);
218 			EFX_BUG_ON_PARANOID(buffer->unmap_len);
219 
220 			dma_len = efx_max_tx_len(efx, dma_addr);
221 			if (likely(dma_len >= len))
222 				dma_len = len;
223 
224 			/* Fill out per descriptor fields */
225 			buffer->len = dma_len;
226 			buffer->dma_addr = dma_addr;
227 			len -= dma_len;
228 			dma_addr += dma_len;
229 			++tx_queue->insert_count;
230 		} while (len);
231 
232 		/* Transfer ownership of the unmapping to the final buffer */
233 		buffer->unmap_single = unmap_single;
234 		buffer->unmap_len = unmap_len;
235 		unmap_len = 0;
236 
237 		/* Get address and size of next fragment */
238 		if (i >= skb_shinfo(skb)->nr_frags)
239 			break;
240 		fragment = &skb_shinfo(skb)->frags[i];
241 		len = skb_frag_size(fragment);
242 		i++;
243 		/* Map for DMA */
244 		unmap_single = false;
245 		dma_addr = skb_frag_dma_map(&pci_dev->dev, fragment, 0, len,
246 					    DMA_TO_DEVICE);
247 	}
248 
249 	/* Transfer ownership of the skb to the final buffer */
250 	buffer->skb = skb;
251 	buffer->continuation = false;
252 
253 	/* Pass off to hardware */
254 	efx_nic_push_buffers(tx_queue);
255 
256 	return NETDEV_TX_OK;
257 
258  pci_err:
259 	netif_err(efx, tx_err, efx->net_dev,
260 		  " TX queue %d could not map skb with %d bytes %d "
261 		  "fragments for DMA\n", tx_queue->queue, skb->len,
262 		  skb_shinfo(skb)->nr_frags + 1);
263 
264 	/* Mark the packet as transmitted, and free the SKB ourselves */
265 	dev_kfree_skb_any(skb);
266 
267  unwind:
268 	/* Work backwards until we hit the original insert pointer value */
269 	while (tx_queue->insert_count != tx_queue->write_count) {
270 		--tx_queue->insert_count;
271 		insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
272 		buffer = &tx_queue->buffer[insert_ptr];
273 		efx_dequeue_buffer(tx_queue, buffer);
274 		buffer->len = 0;
275 	}
276 
277 	/* Free the fragment we were mid-way through pushing */
278 	if (unmap_len) {
279 		if (unmap_single)
280 			pci_unmap_single(pci_dev, unmap_addr, unmap_len,
281 					 PCI_DMA_TODEVICE);
282 		else
283 			pci_unmap_page(pci_dev, unmap_addr, unmap_len,
284 				       PCI_DMA_TODEVICE);
285 	}
286 
287 	return rc;
288 }
289 
290 /* Remove packets from the TX queue
291  *
292  * This removes packets from the TX queue, up to and including the
293  * specified index.
294  */
295 static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
296 				unsigned int index)
297 {
298 	struct efx_nic *efx = tx_queue->efx;
299 	unsigned int stop_index, read_ptr;
300 
301 	stop_index = (index + 1) & tx_queue->ptr_mask;
302 	read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
303 
304 	while (read_ptr != stop_index) {
305 		struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr];
306 		if (unlikely(buffer->len == 0)) {
307 			netif_err(efx, tx_err, efx->net_dev,
308 				  "TX queue %d spurious TX completion id %x\n",
309 				  tx_queue->queue, read_ptr);
310 			efx_schedule_reset(efx, RESET_TYPE_TX_SKIP);
311 			return;
312 		}
313 
314 		efx_dequeue_buffer(tx_queue, buffer);
315 		buffer->continuation = true;
316 		buffer->len = 0;
317 
318 		++tx_queue->read_count;
319 		read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
320 	}
321 }
322 
323 /* Initiate a packet transmission.  We use one channel per CPU
324  * (sharing when we have more CPUs than channels).  On Falcon, the TX
325  * completion events will be directed back to the CPU that transmitted
326  * the packet, which should be cache-efficient.
327  *
328  * Context: non-blocking.
329  * Note that returning anything other than NETDEV_TX_OK will cause the
330  * OS to free the skb.
331  */
332 netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
333 				      struct net_device *net_dev)
334 {
335 	struct efx_nic *efx = netdev_priv(net_dev);
336 	struct efx_tx_queue *tx_queue;
337 	unsigned index, type;
338 
339 	EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
340 
341 	index = skb_get_queue_mapping(skb);
342 	type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
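	/* The core maps high-priority traffic to queue indices
	 * n_tx_channels..2*n_tx_channels-1 (see efx_setup_tc()), so fold
	 * that back into a channel index plus the HIGHPRI type flag.
	 */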
343 	if (index >= efx->n_tx_channels) {
344 		index -= efx->n_tx_channels;
345 		type |= EFX_TXQ_TYPE_HIGHPRI;
346 	}
347 	tx_queue = efx_get_tx_queue(efx, index, type);
348 
349 	return efx_enqueue_skb(tx_queue, skb);
350 }
351 
352 void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
353 {
354 	struct efx_nic *efx = tx_queue->efx;
355 
356 	/* Must be inverse of queue lookup in efx_hard_start_xmit() */
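	/* tx_queue->queue / EFX_TXQ_TYPES recovers the channel index;
	 * high-priority queues map to core queues offset by n_tx_channels.
	 */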
357 	tx_queue->core_txq =
358 		netdev_get_tx_queue(efx->net_dev,
359 				    tx_queue->queue / EFX_TXQ_TYPES +
360 				    ((tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI) ?
361 				     efx->n_tx_channels : 0));
362 }
363 
364 int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
365 {
366 	struct efx_nic *efx = netdev_priv(net_dev);
367 	struct efx_channel *channel;
368 	struct efx_tx_queue *tx_queue;
369 	unsigned tc;
370 	int rc;
371 
372 	if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
373 		return -EINVAL;
374 
375 	if (num_tc == net_dev->num_tc)
376 		return 0;
377 
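	/* Traffic class tc is served by core TX queues
	 * [tc * n_tx_channels, (tc + 1) * n_tx_channels).
	 */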
378 	for (tc = 0; tc < num_tc; tc++) {
379 		net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels;
380 		net_dev->tc_to_txq[tc].count = efx->n_tx_channels;
381 	}
382 
383 	if (num_tc > net_dev->num_tc) {
384 		/* Initialise high-priority queues as necessary */
385 		efx_for_each_channel(channel, efx) {
386 			efx_for_each_possible_channel_tx_queue(tx_queue,
387 							       channel) {
388 				if (!(tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI))
389 					continue;
390 				if (!tx_queue->buffer) {
391 					rc = efx_probe_tx_queue(tx_queue);
392 					if (rc)
393 						return rc;
394 				}
395 				if (!tx_queue->initialised)
396 					efx_init_tx_queue(tx_queue);
397 				efx_init_tx_queue_core_txq(tx_queue);
398 			}
399 		}
400 	} else {
401 		/* Reduce number of classes before number of queues */
402 		net_dev->num_tc = num_tc;
403 	}
404 
405 	rc = netif_set_real_num_tx_queues(net_dev,
406 					  max_t(int, num_tc, 1) *
407 					  efx->n_tx_channels);
408 	if (rc)
409 		return rc;
410 
411 	/* Do not destroy high-priority queues when they become
412 	 * unused.  We would have to flush them first, and it is
413 	 * fairly difficult to flush a subset of TX queues.  Leave
414 	 * it to efx_fini_channels().
415 	 */
416 
417 	net_dev->num_tc = num_tc;
418 	return 0;
419 }
420 
421 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
422 {
423 	unsigned fill_level;
424 	struct efx_nic *efx = tx_queue->efx;
425 
426 	EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask);
427 
428 	efx_dequeue_buffers(tx_queue, index);
429 
430 	/* See if we need to restart the netif queue.  This barrier
431 	 * separates the update of read_count from the test of the
432 	 * queue state. */
433 	smp_mb();
434 	if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
435 	    likely(efx->port_enabled) &&
436 	    likely(netif_device_present(efx->net_dev))) {
437 		fill_level = tx_queue->insert_count - tx_queue->read_count;
438 		if (fill_level < EFX_TXQ_THRESHOLD(efx)) {
439 			EFX_BUG_ON_PARANOID(!efx_dev_registered(efx));
440 			netif_tx_wake_queue(tx_queue->core_txq);
441 		}
442 	}
443 
444 	/* Check whether the hardware queue is now empty */
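	/* The cast to int handles counter wrap-around: the condition is
	 * true once read_count has caught up with old_write_count.
	 */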
445 	if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
446 		tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count);
447 		if (tx_queue->read_count == tx_queue->old_write_count) {
448 			smp_mb();
449 			tx_queue->empty_read_count =
450 				tx_queue->read_count | EFX_EMPTY_COUNT_VALID;
451 		}
452 	}
453 }
454 
455 int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
456 {
457 	struct efx_nic *efx = tx_queue->efx;
458 	unsigned int entries;
459 	int i, rc;
460 
461 	/* Create the smallest power-of-two sized ring that can hold txq_entries */
462 	entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE);
463 	EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
464 	tx_queue->ptr_mask = entries - 1;
465 
466 	netif_dbg(efx, probe, efx->net_dev,
467 		  "creating TX queue %d size %#x mask %#x\n",
468 		  tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask);
469 
470 	/* Allocate software ring */
471 	tx_queue->buffer = kzalloc(entries * sizeof(*tx_queue->buffer),
472 				   GFP_KERNEL);
473 	if (!tx_queue->buffer)
474 		return -ENOMEM;
475 	for (i = 0; i <= tx_queue->ptr_mask; ++i)
476 		tx_queue->buffer[i].continuation = true;
477 
478 	/* Allocate hardware ring */
479 	rc = efx_nic_probe_tx(tx_queue);
480 	if (rc)
481 		goto fail;
482 
483 	return 0;
484 
485  fail:
486 	kfree(tx_queue->buffer);
487 	tx_queue->buffer = NULL;
488 	return rc;
489 }
490 
491 void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
492 {
493 	netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
494 		  "initialising TX queue %d\n", tx_queue->queue);
495 
496 	tx_queue->insert_count = 0;
497 	tx_queue->write_count = 0;
498 	tx_queue->old_write_count = 0;
499 	tx_queue->read_count = 0;
500 	tx_queue->old_read_count = 0;
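	/* The queue starts out marked as known-empty, which allows the
	 * TX push optimisation in nic.c to be used for the first packet.
	 */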
501 	tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID;
502 
503 	/* Set up TX descriptor ring */
504 	efx_nic_init_tx(tx_queue);
505 
506 	tx_queue->initialised = true;
507 }
508 
509 void efx_release_tx_buffers(struct efx_tx_queue *tx_queue)
510 {
511 	struct efx_tx_buffer *buffer;
512 
513 	if (!tx_queue->buffer)
514 		return;
515 
516 	/* Free any buffers left in the ring */
517 	while (tx_queue->read_count != tx_queue->write_count) {
518 		buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
519 		efx_dequeue_buffer(tx_queue, buffer);
520 		buffer->continuation = true;
521 		buffer->len = 0;
522 
523 		++tx_queue->read_count;
524 	}
525 }
526 
527 void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
528 {
529 	if (!tx_queue->initialised)
530 		return;
531 
532 	netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
533 		  "shutting down TX queue %d\n", tx_queue->queue);
534 
535 	tx_queue->initialised = false;
536 
537 	/* Flush TX queue, remove descriptor ring */
538 	efx_nic_fini_tx(tx_queue);
539 
540 	efx_release_tx_buffers(tx_queue);
541 
542 	/* Free up TSO header cache */
543 	efx_fini_tso(tx_queue);
544 }
545 
546 void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
547 {
548 	if (!tx_queue->buffer)
549 		return;
550 
551 	netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
552 		  "destroying TX queue %d\n", tx_queue->queue);
553 	efx_nic_remove_tx(tx_queue);
554 
555 	kfree(tx_queue->buffer);
556 	tx_queue->buffer = NULL;
557 }
558 
559 
560 /* Efx TCP segmentation acceleration.
561  *
562  * Why?  Because by doing it here in the driver we can go significantly
563  * faster than the kernel's software GSO.
564  *
565  * Requires TX checksum offload support.
566  */
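/* Roughly speaking, the gain comes from generating only the per-segment
 * headers here and pointing descriptors at the already-mapped payload, so
 * no per-segment skb allocation or payload copying is needed as there
 * would be for software GSO.
 */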
567 
568 /* Number of bytes inserted at the start of a TSO header buffer,
569  * similar to NET_IP_ALIGN.
570  */
571 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
572 #define TSOH_OFFSET	0
573 #else
574 #define TSOH_OFFSET	NET_IP_ALIGN
575 #endif
576 
577 #define TSOH_BUFFER(tsoh)	((u8 *)(tsoh + 1) + TSOH_OFFSET)
578 
579 /* Total size of struct efx_tso_header, buffer and padding */
580 #define TSOH_SIZE(hdr_len)					\
581 	(sizeof(struct efx_tso_header) + TSOH_OFFSET + hdr_len)
582 
583 /* Size of blocks on free list.  Larger blocks must be allocated from
584  * the heap.
585  */
586 #define TSOH_STD_SIZE		128
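/* efx_tsoh_block_alloc() carves each DMA-coherent page into
 * PAGE_SIZE / TSOH_STD_SIZE of these standard-sized headers.
 */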
587 
588 #define PTR_DIFF(p1, p2)  ((u8 *)(p1) - (u8 *)(p2))
589 #define ETH_HDR_LEN(skb)  (skb_network_header(skb) - (skb)->data)
590 #define SKB_TCP_OFF(skb)  PTR_DIFF(tcp_hdr(skb), (skb)->data)
591 #define SKB_IPV4_OFF(skb) PTR_DIFF(ip_hdr(skb), (skb)->data)
592 #define SKB_IPV6_OFF(skb) PTR_DIFF(ipv6_hdr(skb), (skb)->data)
593 
594 /**
595  * struct tso_state - TSO state for an SKB
596  * @out_len: Remaining length in current segment
597  * @seqnum: Current sequence number
598  * @ipv4_id: Current IPv4 ID, host endian
599  * @packet_space: Remaining space in current packet
600  * @dma_addr: DMA address of current position
601  * @in_len: Remaining length in current SKB fragment
602  * @unmap_len: Length of SKB fragment
603  * @unmap_addr: DMA address of SKB fragment
604  * @unmap_single: DMA single vs page mapping flag
605  * @protocol: Network protocol (after any VLAN header)
606  * @header_len: Number of bytes of header
607  * @full_packet_size: Number of bytes to put in each outgoing segment
608  *
609  * The state used during segmentation.  It is put into this data structure
610  * just to make it easy to pass into inline functions.
611  */
612 struct tso_state {
613 	/* Output position */
614 	unsigned out_len;
615 	unsigned seqnum;
616 	unsigned ipv4_id;
617 	unsigned packet_space;
618 
619 	/* Input position */
620 	dma_addr_t dma_addr;
621 	unsigned in_len;
622 	unsigned unmap_len;
623 	dma_addr_t unmap_addr;
624 	bool unmap_single;
625 
626 	__be16 protocol;
627 	unsigned header_len;
628 	int full_packet_size;
629 };
630 
631 
632 /*
633  * Verify that our various assumptions about sk_buffs and the conditions
634  * under which TSO will be attempted hold true.  Return the protocol number.
635  */
636 static __be16 efx_tso_check_protocol(struct sk_buff *skb)
637 {
638 	__be16 protocol = skb->protocol;
639 
640 	EFX_BUG_ON_PARANOID(((struct ethhdr *)skb->data)->h_proto !=
641 			    protocol);
642 	if (protocol == htons(ETH_P_8021Q)) {
643 		/* Find the encapsulated protocol; reset network header
644 		 * and transport header based on that. */
645 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
646 		protocol = veh->h_vlan_encapsulated_proto;
647 		skb_set_network_header(skb, sizeof(*veh));
648 		if (protocol == htons(ETH_P_IP))
649 			skb_set_transport_header(skb, sizeof(*veh) +
650 						 4 * ip_hdr(skb)->ihl);
651 		else if (protocol == htons(ETH_P_IPV6))
652 			skb_set_transport_header(skb, sizeof(*veh) +
653 						 sizeof(struct ipv6hdr));
654 	}
655 
656 	if (protocol == htons(ETH_P_IP)) {
657 		EFX_BUG_ON_PARANOID(ip_hdr(skb)->protocol != IPPROTO_TCP);
658 	} else {
659 		EFX_BUG_ON_PARANOID(protocol != htons(ETH_P_IPV6));
660 		EFX_BUG_ON_PARANOID(ipv6_hdr(skb)->nexthdr != NEXTHDR_TCP);
661 	}
662 	EFX_BUG_ON_PARANOID((PTR_DIFF(tcp_hdr(skb), skb->data)
663 			     + (tcp_hdr(skb)->doff << 2u)) >
664 			    skb_headlen(skb));
665 
666 	return protocol;
667 }
668 
669 
670 /*
671  * Allocate a page worth of efx_tso_header structures, and string them
672  * into the tx_queue->tso_headers_free linked list. Return 0 or -ENOMEM.
673  */
674 static int efx_tsoh_block_alloc(struct efx_tx_queue *tx_queue)
675 {
676 
677 	struct pci_dev *pci_dev = tx_queue->efx->pci_dev;
678 	struct efx_tso_header *tsoh;
679 	dma_addr_t dma_addr;
680 	u8 *base_kva, *kva;
681 
682 	base_kva = pci_alloc_consistent(pci_dev, PAGE_SIZE, &dma_addr);
683 	if (base_kva == NULL) {
684 		netif_err(tx_queue->efx, tx_err, tx_queue->efx->net_dev,
685 			  "Unable to allocate page for TSO headers\n");
686 		return -ENOMEM;
687 	}
688 
689 	/* pci_alloc_consistent() allocates pages. */
690 	EFX_BUG_ON_PARANOID(dma_addr & (PAGE_SIZE - 1u));
691 
692 	for (kva = base_kva; kva < base_kva + PAGE_SIZE; kva += TSOH_STD_SIZE) {
693 		tsoh = (struct efx_tso_header *)kva;
694 		tsoh->dma_addr = dma_addr + (TSOH_BUFFER(tsoh) - base_kva);
695 		tsoh->next = tx_queue->tso_headers_free;
696 		tx_queue->tso_headers_free = tsoh;
697 	}
698 
699 	return 0;
700 }
701 
702 
703 /* Free up a TSO header, and all others in the same page. */
704 static void efx_tsoh_block_free(struct efx_tx_queue *tx_queue,
705 				struct efx_tso_header *tsoh,
706 				struct pci_dev *pci_dev)
707 {
708 	struct efx_tso_header **p;
709 	unsigned long base_kva;
710 	dma_addr_t base_dma;
711 
712 	base_kva = (unsigned long)tsoh & PAGE_MASK;
713 	base_dma = tsoh->dma_addr & PAGE_MASK;
714 
715 	p = &tx_queue->tso_headers_free;
716 	while (*p != NULL) {
717 		if (((unsigned long)*p & PAGE_MASK) == base_kva)
718 			*p = (*p)->next;
719 		else
720 			p = &(*p)->next;
721 	}
722 
723 	pci_free_consistent(pci_dev, PAGE_SIZE, (void *)base_kva, base_dma);
724 }
725 
726 static struct efx_tso_header *
727 efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len)
728 {
729 	struct efx_tso_header *tsoh;
730 
731 	tsoh = kmalloc(TSOH_SIZE(header_len), GFP_ATOMIC | GFP_DMA);
732 	if (unlikely(!tsoh))
733 		return NULL;
734 
735 	tsoh->dma_addr = pci_map_single(tx_queue->efx->pci_dev,
736 					TSOH_BUFFER(tsoh), header_len,
737 					PCI_DMA_TODEVICE);
738 	if (unlikely(pci_dma_mapping_error(tx_queue->efx->pci_dev,
739 					   tsoh->dma_addr))) {
740 		kfree(tsoh);
741 		return NULL;
742 	}
743 
744 	tsoh->unmap_len = header_len;
745 	return tsoh;
746 }
747 
748 static void
749 efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh)
750 {
751 	pci_unmap_single(tx_queue->efx->pci_dev,
752 			 tsoh->dma_addr, tsoh->unmap_len,
753 			 PCI_DMA_TODEVICE);
754 	kfree(tsoh);
755 }
756 
757 /**
758  * efx_tx_queue_insert - push descriptors onto the TX queue
759  * @tx_queue:		Efx TX queue
760  * @dma_addr:		DMA address of fragment
761  * @len:		Length of fragment
762  * @final_buffer:	The final buffer inserted into the queue
763  *
764  * Push descriptors onto the TX queue.  Return 0 on success or 1 if
765  * @tx_queue is full.
766  */
767 static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
768 			       dma_addr_t dma_addr, unsigned len,
769 			       struct efx_tx_buffer **final_buffer)
770 {
771 	struct efx_tx_buffer *buffer;
772 	struct efx_nic *efx = tx_queue->efx;
773 	unsigned dma_len, fill_level, insert_ptr;
774 	int q_space;
775 
776 	EFX_BUG_ON_PARANOID(len <= 0);
777 
778 	fill_level = tx_queue->insert_count - tx_queue->old_read_count;
779 	/* -1 as there is no way to represent all descriptors used */
780 	q_space = efx->txq_entries - 1 - fill_level;
781 
782 	while (1) {
783 		if (unlikely(q_space-- <= 0)) {
784 			/* It might be that completions have happened
785 			 * since the xmit path last checked.  Update
786 			 * the xmit path's copy of read_count.
787 			 */
788 			netif_tx_stop_queue(tx_queue->core_txq);
789 			/* This memory barrier protects the change of
790 			 * queue state from the access of read_count. */
791 			smp_mb();
792 			tx_queue->old_read_count =
793 				ACCESS_ONCE(tx_queue->read_count);
794 			fill_level = (tx_queue->insert_count
795 				      - tx_queue->old_read_count);
796 			q_space = efx->txq_entries - 1 - fill_level;
797 			if (unlikely(q_space-- <= 0)) {
798 				*final_buffer = NULL;
799 				return 1;
800 			}
801 			smp_mb();
802 			netif_tx_start_queue(tx_queue->core_txq);
803 		}
804 
805 		insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
806 		buffer = &tx_queue->buffer[insert_ptr];
807 		++tx_queue->insert_count;
808 
809 		EFX_BUG_ON_PARANOID(tx_queue->insert_count -
810 				    tx_queue->read_count >=
811 				    efx->txq_entries);
812 
813 		efx_tsoh_free(tx_queue, buffer);
814 		EFX_BUG_ON_PARANOID(buffer->len);
815 		EFX_BUG_ON_PARANOID(buffer->unmap_len);
816 		EFX_BUG_ON_PARANOID(buffer->skb);
817 		EFX_BUG_ON_PARANOID(!buffer->continuation);
818 		EFX_BUG_ON_PARANOID(buffer->tsoh);
819 
820 		buffer->dma_addr = dma_addr;
821 
822 		dma_len = efx_max_tx_len(efx, dma_addr);
823 
824 		/* If there is enough space to send then do so */
825 		if (dma_len >= len)
826 			break;
827 
828 		buffer->len = dma_len; /* Don't set the other members */
829 		dma_addr += dma_len;
830 		len -= dma_len;
831 	}
832 
833 	EFX_BUG_ON_PARANOID(!len);
834 	buffer->len = len;
835 	*final_buffer = buffer;
836 	return 0;
837 }
838 
839 
840 /*
841  * Put a TSO header into the TX queue.
842  *
843  * This is special-cased because we know that it is small enough to fit in
844  * a single fragment, and we know it doesn't cross a page boundary.  It
845  * also allows us to not worry about end-of-packet etc.
846  */
847 static void efx_tso_put_header(struct efx_tx_queue *tx_queue,
848 			       struct efx_tso_header *tsoh, unsigned len)
849 {
850 	struct efx_tx_buffer *buffer;
851 
852 	buffer = &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
853 	efx_tsoh_free(tx_queue, buffer);
854 	EFX_BUG_ON_PARANOID(buffer->len);
855 	EFX_BUG_ON_PARANOID(buffer->unmap_len);
856 	EFX_BUG_ON_PARANOID(buffer->skb);
857 	EFX_BUG_ON_PARANOID(!buffer->continuation);
858 	EFX_BUG_ON_PARANOID(buffer->tsoh);
859 	buffer->len = len;
860 	buffer->dma_addr = tsoh->dma_addr;
861 	buffer->tsoh = tsoh;
862 
863 	++tx_queue->insert_count;
864 }
865 
866 
867 /* Remove descriptors put into a tx_queue. */
868 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
869 {
870 	struct efx_tx_buffer *buffer;
871 	dma_addr_t unmap_addr;
872 
873 	/* Work backwards until we hit the original insert pointer value */
874 	while (tx_queue->insert_count != tx_queue->write_count) {
875 		--tx_queue->insert_count;
876 		buffer = &tx_queue->buffer[tx_queue->insert_count &
877 					   tx_queue->ptr_mask];
878 		efx_tsoh_free(tx_queue, buffer);
879 		EFX_BUG_ON_PARANOID(buffer->skb);
880 		if (buffer->unmap_len) {
881 			unmap_addr = (buffer->dma_addr + buffer->len -
882 				      buffer->unmap_len);
883 			if (buffer->unmap_single)
884 				pci_unmap_single(tx_queue->efx->pci_dev,
885 						 unmap_addr, buffer->unmap_len,
886 						 PCI_DMA_TODEVICE);
887 			else
888 				pci_unmap_page(tx_queue->efx->pci_dev,
889 					       unmap_addr, buffer->unmap_len,
890 					       PCI_DMA_TODEVICE);
891 			buffer->unmap_len = 0;
892 		}
893 		buffer->len = 0;
894 		buffer->continuation = true;
895 	}
896 }
897 
898 
899 /* Parse the SKB header and initialise state. */
900 static void tso_start(struct tso_state *st, const struct sk_buff *skb)
901 {
902 	/* The combined size of all Ethernet/IP/TCP headers is the TCP header
903 	 * size plus the offset of the TCP header from the start of the packet.
904 	 */
905 	st->header_len = ((tcp_hdr(skb)->doff << 2u)
906 			  + PTR_DIFF(tcp_hdr(skb), skb->data));
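	/* full_packet_size is the on-wire size of every output segment
	 * except possibly the last, which may carry less payload.
	 */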
907 	st->full_packet_size = st->header_len + skb_shinfo(skb)->gso_size;
908 
909 	if (st->protocol == htons(ETH_P_IP))
910 		st->ipv4_id = ntohs(ip_hdr(skb)->id);
911 	else
912 		st->ipv4_id = 0;
913 	st->seqnum = ntohl(tcp_hdr(skb)->seq);
914 
915 	EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg);
916 	EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn);
917 	EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst);
918 
919 	st->packet_space = st->full_packet_size;
920 	st->out_len = skb->len - st->header_len;
921 	st->unmap_len = 0;
922 	st->unmap_single = false;
923 }
924 
925 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
926 			    skb_frag_t *frag)
927 {
928 	st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0,
929 					  skb_frag_size(frag), DMA_TO_DEVICE);
930 	if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
931 		st->unmap_single = false;
932 		st->unmap_len = skb_frag_size(frag);
933 		st->in_len = skb_frag_size(frag);
934 		st->dma_addr = st->unmap_addr;
935 		return 0;
936 	}
937 	return -ENOMEM;
938 }
939 
940 static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
941 				 const struct sk_buff *skb)
942 {
943 	int hl = st->header_len;
944 	int len = skb_headlen(skb) - hl;
945 
946 	st->unmap_addr = pci_map_single(efx->pci_dev, skb->data + hl,
947 					len, PCI_DMA_TODEVICE);
948 	if (likely(!pci_dma_mapping_error(efx->pci_dev, st->unmap_addr))) {
949 		st->unmap_single = true;
950 		st->unmap_len = len;
951 		st->in_len = len;
952 		st->dma_addr = st->unmap_addr;
953 		return 0;
954 	}
955 	return -ENOMEM;
956 }
957 
958 
959 /**
960  * tso_fill_packet_with_fragment - form descriptors for the current fragment
961  * @tx_queue:		Efx TX queue
962  * @skb:		Socket buffer
963  * @st:			TSO state
964  *
965  * Form descriptors for the current fragment, until we reach the end
966  * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
967  * space in @tx_queue.
968  */
969 static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
970 					 const struct sk_buff *skb,
971 					 struct tso_state *st)
972 {
973 	struct efx_tx_buffer *buffer;
974 	int n, end_of_packet, rc;
975 
976 	if (st->in_len == 0)
977 		return 0;
978 	if (st->packet_space == 0)
979 		return 0;
980 
981 	EFX_BUG_ON_PARANOID(st->in_len <= 0);
982 	EFX_BUG_ON_PARANOID(st->packet_space <= 0);
983 
984 	n = min(st->in_len, st->packet_space);
985 
986 	st->packet_space -= n;
987 	st->out_len -= n;
988 	st->in_len -= n;
989 
990 	rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
991 	if (likely(rc == 0)) {
992 		if (st->out_len == 0)
993 			/* Transfer ownership of the skb */
994 			buffer->skb = skb;
995 
996 		end_of_packet = st->out_len == 0 || st->packet_space == 0;
997 		buffer->continuation = !end_of_packet;
998 
999 		if (st->in_len == 0) {
1000 			/* Transfer ownership of the pci mapping */
1001 			buffer->unmap_len = st->unmap_len;
1002 			buffer->unmap_single = st->unmap_single;
1003 			st->unmap_len = 0;
1004 		}
1005 	}
1006 
1007 	st->dma_addr += n;
1008 	return rc;
1009 }
1010 
1011 
1012 /**
1013  * tso_start_new_packet - generate a new header and prepare for the new packet
1014  * @tx_queue:		Efx TX queue
1015  * @skb:		Socket buffer
1016  * @st:			TSO state
1017  *
1018  * Generate a new header and prepare for the new packet.  Return 0 on
1019  * success, or -1 if failed to alloc header.
1020  */
1021 static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
1022 				const struct sk_buff *skb,
1023 				struct tso_state *st)
1024 {
1025 	struct efx_tso_header *tsoh;
1026 	struct tcphdr *tsoh_th;
1027 	unsigned ip_length;
1028 	u8 *header;
1029 
1030 	/* Allocate a DMA-mapped header buffer. */
1031 	if (likely(TSOH_SIZE(st->header_len) <= TSOH_STD_SIZE)) {
1032 		if (tx_queue->tso_headers_free == NULL) {
1033 			if (efx_tsoh_block_alloc(tx_queue))
1034 				return -1;
1035 		}
1036 		EFX_BUG_ON_PARANOID(!tx_queue->tso_headers_free);
1037 		tsoh = tx_queue->tso_headers_free;
1038 		tx_queue->tso_headers_free = tsoh->next;
1039 		tsoh->unmap_len = 0;
1040 	} else {
1041 		tx_queue->tso_long_headers++;
1042 		tsoh = efx_tsoh_heap_alloc(tx_queue, st->header_len);
1043 		if (unlikely(!tsoh))
1044 			return -1;
1045 	}
1046 
1047 	header = TSOH_BUFFER(tsoh);
1048 	tsoh_th = (struct tcphdr *)(header + SKB_TCP_OFF(skb));
1049 
1050 	/* Copy and update the headers. */
1051 	memcpy(header, skb->data, st->header_len);
1052 
1053 	tsoh_th->seq = htonl(st->seqnum);
1054 	st->seqnum += skb_shinfo(skb)->gso_size;
1055 	if (st->out_len > skb_shinfo(skb)->gso_size) {
1056 		/* This packet will not finish the TSO burst. */
1057 		ip_length = st->full_packet_size - ETH_HDR_LEN(skb);
1058 		tsoh_th->fin = 0;
1059 		tsoh_th->psh = 0;
1060 	} else {
1061 		/* This packet will be the last in the TSO burst. */
1062 		ip_length = st->header_len - ETH_HDR_LEN(skb) + st->out_len;
1063 		tsoh_th->fin = tcp_hdr(skb)->fin;
1064 		tsoh_th->psh = tcp_hdr(skb)->psh;
1065 	}
1066 
1067 	if (st->protocol == htons(ETH_P_IP)) {
1068 		struct iphdr *tsoh_iph =
1069 			(struct iphdr *)(header + SKB_IPV4_OFF(skb));
1070 
1071 		tsoh_iph->tot_len = htons(ip_length);
1072 
1073 		/* Linux leaves suitable gaps in the IP ID space for us to fill. */
1074 		tsoh_iph->id = htons(st->ipv4_id);
1075 		st->ipv4_id++;
1076 	} else {
1077 		struct ipv6hdr *tsoh_iph =
1078 			(struct ipv6hdr *)(header + SKB_IPV6_OFF(skb));
1079 
1080 		tsoh_iph->payload_len = htons(ip_length - sizeof(*tsoh_iph));
1081 	}
1082 
1083 	st->packet_space = skb_shinfo(skb)->gso_size;
1084 	++tx_queue->tso_packets;
1085 
1086 	/* Form a descriptor for this header. */
1087 	efx_tso_put_header(tx_queue, tsoh, st->header_len);
1088 
1089 	return 0;
1090 }
1091 
1092 
1093 /**
1094  * efx_enqueue_skb_tso - segment and transmit a TSO socket buffer
1095  * @tx_queue:		Efx TX queue
1096  * @skb:		Socket buffer
1097  *
1098  * Context: You must hold netif_tx_lock() to call this function.
1099  *
1100  * Add socket buffer @skb to @tx_queue, performing TSO, or return non-zero
1101  * if @skb could not be enqueued.  In all cases @skb is consumed.  Return
1102  * %NETDEV_TX_OK or %NETDEV_TX_BUSY.
1103  */
1104 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
1105 			       struct sk_buff *skb)
1106 {
1107 	struct efx_nic *efx = tx_queue->efx;
1108 	int frag_i, rc, rc2 = NETDEV_TX_OK;
1109 	struct tso_state state;
1110 
1111 	/* Find the packet protocol and sanity-check it */
1112 	state.protocol = efx_tso_check_protocol(skb);
1113 
1114 	EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
1115 
1116 	tso_start(&state, skb);
1117 
1118 	/* Assume that skb header area contains exactly the headers, and
1119 	 * all payload is in the frag list.
1120 	 */
1121 	if (skb_headlen(skb) == state.header_len) {
1122 		/* Grab the first payload fragment. */
1123 		EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1);
1124 		frag_i = 0;
1125 		rc = tso_get_fragment(&state, efx,
1126 				      skb_shinfo(skb)->frags + frag_i);
1127 		if (rc)
1128 			goto mem_err;
1129 	} else {
1130 		rc = tso_get_head_fragment(&state, efx, skb);
1131 		if (rc)
1132 			goto mem_err;
1133 		frag_i = -1;
1134 	}
1135 
1136 	if (tso_start_new_packet(tx_queue, skb, &state) < 0)
1137 		goto mem_err;
1138 
1139 	while (1) {
1140 		rc = tso_fill_packet_with_fragment(tx_queue, skb, &state);
1141 		if (unlikely(rc)) {
1142 			rc2 = NETDEV_TX_BUSY;
1143 			goto unwind;
1144 		}
1145 
1146 		/* Move onto the next fragment? */
1147 		if (state.in_len == 0) {
1148 			if (++frag_i >= skb_shinfo(skb)->nr_frags)
1149 				/* End of payload reached. */
1150 				break;
1151 			rc = tso_get_fragment(&state, efx,
1152 					      skb_shinfo(skb)->frags + frag_i);
1153 			if (rc)
1154 				goto mem_err;
1155 		}
1156 
1157 		/* Start at new packet? */
1158 		if (state.packet_space == 0 &&
1159 		    tso_start_new_packet(tx_queue, skb, &state) < 0)
1160 			goto mem_err;
1161 	}
1162 
1163 	/* Pass off to hardware */
1164 	efx_nic_push_buffers(tx_queue);
1165 
1166 	tx_queue->tso_bursts++;
1167 	return NETDEV_TX_OK;
1168 
1169  mem_err:
1170 	netif_err(efx, tx_err, efx->net_dev,
1171 		  "Out of memory for TSO headers, or PCI mapping error\n");
1172 	dev_kfree_skb_any(skb);
1173 
1174  unwind:
1175 	/* Free the DMA mapping we were in the process of writing out */
1176 	if (state.unmap_len) {
1177 		if (state.unmap_single)
1178 			pci_unmap_single(efx->pci_dev, state.unmap_addr,
1179 					 state.unmap_len, PCI_DMA_TODEVICE);
1180 		else
1181 			pci_unmap_page(efx->pci_dev, state.unmap_addr,
1182 				       state.unmap_len, PCI_DMA_TODEVICE);
1183 	}
1184 
1185 	efx_enqueue_unwind(tx_queue);
1186 	return rc2;
1187 }
1188 
1189 
1190 /*
1191  * Free up all TSO data structures associated with tx_queue.  This
1192  * routine should be called only when the tx_queue is empty and
1193  * will no longer be used.
1194  */
1195 static void efx_fini_tso(struct efx_tx_queue *tx_queue)
1196 {
1197 	unsigned i;
1198 
1199 	if (tx_queue->buffer) {
1200 		for (i = 0; i <= tx_queue->ptr_mask; ++i)
1201 			efx_tsoh_free(tx_queue, &tx_queue->buffer[i]);
1202 	}
1203 
1204 	while (tx_queue->tso_headers_free != NULL)
1205 		efx_tsoh_block_free(tx_queue, tx_queue->tso_headers_free,
1206 				    tx_queue->efx->pci_dev);
1207 }
1208