xref: /freebsd/sys/dev/ixgbe/ix_txrx.c (revision 79b015331632b33f22bc8d8fef136c2672f2241f)
1 /******************************************************************************
2 
3   Copyright (c) 2001-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 
36 #ifndef IXGBE_STANDALONE_BUILD
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_rss.h"
40 #endif
41 
42 #include "ixgbe.h"
43 
44 #ifdef	RSS
45 #include <net/rss_config.h>
46 #include <netinet/in_rss.h>
47 #endif
48 
49 #ifdef DEV_NETMAP
50 #include <net/netmap.h>
51 #include <sys/selinfo.h>
52 #include <dev/netmap/netmap_kern.h>
53 
54 extern int ix_crcstrip;
55 #endif
56 
57 /*
58 ** HW RSC control:
59 **  this feature only works with
60 **  IPv4, and only on 82599 and later.
61 **  It will also cause IP forwarding to
62 **  fail, and unlike LRO that cannot be
63 **  controlled by the stack. For all these
64 **  reasons I've deemed it best to leave
65 **  this off and not bother with a tunable
66 **  interface; enabling it requires changing
67 **  this setting and recompiling.
68 */
69 static bool ixgbe_rsc_enable = FALSE;
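
/*
** Illustrative note (an assumption drawn from the comment above, not part
** of the original source): with no tunable interface, enabling HW RSC means
** editing the initializer above, e.g.
**
**	static bool ixgbe_rsc_enable = TRUE;
**
** and rebuilding the driver, subject to the IPv4/82599-and-later caveats.
*/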
70 
71 #ifdef IXGBE_FDIR
72 /*
73 ** For Flow Director: this is the
74 ** number of TX packets we sample
75 ** for the filter pool, this means
76 ** every 20th packet will be probed.
77 **
78 ** This feature can be disabled by
79 ** setting this to 0.
80 */
81 static int atr_sample_rate = 20;
82 #endif
83 
84 /* Shared PCI config read/write */
85 inline u16
86 ixgbe_read_pci_cfg(struct ixgbe_hw *hw, u32 reg)
87 {
88 	u16 value;
89 
90 	value = pci_read_config(((struct ixgbe_osdep *)hw->back)->dev,
91 	    reg, 2);
92 
93 	return (value);
94 }
95 
96 inline void
97 ixgbe_write_pci_cfg(struct ixgbe_hw *hw, u32 reg, u16 value)
98 {
99 	pci_write_config(((struct ixgbe_osdep *)hw->back)->dev,
100 	    reg, value, 2);
101 
102 	return;
103 }
104 
105 /*********************************************************************
106  *  Local Function prototypes
107  *********************************************************************/
108 static void	ixgbe_setup_transmit_ring(struct tx_ring *);
109 static void     ixgbe_free_transmit_buffers(struct tx_ring *);
110 static int	ixgbe_setup_receive_ring(struct rx_ring *);
111 static void     ixgbe_free_receive_buffers(struct rx_ring *);
112 
113 static void	ixgbe_rx_checksum(u32, struct mbuf *, u32);
114 static void	ixgbe_refresh_mbufs(struct rx_ring *, int);
115 static int      ixgbe_xmit(struct tx_ring *, struct mbuf **);
116 static int	ixgbe_tx_ctx_setup(struct tx_ring *,
117 		    struct mbuf *, u32 *, u32 *);
118 static int	ixgbe_tso_setup(struct tx_ring *,
119 		    struct mbuf *, u32 *, u32 *);
120 #ifdef IXGBE_FDIR
121 static void	ixgbe_atr(struct tx_ring *, struct mbuf *);
122 #endif
123 static __inline void ixgbe_rx_discard(struct rx_ring *, int);
124 static __inline void ixgbe_rx_input(struct rx_ring *, struct ifnet *,
125 		    struct mbuf *, u32);
126 
127 #ifdef IXGBE_LEGACY_TX
128 /*********************************************************************
129  *  Transmit entry point
130  *
131  *  ixgbe_start is called by the stack to initiate a transmit.
132  *  The driver will remain in this routine as long as there are
133  *  packets to transmit and transmit resources are available.
134  *  In case resources are not available, the stack is notified
135  *  and the packet is requeued.
136  **********************************************************************/
137 
138 void
139 ixgbe_start_locked(struct tx_ring *txr, struct ifnet * ifp)
140 {
141 	struct mbuf    *m_head;
142 	struct adapter *adapter = txr->adapter;
143 
144 	IXGBE_TX_LOCK_ASSERT(txr);
145 
146 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
147 		return;
148 	if (!adapter->link_active)
149 		return;
150 
151 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
152 		if (txr->tx_avail <= IXGBE_QUEUE_MIN_FREE)
153 			break;
154 
155 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
156 		if (m_head == NULL)
157 			break;
158 
159 		if (ixgbe_xmit(txr, &m_head)) {
160 			if (m_head != NULL)
161 				IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
162 			break;
163 		}
164 		/* Send a copy of the frame to the BPF listener */
165 		ETHER_BPF_MTAP(ifp, m_head);
166 	}
167 	return;
168 }
169 
170 /*
171  * Legacy TX start - called by the stack; this
172  * always uses the first tx ring, and should
173  * not be used with multiqueue tx enabled.
174  */
175 void
176 ixgbe_start(struct ifnet *ifp)
177 {
178 	struct adapter *adapter = ifp->if_softc;
179 	struct tx_ring	*txr = adapter->tx_rings;
180 
181 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
182 		IXGBE_TX_LOCK(txr);
183 		ixgbe_start_locked(txr, ifp);
184 		IXGBE_TX_UNLOCK(txr);
185 	}
186 	return;
187 }
188 
189 #else /* ! IXGBE_LEGACY_TX */
190 
191 /*
192 ** Multiqueue Transmit driver
193 **
194 */
195 int
196 ixgbe_mq_start(struct ifnet *ifp, struct mbuf *m)
197 {
198 	struct adapter	*adapter = ifp->if_softc;
199 	struct ix_queue	*que;
200 	struct tx_ring	*txr;
201 	int 		i, err = 0;
202 #ifdef	RSS
203 	uint32_t bucket_id;
204 #endif
205 
206 	/*
207 	 * When doing RSS, map it to the same outbound queue
208 	 * as the incoming flow would be mapped to.
209 	 *
210 	 * If everything is set up correctly, it should be the
211 	 * same bucket as the one assigned to the CPU we're on.
212 	 */
213 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
214 #ifdef	RSS
215 		if (rss_hash2bucket(m->m_pkthdr.flowid,
216 		    M_HASHTYPE_GET(m), &bucket_id) == 0)
217 			/* TODO: spit out something if bucket_id > num_queues? */
218 			i = bucket_id % adapter->num_queues;
219 		else
220 #endif
221 			i = m->m_pkthdr.flowid % adapter->num_queues;
222 	} else
223 		i = curcpu % adapter->num_queues;
224 
225 	/* Check for a hung queue and pick alternative */
226 	if (((1 << i) & adapter->active_queues) == 0)
227 		i = ffsl(adapter->active_queues);
228 
229 	txr = &adapter->tx_rings[i];
230 	que = &adapter->queues[i];
231 
232 	err = drbr_enqueue(ifp, txr->br, m);
233 	if (err)
234 		return (err);
235 	if (IXGBE_TX_TRYLOCK(txr)) {
236 		ixgbe_mq_start_locked(ifp, txr);
237 		IXGBE_TX_UNLOCK(txr);
238 	} else
239 		taskqueue_enqueue(que->tq, &txr->txq_task);
240 
241 	return (0);
242 }
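
/*
 * Worked example (illustrative only, values hypothetical): with
 * adapter->num_queues = 8, an mbuf whose RSS hash produced flowid 0x2b is
 * enqueued on TX queue 0x2b % 8 = 3 by the fallback path above, while a
 * hashless packet transmitted from CPU 10 lands on queue 10 % 8 = 2
 * (assuming the chosen queue is still marked active).
 */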
243 
244 int
245 ixgbe_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
246 {
247 	struct adapter  *adapter = txr->adapter;
248         struct mbuf     *next;
249         int             enqueued = 0, err = 0;
250 
251 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
252 	    adapter->link_active == 0)
253 		return (ENETDOWN);
254 
255 	/* Process the queue */
256 #if __FreeBSD_version < 901504
257 	next = drbr_dequeue(ifp, txr->br);
258 	while (next != NULL) {
259 		if ((err = ixgbe_xmit(txr, &next)) != 0) {
260 			if (next != NULL)
261 				err = drbr_enqueue(ifp, txr->br, next);
262 #else
263 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
264 		if ((err = ixgbe_xmit(txr, &next)) != 0) {
265 			if (next == NULL) {
266 				drbr_advance(ifp, txr->br);
267 			} else {
268 				drbr_putback(ifp, txr->br, next);
269 			}
270 #endif
271 			break;
272 		}
273 #if __FreeBSD_version >= 901504
274 		drbr_advance(ifp, txr->br);
275 #endif
276 		enqueued++;
277 #if 0 // this is VF-only
278 #if __FreeBSD_version >= 1100036
279 		/*
280 		 * Since we're looking at the tx ring, we can check
281 		 * to see if we're a VF by examining our tail register
282 		 * address.
283 		 */
284 		if (txr->tail < IXGBE_TDT(0) && next->m_flags & M_MCAST)
285 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
286 #endif
287 #endif
288 		/* Send a copy of the frame to the BPF listener */
289 		ETHER_BPF_MTAP(ifp, next);
290 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
291 			break;
292 #if __FreeBSD_version < 901504
293 		next = drbr_dequeue(ifp, txr->br);
294 #endif
295 	}
296 
297 	if (txr->tx_avail < IXGBE_TX_CLEANUP_THRESHOLD)
298 		ixgbe_txeof(txr);
299 
300 	return (err);
301 }
302 
303 /*
304  * Called from a taskqueue to drain queued transmit packets.
305  */
306 void
307 ixgbe_deferred_mq_start(void *arg, int pending)
308 {
309 	struct tx_ring *txr = arg;
310 	struct adapter *adapter = txr->adapter;
311 	struct ifnet *ifp = adapter->ifp;
312 
313 	IXGBE_TX_LOCK(txr);
314 	if (!drbr_empty(ifp, txr->br))
315 		ixgbe_mq_start_locked(ifp, txr);
316 	IXGBE_TX_UNLOCK(txr);
317 }
318 
319 /*
320  * Flush all ring buffers
321  */
322 void
323 ixgbe_qflush(struct ifnet *ifp)
324 {
325 	struct adapter	*adapter = ifp->if_softc;
326 	struct tx_ring	*txr = adapter->tx_rings;
327 	struct mbuf	*m;
328 
329 	for (int i = 0; i < adapter->num_queues; i++, txr++) {
330 		IXGBE_TX_LOCK(txr);
331 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
332 			m_freem(m);
333 		IXGBE_TX_UNLOCK(txr);
334 	}
335 	if_qflush(ifp);
336 }
337 #endif /* IXGBE_LEGACY_TX */
338 
339 
340 /*********************************************************************
341  *
342  *  This routine maps the mbufs to tx descriptors, allowing the
343  *  TX engine to transmit the packets.
344  *  	- return 0 on success, positive on failure
345  *
346  **********************************************************************/
347 
348 static int
349 ixgbe_xmit(struct tx_ring *txr, struct mbuf **m_headp)
350 {
351 	struct adapter  *adapter = txr->adapter;
352 	u32		olinfo_status = 0, cmd_type_len;
353 	int             i, j, error, nsegs;
354 	int		first;
355 	bool		remap = TRUE;
356 	struct mbuf	*m_head;
357 	bus_dma_segment_t segs[adapter->num_segs];
358 	bus_dmamap_t	map;
359 	struct ixgbe_tx_buf *txbuf;
360 	union ixgbe_adv_tx_desc *txd = NULL;
361 
362 	m_head = *m_headp;
363 
364 	/* Basic descriptor defines */
365         cmd_type_len = (IXGBE_ADVTXD_DTYP_DATA |
366 	    IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT);
367 
368 	if (m_head->m_flags & M_VLANTAG)
369         	cmd_type_len |= IXGBE_ADVTXD_DCMD_VLE;
370 
371         /*
372          * Important to capture the first descriptor
373          * used, because it will contain the index of
374          * the one we tell the hardware to report back on
375          */
376         first = txr->next_avail_desc;
377 	txbuf = &txr->tx_buffers[first];
378 	map = txbuf->map;
379 
380 	/*
381 	 * Map the packet for DMA.
382 	 */
383 retry:
384 	error = bus_dmamap_load_mbuf_sg(txr->txtag, map,
385 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
386 
387 	if (__predict_false(error)) {
388 		struct mbuf *m;
389 
390 		switch (error) {
391 		case EFBIG:
392 			/* Try it again? - one try */
393 			if (remap == TRUE) {
394 				remap = FALSE;
395 				/*
396 				 * XXX: m_defrag will choke on
397 				 * non-MCLBYTES-sized clusters
398 				 */
399 				m = m_defrag(*m_headp, M_NOWAIT);
400 				if (m == NULL) {
401 					adapter->mbuf_defrag_failed++;
402 					m_freem(*m_headp);
403 					*m_headp = NULL;
404 					return (ENOBUFS);
405 				}
406 				*m_headp = m;
407 				goto retry;
408 			} else
409 				return (error);
410 		case ENOMEM:
411 			txr->no_tx_dma_setup++;
412 			return (error);
413 		default:
414 			txr->no_tx_dma_setup++;
415 			m_freem(*m_headp);
416 			*m_headp = NULL;
417 			return (error);
418 		}
419 	}
420 
421 	/* Make certain there are enough descriptors */
422 	if (nsegs > txr->tx_avail - 2) {
423 		txr->no_desc_avail++;
424 		bus_dmamap_unload(txr->txtag, map);
425 		return (ENOBUFS);
426 	}
427 	m_head = *m_headp;
428 
429 	/*
430 	 * Set up the appropriate offload context;
431 	 * this will consume the first descriptor.
432 	 */
433 	error = ixgbe_tx_ctx_setup(txr, m_head, &cmd_type_len, &olinfo_status);
434 	if (__predict_false(error)) {
435 		if (error == ENOBUFS)
436 			*m_headp = NULL;
437 		return (error);
438 	}
439 
440 #ifdef IXGBE_FDIR
441 	/* Do the flow director magic */
442 	if ((txr->atr_sample) && (!adapter->fdir_reinit)) {
443 		++txr->atr_count;
444 		if (txr->atr_count >= atr_sample_rate) {
445 			ixgbe_atr(txr, m_head);
446 			txr->atr_count = 0;
447 		}
448 	}
449 #endif
450 
451 	i = txr->next_avail_desc;
452 	for (j = 0; j < nsegs; j++) {
453 		bus_size_t seglen;
454 		bus_addr_t segaddr;
455 
456 		txbuf = &txr->tx_buffers[i];
457 		txd = &txr->tx_base[i];
458 		seglen = segs[j].ds_len;
459 		segaddr = htole64(segs[j].ds_addr);
460 
461 		txd->read.buffer_addr = segaddr;
462 		txd->read.cmd_type_len = htole32(txr->txd_cmd |
463 		    cmd_type_len |seglen);
464 		txd->read.olinfo_status = htole32(olinfo_status);
465 
466 		if (++i == txr->num_desc)
467 			i = 0;
468 	}
469 
470 	txd->read.cmd_type_len |=
471 	    htole32(IXGBE_TXD_CMD_EOP | IXGBE_TXD_CMD_RS);
472 	txr->tx_avail -= nsegs;
473 	txr->next_avail_desc = i;
474 
475 	txbuf->m_head = m_head;
476 	/*
477 	 * Here we swap the map so the last descriptor,
478 	 * which gets the completion interrupt, has the
479 	 * real map, and the first descriptor gets the
480 	 * unused map from this descriptor.
481 	 */
482 	txr->tx_buffers[first].map = txbuf->map;
483 	txbuf->map = map;
484 	bus_dmamap_sync(txr->txtag, map, BUS_DMASYNC_PREWRITE);
485 
486         /* Set the EOP descriptor that will be marked done */
487         txbuf = &txr->tx_buffers[first];
488 	txbuf->eop = txd;
489 
490         bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
491             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
492 	/*
493 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
494 	 * hardware that this frame is available to transmit.
495 	 */
496 	++txr->total_packets;
497 	IXGBE_WRITE_REG(&adapter->hw, txr->tail, i);
498 
499 	/* Mark queue as having work */
500 	if (txr->busy == 0)
501 		txr->busy = 1;
502 
503 	return (0);
504 }
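
/*
 * Worked example (illustrative only): a 3-segment mbuf chain that needs an
 * offload context consumes 4 ring entries here: one context descriptor
 * (taken in ixgbe_tx_ctx_setup(), which also drops tx_avail by one) plus
 * three data descriptors, the last of which carries EOP|RS so that its DD
 * writeback lets ixgbe_txeof() reclaim the whole chain.
 */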
505 
506 
507 /*********************************************************************
508  *
509  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
510  *  the information needed to transmit a packet on the wire. This is
511  *  called only once at attach; setup is done on every reset.
512  *
513  **********************************************************************/
514 int
515 ixgbe_allocate_transmit_buffers(struct tx_ring *txr)
516 {
517 	struct adapter *adapter = txr->adapter;
518 	device_t dev = adapter->dev;
519 	struct ixgbe_tx_buf *txbuf;
520 	int error, i;
521 
522 	/*
523 	 * Setup DMA descriptor areas.
524 	 */
525 	if ((error = bus_dma_tag_create(
526 			       bus_get_dma_tag(adapter->dev),	/* parent */
527 			       1, 0,		/* alignment, bounds */
528 			       BUS_SPACE_MAXADDR,	/* lowaddr */
529 			       BUS_SPACE_MAXADDR,	/* highaddr */
530 			       NULL, NULL,		/* filter, filterarg */
531 			       IXGBE_TSO_SIZE,		/* maxsize */
532 			       adapter->num_segs,	/* nsegments */
533 			       PAGE_SIZE,		/* maxsegsize */
534 			       0,			/* flags */
535 			       NULL,			/* lockfunc */
536 			       NULL,			/* lockfuncarg */
537 			       &txr->txtag))) {
538 		device_printf(dev,"Unable to allocate TX DMA tag\n");
539 		goto fail;
540 	}
541 
542 	if (!(txr->tx_buffers =
543 	    (struct ixgbe_tx_buf *) malloc(sizeof(struct ixgbe_tx_buf) *
544 	    adapter->num_tx_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
545 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
546 		error = ENOMEM;
547 		goto fail;
548 	}
549 
550         /* Create the descriptor buffer dma maps */
551 	txbuf = txr->tx_buffers;
552 	for (i = 0; i < adapter->num_tx_desc; i++, txbuf++) {
553 		error = bus_dmamap_create(txr->txtag, 0, &txbuf->map);
554 		if (error != 0) {
555 			device_printf(dev, "Unable to create TX DMA map\n");
556 			goto fail;
557 		}
558 	}
559 
560 	return 0;
561 fail:
562 	/* Free everything; this handles the case where we failed partway through */
563 	ixgbe_free_transmit_structures(adapter);
564 	return (error);
565 }
566 
567 /*********************************************************************
568  *
569  *  Initialize a transmit ring.
570  *
571  **********************************************************************/
572 static void
573 ixgbe_setup_transmit_ring(struct tx_ring *txr)
574 {
575 	struct adapter *adapter = txr->adapter;
576 	struct ixgbe_tx_buf *txbuf;
577 #ifdef DEV_NETMAP
578 	struct netmap_adapter *na = NA(adapter->ifp);
579 	struct netmap_slot *slot;
580 #endif /* DEV_NETMAP */
581 
582 	/* Clear the old ring contents */
583 	IXGBE_TX_LOCK(txr);
584 #ifdef DEV_NETMAP
585 	/*
586 	 * (under lock): if in netmap mode, do some consistency
587 	 * checks and set slot to entry 0 of the netmap ring.
588 	 */
589 	slot = netmap_reset(na, NR_TX, txr->me, 0);
590 #endif /* DEV_NETMAP */
591 	bzero((void *)txr->tx_base,
592 	      (sizeof(union ixgbe_adv_tx_desc)) * adapter->num_tx_desc);
593 	/* Reset indices */
594 	txr->next_avail_desc = 0;
595 	txr->next_to_clean = 0;
596 
597 	/* Free any existing tx buffers. */
598         txbuf = txr->tx_buffers;
599 	for (int i = 0; i < txr->num_desc; i++, txbuf++) {
600 		if (txbuf->m_head != NULL) {
601 			bus_dmamap_sync(txr->txtag, txbuf->map,
602 			    BUS_DMASYNC_POSTWRITE);
603 			bus_dmamap_unload(txr->txtag, txbuf->map);
604 			m_freem(txbuf->m_head);
605 			txbuf->m_head = NULL;
606 		}
607 #ifdef DEV_NETMAP
608 		/*
609 		 * In netmap mode, set the map for the packet buffer.
610 		 * NOTE: Some drivers (not this one) also need to set
611 		 * the physical buffer address in the NIC ring.
612 		 * Slots in the netmap ring (indexed by "si") are
613 		 * kring->nkr_hwofs positions "ahead" wrt the
614 		 * corresponding slot in the NIC ring. In some drivers
615 		 * (not here) nkr_hwofs can be negative. Function
616 		 * netmap_idx_n2k() handles wraparounds properly.
617 		 */
618 		if (slot) {
619 			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
620 			netmap_load_map(na, txr->txtag,
621 			    txbuf->map, NMB(na, slot + si));
622 		}
623 #endif /* DEV_NETMAP */
624 		/* Clear the EOP descriptor pointer */
625 		txbuf->eop = NULL;
626         }
627 
628 #ifdef IXGBE_FDIR
629 	/* Set the rate at which we sample packets */
630 	if (adapter->hw.mac.type != ixgbe_mac_82598EB)
631 		txr->atr_sample = atr_sample_rate;
632 #endif
633 
634 	/* Set number of descriptors available */
635 	txr->tx_avail = adapter->num_tx_desc;
636 
637 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
638 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
639 	IXGBE_TX_UNLOCK(txr);
640 }
641 
642 /*********************************************************************
643  *
644  *  Initialize all transmit rings.
645  *
646  **********************************************************************/
647 int
648 ixgbe_setup_transmit_structures(struct adapter *adapter)
649 {
650 	struct tx_ring *txr = adapter->tx_rings;
651 
652 	for (int i = 0; i < adapter->num_queues; i++, txr++)
653 		ixgbe_setup_transmit_ring(txr);
654 
655 	return (0);
656 }
657 
658 /*********************************************************************
659  *
660  *  Free all transmit rings.
661  *
662  **********************************************************************/
663 void
664 ixgbe_free_transmit_structures(struct adapter *adapter)
665 {
666 	struct tx_ring *txr = adapter->tx_rings;
667 
668 	for (int i = 0; i < adapter->num_queues; i++, txr++) {
669 		IXGBE_TX_LOCK(txr);
670 		ixgbe_free_transmit_buffers(txr);
671 		ixgbe_dma_free(adapter, &txr->txdma);
672 		IXGBE_TX_UNLOCK(txr);
673 		IXGBE_TX_LOCK_DESTROY(txr);
674 	}
675 	free(adapter->tx_rings, M_DEVBUF);
676 }
677 
678 /*********************************************************************
679  *
680  *  Free transmit ring related data structures.
681  *
682  **********************************************************************/
683 static void
684 ixgbe_free_transmit_buffers(struct tx_ring *txr)
685 {
686 	struct adapter *adapter = txr->adapter;
687 	struct ixgbe_tx_buf *tx_buffer;
688 	int             i;
689 
690 	INIT_DEBUGOUT("ixgbe_free_transmit_ring: begin");
691 
692 	if (txr->tx_buffers == NULL)
693 		return;
694 
695 	tx_buffer = txr->tx_buffers;
696 	for (i = 0; i < adapter->num_tx_desc; i++, tx_buffer++) {
697 		if (tx_buffer->m_head != NULL) {
698 			bus_dmamap_sync(txr->txtag, tx_buffer->map,
699 			    BUS_DMASYNC_POSTWRITE);
700 			bus_dmamap_unload(txr->txtag,
701 			    tx_buffer->map);
702 			m_freem(tx_buffer->m_head);
703 			tx_buffer->m_head = NULL;
704 			if (tx_buffer->map != NULL) {
705 				bus_dmamap_destroy(txr->txtag,
706 				    tx_buffer->map);
707 				tx_buffer->map = NULL;
708 			}
709 		} else if (tx_buffer->map != NULL) {
710 			bus_dmamap_unload(txr->txtag,
711 			    tx_buffer->map);
712 			bus_dmamap_destroy(txr->txtag,
713 			    tx_buffer->map);
714 			tx_buffer->map = NULL;
715 		}
716 	}
717 #ifdef IXGBE_LEGACY_TX
718 	if (txr->br != NULL)
719 		buf_ring_free(txr->br, M_DEVBUF);
720 #endif
721 	if (txr->tx_buffers != NULL) {
722 		free(txr->tx_buffers, M_DEVBUF);
723 		txr->tx_buffers = NULL;
724 	}
725 	if (txr->txtag != NULL) {
726 		bus_dma_tag_destroy(txr->txtag);
727 		txr->txtag = NULL;
728 	}
729 	return;
730 }
731 
732 /*********************************************************************
733  *
734  *  Advanced Context Descriptor setup for VLAN, CSUM or TSO
735  *
736  **********************************************************************/
737 
738 static int
739 ixgbe_tx_ctx_setup(struct tx_ring *txr, struct mbuf *mp,
740     u32 *cmd_type_len, u32 *olinfo_status)
741 {
742 	struct adapter *adapter = txr->adapter;
743 	struct ixgbe_adv_tx_context_desc *TXD;
744 	struct ether_vlan_header *eh;
745 	struct ip *ip;
746 	struct ip6_hdr *ip6;
747 	u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0;
748 	int	ehdrlen, ip_hlen = 0;
749 	u16	etype;
750 	u8	ipproto = 0;
751 	int	offload = TRUE;
752 	int	ctxd = txr->next_avail_desc;
753 	u16	vtag = 0;
754 
755 	/* First check if TSO is to be used */
756 	if (mp->m_pkthdr.csum_flags & CSUM_TSO)
757 		return (ixgbe_tso_setup(txr, mp, cmd_type_len, olinfo_status));
758 
759 	if ((mp->m_pkthdr.csum_flags & CSUM_OFFLOAD) == 0)
760 		offload = FALSE;
761 
762 	/* Indicate the whole packet as payload when not doing TSO */
763        	*olinfo_status |= mp->m_pkthdr.len << IXGBE_ADVTXD_PAYLEN_SHIFT;
764 
765 	/* Now ready a context descriptor */
766 	TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd];
767 
768 	/*
769 	** In advanced descriptors the vlan tag must
770 	** be placed into the context descriptor. Hence
771 	** we need to make one even if not doing offloads.
772 	*/
773 	if (mp->m_flags & M_VLANTAG) {
774 		vtag = htole16(mp->m_pkthdr.ether_vtag);
775 		vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT);
776 	} else if (!IXGBE_IS_X550VF(adapter) && (offload == FALSE))
777 		return (0);
778 
779 	/*
780 	 * Determine where frame payload starts.
781 	 * Jump over vlan headers if already present,
782 	 * helpful for QinQ too.
783 	 */
784 	eh = mtod(mp, struct ether_vlan_header *);
785 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
786 		etype = ntohs(eh->evl_proto);
787 		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
788 	} else {
789 		etype = ntohs(eh->evl_encap_proto);
790 		ehdrlen = ETHER_HDR_LEN;
791 	}
792 
793 	/* Set the ether header length */
794 	vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT;
795 
796 	if (offload == FALSE)
797 		goto no_offloads;
798 
799 	switch (etype) {
800 		case ETHERTYPE_IP:
801 			ip = (struct ip *)(mp->m_data + ehdrlen);
802 			ip_hlen = ip->ip_hl << 2;
803 			ipproto = ip->ip_p;
804 			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
805 			break;
806 		case ETHERTYPE_IPV6:
807 			ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen);
808 			ip_hlen = sizeof(struct ip6_hdr);
809 			/* XXX-BZ this will go badly in case of ext hdrs. */
810 			ipproto = ip6->ip6_nxt;
811 			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
812 			break;
813 		default:
814 			offload = FALSE;
815 			break;
816 	}
817 
818 	vlan_macip_lens |= ip_hlen;
819 
820 	switch (ipproto) {
821 		case IPPROTO_TCP:
822 			if (mp->m_pkthdr.csum_flags & CSUM_TCP)
823 				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
824 			break;
825 
826 		case IPPROTO_UDP:
827 			if (mp->m_pkthdr.csum_flags & CSUM_UDP)
828 				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP;
829 			break;
830 
831 #if __FreeBSD_version >= 800000
832 		case IPPROTO_SCTP:
833 			if (mp->m_pkthdr.csum_flags & CSUM_SCTP)
834 				type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP;
835 			break;
836 #endif
837 		default:
838 			offload = FALSE;
839 			break;
840 	}
841 
842 	if (offload) /* For the TX descriptor setup */
843 		*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
844 
845 no_offloads:
846 	type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
847 
848 	/* Now copy bits into descriptor */
849 	TXD->vlan_macip_lens = htole32(vlan_macip_lens);
850 	TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
851 	TXD->seqnum_seed = htole32(0);
852 	TXD->mss_l4len_idx = htole32(0);
853 
854 	/* We've consumed the first desc, adjust counters */
855 	if (++ctxd == txr->num_desc)
856 		ctxd = 0;
857 	txr->next_avail_desc = ctxd;
858 	--txr->tx_avail;
859 
860         return (0);
861 }
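
/*
 * Worked example (illustrative only): for an untagged IPv4/TCP frame with a
 * 14-byte ethernet header and a 20-byte IP header, and CSUM_TCP requested,
 * the context descriptor built above ends up with
 *
 *	vlan_macip_lens = (14 << IXGBE_ADVTXD_MACLEN_SHIFT) | 20;
 *	type_tucmd_mlhl = IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT |
 *	    IXGBE_ADVTXD_TUCMD_IPV4 | IXGBE_ADVTXD_TUCMD_L4T_TCP;
 *
 * and the caller's olinfo_status gains IXGBE_TXD_POPTS_TXSM << 8.
 */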
862 
863 /**********************************************************************
864  *
865  *  Setup work for hardware segmentation offload (TSO) on
866  *  adapters using advanced tx descriptors
867  *
868  **********************************************************************/
869 static int
870 ixgbe_tso_setup(struct tx_ring *txr, struct mbuf *mp,
871     u32 *cmd_type_len, u32 *olinfo_status)
872 {
873 	struct ixgbe_adv_tx_context_desc *TXD;
874 	u32 vlan_macip_lens = 0, type_tucmd_mlhl = 0;
875 	u32 mss_l4len_idx = 0, paylen;
876 	u16 vtag = 0, eh_type;
877 	int ctxd, ehdrlen, ip_hlen, tcp_hlen;
878 	struct ether_vlan_header *eh;
879 #ifdef INET6
880 	struct ip6_hdr *ip6;
881 #endif
882 #ifdef INET
883 	struct ip *ip;
884 #endif
885 	struct tcphdr *th;
886 
887 
888 	/*
889 	 * Determine where frame payload starts.
890 	 * Jump over vlan headers if already present
891 	 */
892 	eh = mtod(mp, struct ether_vlan_header *);
893 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
894 		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
895 		eh_type = eh->evl_proto;
896 	} else {
897 		ehdrlen = ETHER_HDR_LEN;
898 		eh_type = eh->evl_encap_proto;
899 	}
900 
901 	switch (ntohs(eh_type)) {
902 #ifdef INET6
903 	case ETHERTYPE_IPV6:
904 		ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen);
905 		/* XXX-BZ For now we do not pretend to support ext. hdrs. */
906 		if (ip6->ip6_nxt != IPPROTO_TCP)
907 			return (ENXIO);
908 		ip_hlen = sizeof(struct ip6_hdr);
909 		ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen);
910 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
911 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
912 		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
913 		break;
914 #endif
915 #ifdef INET
916 	case ETHERTYPE_IP:
917 		ip = (struct ip *)(mp->m_data + ehdrlen);
918 		if (ip->ip_p != IPPROTO_TCP)
919 			return (ENXIO);
920 		ip->ip_sum = 0;
921 		ip_hlen = ip->ip_hl << 2;
922 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
923 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
924 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
925 		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
926 		/* Tell transmit desc to also do IPv4 checksum. */
927 		*olinfo_status |= IXGBE_TXD_POPTS_IXSM << 8;
928 		break;
929 #endif
930 	default:
931 		panic("%s: CSUM_TSO but no supported IP version (0x%04x)",
932 		    __func__, ntohs(eh_type));
933 		break;
934 	}
935 
936 	ctxd = txr->next_avail_desc;
937 	TXD = (struct ixgbe_adv_tx_context_desc *) &txr->tx_base[ctxd];
938 
939 	tcp_hlen = th->th_off << 2;
940 
941 	/* This is used in the transmit desc in encap */
942 	paylen = mp->m_pkthdr.len - ehdrlen - ip_hlen - tcp_hlen;
943 
944 	/* VLAN MACLEN IPLEN */
945 	if (mp->m_flags & M_VLANTAG) {
946 		vtag = htole16(mp->m_pkthdr.ether_vtag);
947                 vlan_macip_lens |= (vtag << IXGBE_ADVTXD_VLAN_SHIFT);
948 	}
949 
950 	vlan_macip_lens |= ehdrlen << IXGBE_ADVTXD_MACLEN_SHIFT;
951 	vlan_macip_lens |= ip_hlen;
952 	TXD->vlan_macip_lens = htole32(vlan_macip_lens);
953 
954 	/* ADV DTYPE TUCMD */
955 	type_tucmd_mlhl |= IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
956 	type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
957 	TXD->type_tucmd_mlhl = htole32(type_tucmd_mlhl);
958 
959 	/* MSS L4LEN IDX */
960 	mss_l4len_idx |= (mp->m_pkthdr.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT);
961 	mss_l4len_idx |= (tcp_hlen << IXGBE_ADVTXD_L4LEN_SHIFT);
962 	TXD->mss_l4len_idx = htole32(mss_l4len_idx);
963 
964 	TXD->seqnum_seed = htole32(0);
965 
966 	if (++ctxd == txr->num_desc)
967 		ctxd = 0;
968 
969 	txr->tx_avail--;
970 	txr->next_avail_desc = ctxd;
971 	*cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
972 	*olinfo_status |= IXGBE_TXD_POPTS_TXSM << 8;
973 	*olinfo_status |= paylen << IXGBE_ADVTXD_PAYLEN_SHIFT;
974 	++txr->tso_tx;
975 	return (0);
976 }
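
/*
 * Worked example (illustrative only, header sizes hypothetical): a TSO
 * chain with ehdrlen = 14, ip_hlen = 20, tcp_hlen = 20 and tso_segsz = 1448
 * reports paylen = m_pkthdr.len - 54 (TCP payload bytes only) in
 * olinfo_status, and packs
 *
 *	mss_l4len_idx = (1448 << IXGBE_ADVTXD_MSS_SHIFT) |
 *	    (20 << IXGBE_ADVTXD_L4LEN_SHIFT);
 *
 * the hardware then regenerates the headers for each 1448-byte segment.
 */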
977 
978 
979 /**********************************************************************
980  *
981  *  Examine each tx_buffer in the used queue. If the hardware is done
982  *  processing the packet then free associated resources. The
983  *  tx_buffer is put back on the free queue.
984  *
985  **********************************************************************/
986 void
987 ixgbe_txeof(struct tx_ring *txr)
988 {
989 #ifdef DEV_NETMAP
990 	struct adapter		*adapter = txr->adapter;
991 	struct ifnet		*ifp = adapter->ifp;
992 #endif
993 	u32			work, processed = 0;
994 	u16			limit = txr->process_limit;
995 	struct ixgbe_tx_buf	*buf;
996 	union ixgbe_adv_tx_desc *txd;
997 
998 	mtx_assert(&txr->tx_mtx, MA_OWNED);
999 
1000 #ifdef DEV_NETMAP
1001 	if (ifp->if_capenable & IFCAP_NETMAP) {
1002 		struct netmap_adapter *na = NA(ifp);
1003 		struct netmap_kring *kring = &na->tx_rings[txr->me];
1004 		txd = txr->tx_base;
1005 		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
1006 		    BUS_DMASYNC_POSTREAD);
1007 		/*
1008 		 * In netmap mode, all the work is done in the context
1009 		 * of the client thread. Interrupt handlers only wake up
1010 		 * clients, which may be sleeping on individual rings
1011 		 * or on a global resource for all rings.
1012 		 * To implement tx interrupt mitigation, we wake up the client
1013 		 * thread roughly every half ring, even if the NIC interrupts
1014 		 * more frequently. This is implemented as follows:
1015 		 * - ixgbe_txsync() sets kring->nr_kflags with the index of
1016 		 *   the slot that should wake up the thread (nkr_num_slots
1017 		 *   means the user thread should not be woken up);
1018 		 * - the driver ignores tx interrupts unless netmap_mitigate=0
1019 		 *   or the slot has the DD bit set.
1020 		 */
1021 		if (!netmap_mitigate ||
1022 		    (kring->nr_kflags < kring->nkr_num_slots &&
1023 		    txd[kring->nr_kflags].wb.status & IXGBE_TXD_STAT_DD)) {
1024 			netmap_tx_irq(ifp, txr->me);
1025 		}
1026 		return;
1027 	}
1028 #endif /* DEV_NETMAP */
1029 
1030 	if (txr->tx_avail == txr->num_desc) {
1031 		txr->busy = 0;
1032 		return;
1033 	}
1034 
1035 	/* Get work starting point */
1036 	work = txr->next_to_clean;
1037 	buf = &txr->tx_buffers[work];
1038 	txd = &txr->tx_base[work];
1039 	work -= txr->num_desc; /* The distance to ring end */
1040         bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
1041             BUS_DMASYNC_POSTREAD);
1042 
1043 	do {
1044 		union ixgbe_adv_tx_desc *eop= buf->eop;
1045 		if (eop == NULL) /* No work */
1046 			break;
1047 
1048 		if ((eop->wb.status & IXGBE_TXD_STAT_DD) == 0)
1049 			break;	/* I/O not complete */
1050 
1051 		if (buf->m_head) {
1052 			txr->bytes +=
1053 			    buf->m_head->m_pkthdr.len;
1054 			bus_dmamap_sync(txr->txtag,
1055 			    buf->map,
1056 			    BUS_DMASYNC_POSTWRITE);
1057 			bus_dmamap_unload(txr->txtag,
1058 			    buf->map);
1059 			m_freem(buf->m_head);
1060 			buf->m_head = NULL;
1061 		}
1062 		buf->eop = NULL;
1063 		++txr->tx_avail;
1064 
1065 		/* We clean the range if multi segment */
1066 		while (txd != eop) {
1067 			++txd;
1068 			++buf;
1069 			++work;
1070 			/* wrap the ring? */
1071 			if (__predict_false(!work)) {
1072 				work -= txr->num_desc;
1073 				buf = txr->tx_buffers;
1074 				txd = txr->tx_base;
1075 			}
1076 			if (buf->m_head) {
1077 				txr->bytes +=
1078 				    buf->m_head->m_pkthdr.len;
1079 				bus_dmamap_sync(txr->txtag,
1080 				    buf->map,
1081 				    BUS_DMASYNC_POSTWRITE);
1082 				bus_dmamap_unload(txr->txtag,
1083 				    buf->map);
1084 				m_freem(buf->m_head);
1085 				buf->m_head = NULL;
1086 			}
1087 			++txr->tx_avail;
1088 			buf->eop = NULL;
1089 
1090 		}
1091 		++txr->packets;
1092 		++processed;
1093 
1094 		/* Try the next packet */
1095 		++txd;
1096 		++buf;
1097 		++work;
1098 		/* reset with a wrap */
1099 		if (__predict_false(!work)) {
1100 			work -= txr->num_desc;
1101 			buf = txr->tx_buffers;
1102 			txd = txr->tx_base;
1103 		}
1104 		prefetch(txd);
1105 	} while (__predict_true(--limit));
1106 
1107 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
1108 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1109 
1110 	work += txr->num_desc;
1111 	txr->next_to_clean = work;
1112 
1113 	/*
1114 	** Queue hang detection: we know there's
1115 	** work outstanding or the early return
1116 	** above would have been taken, so increment
1117 	** busy if nothing managed to get cleaned;
1118 	** local_timer will then check it and mark the
1119 	** queue HUNG if it exceeds the maximum attempts.
1120 	*/
1121 	if ((processed == 0) && (txr->busy != IXGBE_QUEUE_HUNG))
1122 		++txr->busy;
1123 	/*
1124 	** If anything gets cleaned we reset the state to 1;
1125 	** note this will clear HUNG if it's set.
1126 	*/
1127 	if (processed)
1128 		txr->busy = 1;
1129 
1130 	if (txr->tx_avail == txr->num_desc)
1131 		txr->busy = 0;
1132 
1133 	return;
1134 }
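
/*
 * Illustrative sketch only (never compiled): the ring-walk idiom used in
 * ixgbe_txeof() above.  'work' is biased by -num_desc so it hits zero
 * exactly when the walk steps past the end of the ring, making the wrap
 * test a simple "!work"; the helper name is hypothetical.
 */
#if 0
static int
ix_ring_walk_example(int start, int num_desc, int steps)
{
	int work = start - num_desc;	/* negative distance to ring end */

	while (steps-- > 0) {
		++work;
		if (work == 0)		/* stepped past the last slot: wrap */
			work -= num_desc;
	}
	return (work + num_desc);	/* back to a 0..num_desc-1 index */
}
#endif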
1135 
1136 
1137 #ifdef IXGBE_FDIR
1138 /*
1139 ** This routine parses packet headers so that Flow
1140 ** Director can make a hashed filter table entry
1141 ** allowing traffic flows to be identified and kept
1142 ** on the same cpu.  Doing this for every packet
1143 ** would be a performance hit, so we only do it
1144 ** on a sampled fraction (IXGBE_FDIR_RATE) of packets.
1145 */
1146 static void
1147 ixgbe_atr(struct tx_ring *txr, struct mbuf *mp)
1148 {
1149 	struct adapter			*adapter = txr->adapter;
1150 	struct ix_queue			*que;
1151 	struct ip			*ip;
1152 	struct tcphdr			*th;
1153 	struct udphdr			*uh;
1154 	struct ether_vlan_header	*eh;
1155 	union ixgbe_atr_hash_dword	input = {.dword = 0};
1156 	union ixgbe_atr_hash_dword	common = {.dword = 0};
1157 	int  				ehdrlen, ip_hlen;
1158 	u16				etype;
1159 
1160 	eh = mtod(mp, struct ether_vlan_header *);
1161 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1162 		ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1163 		etype = eh->evl_proto;
1164 	} else {
1165 		ehdrlen = ETHER_HDR_LEN;
1166 		etype = eh->evl_encap_proto;
1167 	}
1168 
1169 	/* Only handling IPv4 */
1170 	if (etype != htons(ETHERTYPE_IP))
1171 		return;
1172 
1173 	ip = (struct ip *)(mp->m_data + ehdrlen);
1174 	ip_hlen = ip->ip_hl << 2;
1175 
1176 	/* check if we're UDP or TCP */
1177 	switch (ip->ip_p) {
1178 	case IPPROTO_TCP:
1179 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
1180 		/* src and dst are inverted */
1181 		common.port.dst ^= th->th_sport;
1182 		common.port.src ^= th->th_dport;
1183 		input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_TCPV4;
1184 		break;
1185 	case IPPROTO_UDP:
1186 		uh = (struct udphdr *)((caddr_t)ip + ip_hlen);
1187 		/* src and dst are inverted */
1188 		common.port.dst ^= uh->uh_sport;
1189 		common.port.src ^= uh->uh_dport;
1190 		input.formatted.flow_type ^= IXGBE_ATR_FLOW_TYPE_UDPV4;
1191 		break;
1192 	default:
1193 		return;
1194 	}
1195 
1196 	input.formatted.vlan_id = htobe16(mp->m_pkthdr.ether_vtag);
1197 	if (mp->m_pkthdr.ether_vtag)
1198 		common.flex_bytes ^= htons(ETHERTYPE_VLAN);
1199 	else
1200 		common.flex_bytes ^= etype;
1201 	common.ip ^= ip->ip_src.s_addr ^ ip->ip_dst.s_addr;
1202 
1203 	que = &adapter->queues[txr->me];
1204 	/*
1205 	** This assumes the Rx queue and Tx
1206 	** queue are bound to the same CPU
1207 	*/
1208 	ixgbe_fdir_add_signature_filter_82599(&adapter->hw,
1209 	    input, common, que->msix);
1210 }
1211 #endif /* IXGBE_FDIR */
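
/*
 * Worked example (illustrative only, addresses hypothetical): for a locally
 * originated TCP flow 10.0.0.1:49152 -> 10.0.0.2:80 transmitted on queue 3,
 * ixgbe_atr() builds the signature from the swapped ports (common.port.dst
 * from the source port, common.port.src from the destination port), the XOR
 * of the two IPv4 addresses, and flow type TCPV4, then asks the hardware to
 * steer the matching return traffic to queue 3's MSI-X vector.
 */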
1212 
1213 /*
1214 ** Used to detect a descriptor that has
1215 ** been merged by Hardware RSC.
1216 */
1217 static inline u32
1218 ixgbe_rsc_count(union ixgbe_adv_rx_desc *rx)
1219 {
1220 	return (le32toh(rx->wb.lower.lo_dword.data) &
1221 	    IXGBE_RXDADV_RSCCNT_MASK) >> IXGBE_RXDADV_RSCCNT_SHIFT;
1222 }
1223 
1224 /*********************************************************************
1225  *
1226  *  Initialize the Hardware RSC (LRO) feature on 82599
1227  *  for an RX ring; it is toggled by the LRO capability
1228  *  even though it is transparent to the stack.
1229  *
1230  *  NOTE: since this HW feature only works with IPv4, and
1231  *        our testing has shown soft LRO to be as effective,
1232  *        I have decided to disable this by default.
1233  *
1234  **********************************************************************/
1235 static void
1236 ixgbe_setup_hw_rsc(struct rx_ring *rxr)
1237 {
1238 	struct	adapter 	*adapter = rxr->adapter;
1239 	struct	ixgbe_hw	*hw = &adapter->hw;
1240 	u32			rscctrl, rdrxctl;
1241 
1242 	/* If turning LRO/RSC off, we need to disable it in the hardware */
1243 	if ((adapter->ifp->if_capenable & IFCAP_LRO) == 0) {
1244 		rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me));
1245 		rscctrl &= ~IXGBE_RSCCTL_RSCEN;
		IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxr->me), rscctrl);
1246 		return;
1247 	}
1248 
1249 	rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
1250 	rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
1251 #ifdef DEV_NETMAP /* crcstrip is optional in netmap */
1252 	if (adapter->ifp->if_capenable & IFCAP_NETMAP && !ix_crcstrip)
1253 #endif /* DEV_NETMAP */
1254 	rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
1255 	rdrxctl |= IXGBE_RDRXCTL_RSCACKC;
1256 	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
1257 
1258 	rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxr->me));
1259 	rscctrl |= IXGBE_RSCCTL_RSCEN;
1260 	/*
1261 	** Limit the total number of descriptors that
1262 	** can be combined, so it does not exceed 64K
1263 	*/
1264 	if (rxr->mbuf_sz == MCLBYTES)
1265 		rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
1266 	else if (rxr->mbuf_sz == MJUMPAGESIZE)
1267 		rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
1268 	else if (rxr->mbuf_sz == MJUM9BYTES)
1269 		rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
1270 	else  /* Using 16K cluster */
1271 		rscctrl |= IXGBE_RSCCTL_MAXDESC_1;
1272 
1273 	IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxr->me), rscctrl);
1274 
1275 	/* Enable TCP header recognition */
1276 	IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0),
1277 	    (IXGBE_READ_REG(hw, IXGBE_PSRTYPE(0)) |
1278 	    IXGBE_PSRTYPE_TCPHDR));
1279 
1280 	/* Disable RSC for ACK packets */
1281 	IXGBE_WRITE_REG(hw, IXGBE_RSCDBU,
1282 	    (IXGBE_RSCDBU_RSCACKDIS | IXGBE_READ_REG(hw, IXGBE_RSCDBU)));
1283 
1284 	rxr->hw_rsc = TRUE;
1285 }
1286 /*********************************************************************
1287  *
1288  *  Refresh mbuf buffers for RX descriptor rings
1289  *   - now keeps its own state, so discards due to resource
1290  *     exhaustion are unnecessary; if an mbuf cannot be obtained
1291  *     it just returns, keeping its placeholder, so it can simply
1292  *     be called again later to retry.
1293  *
1294  **********************************************************************/
1295 static void
1296 ixgbe_refresh_mbufs(struct rx_ring *rxr, int limit)
1297 {
1298 	struct adapter		*adapter = rxr->adapter;
1299 	bus_dma_segment_t	seg[1];
1300 	struct ixgbe_rx_buf	*rxbuf;
1301 	struct mbuf		*mp;
1302 	int			i, j, nsegs, error;
1303 	bool			refreshed = FALSE;
1304 
1305 	i = j = rxr->next_to_refresh;
1306 	/* Control the loop with one beyond */
1307 	if (++j == rxr->num_desc)
1308 		j = 0;
1309 
1310 	while (j != limit) {
1311 		rxbuf = &rxr->rx_buffers[i];
1312 		if (rxbuf->buf == NULL) {
1313 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1314 			    M_PKTHDR, rxr->mbuf_sz);
1315 			if (mp == NULL)
1316 				goto update;
1317 			if (adapter->max_frame_size <= (MCLBYTES - ETHER_ALIGN))
1318 				m_adj(mp, ETHER_ALIGN);
1319 		} else
1320 			mp = rxbuf->buf;
1321 
1322 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1323 
1324 		/* If we're dealing with an mbuf that was copied rather
1325 		 * than replaced, there's no need to go through busdma.
1326 		 */
1327 		if ((rxbuf->flags & IXGBE_RX_COPY) == 0) {
1328 			/* Get the memory mapping */
1329 			bus_dmamap_unload(rxr->ptag, rxbuf->pmap);
1330 			error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1331 			    rxbuf->pmap, mp, seg, &nsegs, BUS_DMA_NOWAIT);
1332 			if (error != 0) {
1333 				printf("Refresh mbufs: payload dmamap load"
1334 				    " failure - %d\n", error);
1335 				m_free(mp);
1336 				rxbuf->buf = NULL;
1337 				goto update;
1338 			}
1339 			rxbuf->buf = mp;
1340 			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
1341 			    BUS_DMASYNC_PREREAD);
1342 			rxbuf->addr = rxr->rx_base[i].read.pkt_addr =
1343 			    htole64(seg[0].ds_addr);
1344 		} else {
1345 			rxr->rx_base[i].read.pkt_addr = rxbuf->addr;
1346 			rxbuf->flags &= ~IXGBE_RX_COPY;
1347 		}
1348 
1349 		refreshed = TRUE;
1350 		/* Next is precalculated */
1351 		i = j;
1352 		rxr->next_to_refresh = i;
1353 		if (++j == rxr->num_desc)
1354 			j = 0;
1355 	}
1356 update:
1357 	if (refreshed) /* Update hardware tail index */
1358 		IXGBE_WRITE_REG(&adapter->hw,
1359 		    rxr->tail, rxr->next_to_refresh);
1360 	return;
1361 }
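
/*
 * Illustrative sketch only (never compiled): the "one beyond" loop control
 * used in ixgbe_refresh_mbufs() above.  'nxt' runs one slot ahead of the
 * slot being refilled, so the walk stops one short of 'limit' and the
 * producer can never lap the consumer; the helper name is hypothetical.
 */
#if 0
static int
ix_refresh_walk_example(int next_to_refresh, int num_desc, int limit)
{
	int cur = next_to_refresh;
	int nxt = cur;

	if (++nxt == num_desc)
		nxt = 0;
	while (nxt != limit) {
		/* ... refill slot 'cur' here ... */
		cur = nxt;
		if (++nxt == num_desc)
			nxt = 0;
	}
	return (cur);		/* new next_to_refresh / tail value */
}
#endif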
1362 
1363 /*********************************************************************
1364  *
1365  *  Allocate memory for rx_buffer structures. Since we use one
1366  *  rx_buffer per received packet, the maximum number of rx_buffers
1367  *  that we'll need is equal to the number of receive descriptors
1368  *  that we've allocated.
1369  *
1370  **********************************************************************/
1371 int
1372 ixgbe_allocate_receive_buffers(struct rx_ring *rxr)
1373 {
1374 	struct	adapter 	*adapter = rxr->adapter;
1375 	device_t 		dev = adapter->dev;
1376 	struct ixgbe_rx_buf 	*rxbuf;
1377 	int             	bsize, error;
1378 
1379 	bsize = sizeof(struct ixgbe_rx_buf) * rxr->num_desc;
1380 	if (!(rxr->rx_buffers =
1381 	    (struct ixgbe_rx_buf *) malloc(bsize,
1382 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1383 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1384 		error = ENOMEM;
1385 		goto fail;
1386 	}
1387 
1388 	if ((error = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
1389 				   1, 0,	/* alignment, bounds */
1390 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1391 				   BUS_SPACE_MAXADDR,	/* highaddr */
1392 				   NULL, NULL,		/* filter, filterarg */
1393 				   MJUM16BYTES,		/* maxsize */
1394 				   1,			/* nsegments */
1395 				   MJUM16BYTES,		/* maxsegsize */
1396 				   0,			/* flags */
1397 				   NULL,		/* lockfunc */
1398 				   NULL,		/* lockfuncarg */
1399 				   &rxr->ptag))) {
1400 		device_printf(dev, "Unable to create RX DMA tag\n");
1401 		goto fail;
1402 	}
1403 
1404 	for (int i = 0; i < rxr->num_desc; i++, rxbuf++) {
1405 		rxbuf = &rxr->rx_buffers[i];
1406 		error = bus_dmamap_create(rxr->ptag, 0, &rxbuf->pmap);
1407 		if (error) {
1408 			device_printf(dev, "Unable to create RX dma map\n");
1409 			goto fail;
1410 		}
1411 	}
1412 
1413 	return (0);
1414 
1415 fail:
1416 	/* Frees all, but can handle partial completion */
1417 	ixgbe_free_receive_structures(adapter);
1418 	return (error);
1419 }
1420 
1421 
1422 static void
1423 ixgbe_free_receive_ring(struct rx_ring *rxr)
1424 {
1425 	struct ixgbe_rx_buf       *rxbuf;
1426 
1427 	for (int i = 0; i < rxr->num_desc; i++) {
1428 		rxbuf = &rxr->rx_buffers[i];
1429 		if (rxbuf->buf != NULL) {
1430 			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
1431 			    BUS_DMASYNC_POSTREAD);
1432 			bus_dmamap_unload(rxr->ptag, rxbuf->pmap);
1433 			rxbuf->buf->m_flags |= M_PKTHDR;
1434 			m_freem(rxbuf->buf);
1435 			rxbuf->buf = NULL;
1436 			rxbuf->flags = 0;
1437 		}
1438 	}
1439 }
1440 
1441 
1442 /*********************************************************************
1443  *
1444  *  Initialize a receive ring and its buffers.
1445  *
1446  **********************************************************************/
1447 static int
1448 ixgbe_setup_receive_ring(struct rx_ring *rxr)
1449 {
1450 	struct	adapter 	*adapter;
1451 	struct ifnet		*ifp;
1452 	device_t		dev;
1453 	struct ixgbe_rx_buf	*rxbuf;
1454 	bus_dma_segment_t	seg[1];
1455 	struct lro_ctrl		*lro = &rxr->lro;
1456 	int			rsize, nsegs, error = 0;
1457 #ifdef DEV_NETMAP
1458 	struct netmap_adapter *na = NA(rxr->adapter->ifp);
1459 	struct netmap_slot *slot;
1460 #endif /* DEV_NETMAP */
1461 
1462 	adapter = rxr->adapter;
1463 	ifp = adapter->ifp;
1464 	dev = adapter->dev;
1465 
1466 	/* Clear the ring contents */
1467 	IXGBE_RX_LOCK(rxr);
1468 #ifdef DEV_NETMAP
1469 	/* same as in ixgbe_setup_transmit_ring() */
1470 	slot = netmap_reset(na, NR_RX, rxr->me, 0);
1471 #endif /* DEV_NETMAP */
1472 	rsize = roundup2(adapter->num_rx_desc *
1473 	    sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN);
1474 	bzero((void *)rxr->rx_base, rsize);
1475 	/* Cache the size */
1476 	rxr->mbuf_sz = adapter->rx_mbuf_sz;
1477 
1478 	/* Free current RX buffer structs and their mbufs */
1479 	ixgbe_free_receive_ring(rxr);
1480 
1481 	/* Now replenish the mbufs */
1482 	for (int j = 0; j != rxr->num_desc; ++j) {
1483 		struct mbuf	*mp;
1484 
1485 		rxbuf = &rxr->rx_buffers[j];
1486 #ifdef DEV_NETMAP
1487 		/*
1488 		 * In netmap mode, fill the map and set the buffer
1489 		 * address in the NIC ring, considering the offset
1490 		 * between the netmap and NIC rings (see comment in
1491 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1492 		 * an mbuf, so end the block with a continue;
1493 		 */
1494 		if (slot) {
1495 			int sj = netmap_idx_n2k(&na->rx_rings[rxr->me], j);
1496 			uint64_t paddr;
1497 			void *addr;
1498 
1499 			addr = PNMB(na, slot + sj, &paddr);
1500 			netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
1501 			/* Update descriptor and the cached value */
1502 			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
1503 			rxbuf->addr = htole64(paddr);
1504 			continue;
1505 		}
1506 #endif /* DEV_NETMAP */
1507 		rxbuf->flags = 0;
1508 		rxbuf->buf = m_getjcl(M_NOWAIT, MT_DATA,
1509 		    M_PKTHDR, adapter->rx_mbuf_sz);
1510 		if (rxbuf->buf == NULL) {
1511 			error = ENOBUFS;
1512                         goto fail;
1513 		}
1514 		mp = rxbuf->buf;
1515 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1516 		/* Get the memory mapping */
1517 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1518 		    rxbuf->pmap, mp, seg,
1519 		    &nsegs, BUS_DMA_NOWAIT);
1520 		if (error != 0)
1521                         goto fail;
1522 		bus_dmamap_sync(rxr->ptag,
1523 		    rxbuf->pmap, BUS_DMASYNC_PREREAD);
1524 		/* Update the descriptor and the cached value */
1525 		rxr->rx_base[j].read.pkt_addr = htole64(seg[0].ds_addr);
1526 		rxbuf->addr = htole64(seg[0].ds_addr);
1527 	}
1528 
1529 
1530 	/* Setup our descriptor indices */
1531 	rxr->next_to_check = 0;
1532 	rxr->next_to_refresh = 0;
1533 	rxr->lro_enabled = FALSE;
1534 	rxr->rx_copies = 0;
1535 	rxr->rx_bytes = 0;
1536 	rxr->vtag_strip = FALSE;
1537 
1538 	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
1539 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1540 
1541 	/*
1542 	** Now set up the LRO interface:
1543 	*/
1544 	if (ixgbe_rsc_enable)
1545 		ixgbe_setup_hw_rsc(rxr);
1546 	else if (ifp->if_capenable & IFCAP_LRO) {
1547 		int err = tcp_lro_init(lro);
1548 		if (err) {
1549 			device_printf(dev, "LRO Initialization failed!\n");
1550 			goto fail;
1551 		}
1552 		INIT_DEBUGOUT("RX Soft LRO Initialized\n");
1553 		rxr->lro_enabled = TRUE;
1554 		lro->ifp = adapter->ifp;
1555 	}
1556 
1557 	IXGBE_RX_UNLOCK(rxr);
1558 	return (0);
1559 
1560 fail:
1561 	ixgbe_free_receive_ring(rxr);
1562 	IXGBE_RX_UNLOCK(rxr);
1563 	return (error);
1564 }
1565 
1566 /*********************************************************************
1567  *
1568  *  Initialize all receive rings.
1569  *
1570  **********************************************************************/
1571 int
1572 ixgbe_setup_receive_structures(struct adapter *adapter)
1573 {
1574 	struct rx_ring *rxr = adapter->rx_rings;
1575 	int j;
1576 
1577 	for (j = 0; j < adapter->num_queues; j++, rxr++)
1578 		if (ixgbe_setup_receive_ring(rxr))
1579 			goto fail;
1580 
1581 	return (0);
1582 fail:
1583 	/*
1584 	 * Free the RX buffers allocated so far; we will only handle
1585 	 * the rings that completed, since the failing case will have
1586 	 * cleaned up after itself. 'j' failed, so it's the terminus.
1587 	 */
1588 	for (int i = 0; i < j; ++i) {
1589 		rxr = &adapter->rx_rings[i];
1590 		ixgbe_free_receive_ring(rxr);
1591 	}
1592 
1593 	return (ENOBUFS);
1594 }
1595 
1596 
1597 /*********************************************************************
1598  *
1599  *  Free all receive rings.
1600  *
1601  **********************************************************************/
1602 void
1603 ixgbe_free_receive_structures(struct adapter *adapter)
1604 {
1605 	struct rx_ring *rxr = adapter->rx_rings;
1606 
1607 	INIT_DEBUGOUT("ixgbe_free_receive_structures: begin");
1608 
1609 	for (int i = 0; i < adapter->num_queues; i++, rxr++) {
1610 		struct lro_ctrl		*lro = &rxr->lro;
1611 		ixgbe_free_receive_buffers(rxr);
1612 		/* Free LRO memory */
1613 		tcp_lro_free(lro);
1614 		/* Free the ring memory as well */
1615 		ixgbe_dma_free(adapter, &rxr->rxdma);
1616 	}
1617 
1618 	free(adapter->rx_rings, M_DEVBUF);
1619 }
1620 
1621 
1622 /*********************************************************************
1623  *
1624  *  Free receive ring data structures
1625  *
1626  **********************************************************************/
1627 void
1628 ixgbe_free_receive_buffers(struct rx_ring *rxr)
1629 {
1630 	struct adapter		*adapter = rxr->adapter;
1631 	struct ixgbe_rx_buf	*rxbuf;
1632 
1633 	INIT_DEBUGOUT("ixgbe_free_receive_buffers: begin");
1634 
1635 	/* Cleanup any existing buffers */
1636 	if (rxr->rx_buffers != NULL) {
1637 		for (int i = 0; i < adapter->num_rx_desc; i++) {
1638 			rxbuf = &rxr->rx_buffers[i];
1639 			if (rxbuf->buf != NULL) {
1640 				bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
1641 				    BUS_DMASYNC_POSTREAD);
1642 				bus_dmamap_unload(rxr->ptag, rxbuf->pmap);
1643 				rxbuf->buf->m_flags |= M_PKTHDR;
1644 				m_freem(rxbuf->buf);
1645 			}
1646 			rxbuf->buf = NULL;
1647 			if (rxbuf->pmap != NULL) {
1648 				bus_dmamap_destroy(rxr->ptag, rxbuf->pmap);
1649 				rxbuf->pmap = NULL;
1650 			}
1651 		}
1652 		if (rxr->rx_buffers != NULL) {
1653 			free(rxr->rx_buffers, M_DEVBUF);
1654 			rxr->rx_buffers = NULL;
1655 		}
1656 	}
1657 
1658 	if (rxr->ptag != NULL) {
1659 		bus_dma_tag_destroy(rxr->ptag);
1660 		rxr->ptag = NULL;
1661 	}
1662 
1663 	return;
1664 }
1665 
1666 static __inline void
1667 ixgbe_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u32 ptype)
1668 {
1669 
1670         /*
1671          * At the moment LRO is only for IP/TCP packets whose TCP checksum has
1672          * been verified by hardware, and which have no VLAN tag in the
1673          * ethernet header.  In case of IPv6 we do not yet support ext. hdrs.
1674          */
1675         if (rxr->lro_enabled &&
1676             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1677             (ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 &&
1678             ((ptype & (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP)) ==
1679             (IXGBE_RXDADV_PKTTYPE_IPV4 | IXGBE_RXDADV_PKTTYPE_TCP) ||
1680             (ptype & (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) ==
1681             (IXGBE_RXDADV_PKTTYPE_IPV6 | IXGBE_RXDADV_PKTTYPE_TCP)) &&
1682             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1683             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1684                 /*
1685                  * Send to the stack if:
1686                  *  - LRO not enabled, or
1687                  *  - no LRO resources, or
1688                  *  - lro enqueue fails
1689                  */
1690                 if (rxr->lro.lro_cnt != 0)
1691                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1692                                 return;
1693         }
1694 	IXGBE_RX_UNLOCK(rxr);
1695         (*ifp->if_input)(ifp, m);
1696 	IXGBE_RX_LOCK(rxr);
1697 }
1698 
1699 static __inline void
1700 ixgbe_rx_discard(struct rx_ring *rxr, int i)
1701 {
1702 	struct ixgbe_rx_buf	*rbuf;
1703 
1704 	rbuf = &rxr->rx_buffers[i];
1705 
1706 
1707 	/*
1708 	** With advanced descriptors the writeback
1709 	** clobbers the buffer addrs, so it's easier
1710 	** to just free the existing mbufs and take
1711 	** the normal refresh path to get new buffers
1712 	** and mapping.
1713 	*/
1714 
1715 	if (rbuf->fmp != NULL) {/* Partial chain ? */
1716 		rbuf->fmp->m_flags |= M_PKTHDR;
1717 		m_freem(rbuf->fmp);
1718 		rbuf->fmp = NULL;
1719 		rbuf->buf = NULL; /* rbuf->buf is part of fmp's chain */
1720 	} else if (rbuf->buf) {
1721 		m_free(rbuf->buf);
1722 		rbuf->buf = NULL;
1723 	}
1724 	bus_dmamap_unload(rxr->ptag, rbuf->pmap);
1725 
1726 	rbuf->flags = 0;
1727 
1728 	return;
1729 }
1730 
1731 
1732 /*********************************************************************
1733  *
1734  *  This routine executes in interrupt context. It replenishes
1735  *  the mbufs in the descriptor ring and sends data which has been
1736  *  dma'ed into host memory to the upper layer.
1737  *
1738  *  Return TRUE for more work, FALSE for all clean.
1739  *********************************************************************/
1740 bool
1741 ixgbe_rxeof(struct ix_queue *que)
1742 {
1743 	struct adapter		*adapter = que->adapter;
1744 	struct rx_ring		*rxr = que->rxr;
1745 	struct ifnet		*ifp = adapter->ifp;
1746 	struct lro_ctrl		*lro = &rxr->lro;
1747 	struct lro_entry	*queued;
1748 	int			i, nextp, processed = 0;
1749 	u32			staterr = 0;
1750 	u16			count = rxr->process_limit;
1751 	union ixgbe_adv_rx_desc	*cur;
1752 	struct ixgbe_rx_buf	*rbuf, *nbuf;
1753 	u16			pkt_info;
1754 
1755 	IXGBE_RX_LOCK(rxr);
1756 
1757 #ifdef DEV_NETMAP
1758 	/* Same as the txeof routine: wakeup clients on intr. */
1759 	if (netmap_rx_irq(ifp, rxr->me, &processed)) {
1760 		IXGBE_RX_UNLOCK(rxr);
1761 		return (FALSE);
1762 	}
1763 #endif /* DEV_NETMAP */
1764 
1765 	for (i = rxr->next_to_check; count != 0;) {
1766 		struct mbuf	*sendmp, *mp;
1767 		u32		rsc, ptype;
1768 		u16		len;
1769 		u16		vtag = 0;
1770 		bool		eop;
1771 
1772 		/* Sync the ring. */
1773 		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
1774 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1775 
1776 		cur = &rxr->rx_base[i];
1777 		staterr = le32toh(cur->wb.upper.status_error);
1778 		pkt_info = le16toh(cur->wb.lower.lo_dword.hs_rss.pkt_info);
1779 
1780 		if ((staterr & IXGBE_RXD_STAT_DD) == 0)
1781 			break;
1782 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1783 			break;
1784 
1785 		count--;
1786 		sendmp = NULL;
1787 		nbuf = NULL;
1788 		rsc = 0;
1789 		cur->wb.upper.status_error = 0;
1790 		rbuf = &rxr->rx_buffers[i];
1791 		mp = rbuf->buf;
1792 
1793 		len = le16toh(cur->wb.upper.length);
1794 		ptype = le32toh(cur->wb.lower.lo_dword.data) &
1795 		    IXGBE_RXDADV_PKTTYPE_MASK;
1796 		eop = ((staterr & IXGBE_RXD_STAT_EOP) != 0);
1797 
1798 		/* Make sure bad packets are discarded */
1799 		if (eop && (staterr & IXGBE_RXDADV_ERR_FRAME_ERR_MASK) != 0) {
1800 #if __FreeBSD_version >= 1100036
1801 			if (IXGBE_IS_VF(adapter))
1802 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
1803 #endif
1804 			rxr->rx_discarded++;
1805 			ixgbe_rx_discard(rxr, i);
1806 			goto next_desc;
1807 		}
1808 
1809 		/*
1810 		** On the 82599, which supports a hardware
1811 		** LRO (called HW RSC), packets need not be
1812 		** fragmented across sequential descriptors;
1813 		** instead the next descriptor of the chain
1814 		** is indicated in bits of this descriptor.
1815 		** This also means we might process more
1816 		** than one packet at a time, something that
1817 		** was never true before; it required
1818 		** eliminating the global chain pointers in
1819 		** favor of what we are doing here.  -jfv
1820 		*/
1821 		if (!eop) {
1822 			/*
1823 			** Figure out the next descriptor
1824 			** of this frame.
1825 			*/
1826 			if (rxr->hw_rsc == TRUE) {
1827 				rsc = ixgbe_rsc_count(cur);
1828 				rxr->rsc_num += (rsc - 1);
1829 			}
1830 			if (rsc) { /* Get hardware index */
1831 				nextp = ((staterr &
1832 				    IXGBE_RXDADV_NEXTP_MASK) >>
1833 				    IXGBE_RXDADV_NEXTP_SHIFT);
1834 			} else { /* Just sequential */
1835 				nextp = i + 1;
1836 				if (nextp == adapter->num_rx_desc)
1837 					nextp = 0;
1838 			}
1839 			nbuf = &rxr->rx_buffers[nextp];
1840 			prefetch(nbuf);
1841 		}
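		/*
		 * When hardware RSC has chained buffers, the index of the
		 * next descriptor in the chain comes from the NEXTP field
		 * of the write-back (extracted above); otherwise the chain
		 * simply continues at i + 1, wrapping at the ring end.
		 */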
1842 		/*
1843 		** Rather than using the fmp/lmp global pointers
1844 		** we now keep the head of a packet chain in the
1845 		** buffer struct and pass this along from one
1846 		** descriptor to the next, until we get EOP.
1847 		*/
1848 		mp->m_len = len;
1849 		/*
1850 		** See if there is a stored head; that tells us whether
1851 		** this descriptor starts a packet or continues a chain.
1852 		*/
1853 		sendmp = rbuf->fmp;
1854 		if (sendmp != NULL) {  /* secondary frag */
1855 			rbuf->buf = rbuf->fmp = NULL;
1856 			mp->m_flags &= ~M_PKTHDR;
1857 			sendmp->m_pkthdr.len += mp->m_len;
1858 		} else {
1859 			/*
1860 			 * Optimize.  This might be a small packet,
1861 			 * maybe just a TCP ACK.  Do a fast copy that
1862 			 * is cache aligned into a new mbuf, and
1863 			 * leave the old mbuf+cluster for re-use.
1864 			 */
1865 			if (eop && len <= IXGBE_RX_COPY_LEN) {
1866 				sendmp = m_gethdr(M_NOWAIT, MT_DATA);
1867 				if (sendmp != NULL) {
1868 					sendmp->m_data +=
1869 					    IXGBE_RX_COPY_ALIGN;
1870 					ixgbe_bcopy(mp->m_data,
1871 					    sendmp->m_data, len);
1872 					sendmp->m_len = len;
1873 					rxr->rx_copies++;
1874 					rbuf->flags |= IXGBE_RX_COPY;
1875 				}
1876 			}
1877 			if (sendmp == NULL) {
1878 				rbuf->buf = rbuf->fmp = NULL;
1879 				sendmp = mp;
1880 			}
1881 
1882 		/* First descriptor of a non-packet-split chain */
1883 			sendmp->m_flags |= M_PKTHDR;
1884 			sendmp->m_pkthdr.len = mp->m_len;
1885 		}
1886 		++processed;
1887 
1888 		/* Pass the head pointer on */
1889 		if (eop == 0) {
1890 			nbuf->fmp = sendmp;
1891 			sendmp = NULL;
1892 			mp->m_next = nbuf->buf;
1893 		} else { /* Sending this frame */
1894 			sendmp->m_pkthdr.rcvif = ifp;
1895 			rxr->rx_packets++;
1896 			/* capture data for AIM */
1897 			rxr->bytes += sendmp->m_pkthdr.len;
1898 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1899 			/* Process vlan info */
1900 			if ((rxr->vtag_strip) &&
1901 			    (staterr & IXGBE_RXD_STAT_VP))
1902 				vtag = le16toh(cur->wb.upper.vlan);
1903 			if (vtag) {
1904 				sendmp->m_pkthdr.ether_vtag = vtag;
1905 				sendmp->m_flags |= M_VLANTAG;
1906 			}
1907 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1908 				ixgbe_rx_checksum(staterr, sendmp, ptype);
1909 
1910                         /*
1911                          * With multiple queues, the RXCSUM.PCSD bit is set
1912                          * and never cleared, so a valid RSS hash is always
1913                          * available in the descriptor write-back.
1914                          */
1915                         if (adapter->num_queues > 1) {
1916                                 sendmp->m_pkthdr.flowid =
1917                                     le32toh(cur->wb.lower.hi_dword.rss);
1918                                 switch (pkt_info & IXGBE_RXDADV_RSSTYPE_MASK) {
1919                                     case IXGBE_RXDADV_RSSTYPE_IPV4_TCP:
1920                                         M_HASHTYPE_SET(sendmp,
1921                                             M_HASHTYPE_RSS_TCP_IPV4);
1922                                         break;
1923                                     case IXGBE_RXDADV_RSSTYPE_IPV4:
1924                                         M_HASHTYPE_SET(sendmp,
1925                                             M_HASHTYPE_RSS_IPV4);
1926                                         break;
1927                                     case IXGBE_RXDADV_RSSTYPE_IPV6_TCP:
1928                                         M_HASHTYPE_SET(sendmp,
1929                                             M_HASHTYPE_RSS_TCP_IPV6);
1930                                         break;
1931                                     case IXGBE_RXDADV_RSSTYPE_IPV6_EX:
1932                                         M_HASHTYPE_SET(sendmp,
1933                                             M_HASHTYPE_RSS_IPV6_EX);
1934                                         break;
1935                                     case IXGBE_RXDADV_RSSTYPE_IPV6:
1936                                         M_HASHTYPE_SET(sendmp,
1937                                             M_HASHTYPE_RSS_IPV6);
1938                                         break;
1939                                     case IXGBE_RXDADV_RSSTYPE_IPV6_TCP_EX:
1940                                         M_HASHTYPE_SET(sendmp,
1941                                             M_HASHTYPE_RSS_TCP_IPV6_EX);
1942                                         break;
1943                                     case IXGBE_RXDADV_RSSTYPE_IPV4_UDP:
1944                                         M_HASHTYPE_SET(sendmp,
1945                                             M_HASHTYPE_RSS_UDP_IPV4);
1946                                         break;
1947                                     case IXGBE_RXDADV_RSSTYPE_IPV6_UDP:
1948                                         M_HASHTYPE_SET(sendmp,
1949                                             M_HASHTYPE_RSS_UDP_IPV6);
1950                                         break;
1951                                     case IXGBE_RXDADV_RSSTYPE_IPV6_UDP_EX:
1952                                         M_HASHTYPE_SET(sendmp,
1953                                             M_HASHTYPE_RSS_UDP_IPV6_EX);
1954                                         break;
1955                                     default:
1956                                         M_HASHTYPE_SET(sendmp,
1957                                             M_HASHTYPE_OPAQUE);
1958                                 }
1959                         } else {
1960                                 sendmp->m_pkthdr.flowid = que->msix;
1961 				M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1962 			}
1963 		}
1964 next_desc:
1965 		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
1966 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1967 
1968 		/* Advance our pointers to the next descriptor. */
1969 		if (++i == rxr->num_desc)
1970 			i = 0;
1971 
1972 		/* Now send to the stack or do LRO */
1973 		if (sendmp != NULL) {
1974 			rxr->next_to_check = i;
1975 			ixgbe_rx_input(rxr, ifp, sendmp, ptype);
1976 			i = rxr->next_to_check;
1977 		}
1978 
1979 		/* Refresh mbufs after every 8 processed descriptors */
1980 		if (processed == 8) {
1981 			ixgbe_refresh_mbufs(rxr, i);
1982 			processed = 0;
1983 		}
1984 	}
1985 
1986 	/* Refresh any remaining buf structs */
1987 	if (ixgbe_rx_unrefreshed(rxr))
1988 		ixgbe_refresh_mbufs(rxr, i);
1989 
1990 	rxr->next_to_check = i;
1991 
1992 	/*
1993 	 * Flush any outstanding LRO work
1994 	 */
1995 	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1996 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1997 		tcp_lro_flush(lro, queued);
1998 	}
1999 
2000 	IXGBE_RX_UNLOCK(rxr);
2001 
2002 	/*
2003 	** Still have cleaning to do?
2004 	*/
2005 	if ((staterr & IXGBE_RXD_STAT_DD) != 0)
2006 		return (TRUE);
2007 	else
2008 		return (FALSE);
2009 }
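/*
 * Usage sketch (illustrative only; names are hypothetical): a queue
 * interrupt or taskqueue handler would typically call ixgbe_rxeof()
 * and, while it reports more work, reschedule itself rather than
 * looping indefinitely in interrupt context.  The field names below
 * follow the pattern used elsewhere in the driver but are shown here
 * only as an example.
 *
 *	static void
 *	example_que_task(void *arg, int pending)
 *	{
 *		struct ix_queue *que = arg;
 *
 *		if (ixgbe_rxeof(que))
 *			taskqueue_enqueue(que->tq, &que->que_task);
 *	}
 */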
2010 
2011 
2012 /*********************************************************************
2013  *
2014  *  Verify that the hardware indicated that the checksum is valid.
2015  *  Inform the stack of the checksum status so that it does not
2016  *  spend time re-verifying it.  A consumer-side sketch follows
2017  *  the function.
2018  *********************************************************************/
2019 static void
2020 ixgbe_rx_checksum(u32 staterr, struct mbuf * mp, u32 ptype)
2021 {
2022 	u16	status = (u16) staterr;
2023 	u8	errors = (u8) (staterr >> 24);
2024 	bool	sctp = FALSE;
2025 
2026 	if ((ptype & IXGBE_RXDADV_PKTTYPE_ETQF) == 0 &&
2027 	    (ptype & IXGBE_RXDADV_PKTTYPE_SCTP) != 0)
2028 		sctp = TRUE;
2029 
2030 	if (status & IXGBE_RXD_STAT_IPCS) {
2031 		if (!(errors & IXGBE_RXD_ERR_IPE)) {
2032 			/* IP Checksum Good */
2033 			mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
2034 			mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
2035 
2036 		} else
2037 			mp->m_pkthdr.csum_flags = 0;
2038 	}
2039 	if (status & IXGBE_RXD_STAT_L4CS) {
2040 		u64 type = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2041 #if __FreeBSD_version >= 800000
2042 		if (sctp)
2043 			type = CSUM_SCTP_VALID;
2044 #endif
2045 		if (!(errors & IXGBE_RXD_ERR_TCPE)) {
2046 			mp->m_pkthdr.csum_flags |= type;
2047 			if (!sctp)
2048 				mp->m_pkthdr.csum_data = htons(0xffff);
2049 		}
2050 	}
2051 	return;
2052 }
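/*
 * Illustrative sketch, not driver code: for a TCP/IPv4 frame that
 * passes both checks above, the mbuf carries
 * CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR
 * with csum_data set to 0xffff, which lets the TCP input path skip its
 * software checksum.  A consumer-side test (hypothetical helper) might
 * look like:
 *
 *	static bool
 *	example_l4_csum_ok(const struct mbuf *m)
 *	{
 *		return ((m->m_pkthdr.csum_flags &
 *		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
 *		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR) &&
 *		    m->m_pkthdr.csum_data == 0xffff);
 *	}
 */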
2053 
2054 /********************************************************************
2055  * Manage DMA'able memory.
2056  *******************************************************************/
2057 static void
2058 ixgbe_dmamap_cb(void *arg, bus_dma_segment_t * segs, int nseg, int error)
2059 {
2060 	if (error)
2061 		return;
2062 	*(bus_addr_t *) arg = segs->ds_addr;
2063 	return;
2064 }
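/*
 * The tag created in ixgbe_dma_malloc() allows only one segment, so on
 * success bus_dmamap_load() invokes this callback with exactly one
 * segment (nseg == 1); recording segs->ds_addr therefore captures the
 * physical address of the whole allocation.
 */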
2065 
2066 int
2067 ixgbe_dma_malloc(struct adapter *adapter, bus_size_t size,
2068 		struct ixgbe_dma_alloc *dma, int mapflags)
2069 {
2070 	device_t dev = adapter->dev;
2071 	int             r;
2072 
2073 	r = bus_dma_tag_create(bus_get_dma_tag(adapter->dev),	/* parent */
2074 			       DBA_ALIGN, 0,	/* alignment, bounds */
2075 			       BUS_SPACE_MAXADDR,	/* lowaddr */
2076 			       BUS_SPACE_MAXADDR,	/* highaddr */
2077 			       NULL, NULL,	/* filter, filterarg */
2078 			       size,	/* maxsize */
2079 			       1,	/* nsegments */
2080 			       size,	/* maxsegsize */
2081 			       BUS_DMA_ALLOCNOW,	/* flags */
2082 			       NULL,	/* lockfunc */
2083 			       NULL,	/* lockfuncarg */
2084 			       &dma->dma_tag);
2085 	if (r != 0) {
2086 		device_printf(dev, "ixgbe_dma_malloc: bus_dma_tag_create failed; "
2087 		       "error %u\n", r);
2088 		goto fail_0;
2089 	}
2090 	r = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr,
2091 			     BUS_DMA_NOWAIT, &dma->dma_map);
2092 	if (r != 0) {
2093 		device_printf(dev, "ixgbe_dma_malloc: bus_dmamem_alloc failed; "
2094 		       "error %u\n", r);
2095 		goto fail_1;
2096 	}
2097 	r = bus_dmamap_load(dma->dma_tag, dma->dma_map, dma->dma_vaddr,
2098 			    size,
2099 			    ixgbe_dmamap_cb,
2100 			    &dma->dma_paddr,
2101 			    mapflags | BUS_DMA_NOWAIT);
2102 	if (r != 0) {
2103 		device_printf(dev, "ixgbe_dma_malloc: bus_dmamap_load failed; "
2104 		       "error %u\n", r);
2105 		goto fail_2;
2106 	}
2107 	dma->dma_size = size;
2108 	return (0);
2109 fail_2:
2110 	bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map);
2111 fail_1:
2112 	bus_dma_tag_destroy(dma->dma_tag);
2113 fail_0:
2114 	dma->dma_tag = NULL;
2115 	return (r);
2116 }
2117 
2118 void
2119 ixgbe_dma_free(struct adapter *adapter, struct ixgbe_dma_alloc *dma)
2120 {
2121 	bus_dmamap_sync(dma->dma_tag, dma->dma_map,
2122 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
2123 	bus_dmamap_unload(dma->dma_tag, dma->dma_map);
2124 	bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map);
2125 	bus_dma_tag_destroy(dma->dma_tag);
2126 }
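/*
 * Usage sketch, mirroring how these helpers are used later in this
 * file: allocate a DMA area, use dma_vaddr for CPU access and
 * dma_paddr to program the hardware, then release it with
 * ixgbe_dma_free().  The 4096-byte size is purely illustrative.
 *
 *	struct ixgbe_dma_alloc dma;
 *
 *	if (ixgbe_dma_malloc(adapter, 4096, &dma, BUS_DMA_NOWAIT) == 0) {
 *		bzero(dma.dma_vaddr, dma.dma_size);
 *		... hand dma.dma_paddr to the device ...
 *		ixgbe_dma_free(adapter, &dma);
 *	}
 */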
2127 
2128 
2129 /*********************************************************************
2130  *
2131  *  Allocate memory for the transmit and receive rings, and then
2132  *  the descriptors associated with each.  Called only once at attach.
2133  *
2134  **********************************************************************/
2135 int
2136 ixgbe_allocate_queues(struct adapter *adapter)
2137 {
2138 	device_t	dev = adapter->dev;
2139 	struct ix_queue	*que;
2140 	struct tx_ring	*txr;
2141 	struct rx_ring	*rxr;
2142 	int rsize, tsize, error = IXGBE_SUCCESS;
2143 	int txconf = 0, rxconf = 0;
2144 #ifdef PCI_IOV
2145 	enum ixgbe_iov_mode iov_mode;
2146 #endif
2147 
2148 	/* First allocate the top level queue structs */
2149 	if (!(adapter->queues =
2150 	    (struct ix_queue *) malloc(sizeof(struct ix_queue) *
2151 	    adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) {
2152 		device_printf(dev, "Unable to allocate queue memory\n");
2153 		error = ENOMEM;
2154 		goto fail;
2155 	}
2156 
2157 	/* Then allocate the TX ring struct memory */
2158 	if (!(adapter->tx_rings =
2159 	    (struct tx_ring *) malloc(sizeof(struct tx_ring) *
2160 	    adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) {
2161 		device_printf(dev, "Unable to allocate TX ring memory\n");
2162 		error = ENOMEM;
2163 		goto tx_fail;
2164 	}
2165 
2166 	/* Next allocate the RX */
2167 	if (!(adapter->rx_rings =
2168 	    (struct rx_ring *) malloc(sizeof(struct rx_ring) *
2169 	    adapter->num_queues, M_DEVBUF, M_NOWAIT | M_ZERO))) {
2170 		device_printf(dev, "Unable to allocate RX ring memory\n");
2171 		error = ENOMEM;
2172 		goto rx_fail;
2173 	}
2174 
2175 	/* For the ring itself */
2176 	tsize = roundup2(adapter->num_tx_desc *
2177 	    sizeof(union ixgbe_adv_tx_desc), DBA_ALIGN);
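	/*
	 * Worked example: each advanced TX descriptor is 16 bytes, so
	 * with, say, 1024 descriptors this is 16 KB; roundup2() then
	 * pads the total to the next multiple of DBA_ALIGN (with a
	 * 128-byte alignment, 16 KB is already aligned) so the ring
	 * base satisfies the hardware's alignment requirement.
	 */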
2178 
2179 #ifdef PCI_IOV
2180 	iov_mode = ixgbe_get_iov_mode(adapter);
2181 	adapter->pool = ixgbe_max_vfs(iov_mode);
2182 #else
2183 	adapter->pool = 0;
2184 #endif
2185 	/*
2186 	 * Now set up the TX queues.  txconf is needed so that, if things
2187 	 * fail midcourse, we can undo the memory allocations gracefully
2188 	 * (a condensed sketch of this unwind idiom follows the function).
2189 	 */
2190 	for (int i = 0; i < adapter->num_queues; i++, txconf++) {
2191 		/* Set up some basics */
2192 		txr = &adapter->tx_rings[i];
2193 		txr->adapter = adapter;
2194 #ifdef PCI_IOV
2195 		txr->me = ixgbe_pf_que_index(iov_mode, i);
2196 #else
2197 		txr->me = i;
2198 #endif
2199 		txr->num_desc = adapter->num_tx_desc;
2200 
2201 		/* Initialize the TX side lock */
2202 		snprintf(txr->mtx_name, sizeof(txr->mtx_name), "%s:tx(%d)",
2203 		    device_get_nameunit(dev), txr->me);
2204 		mtx_init(&txr->tx_mtx, txr->mtx_name, NULL, MTX_DEF);
2205 
2206 		if (ixgbe_dma_malloc(adapter, tsize,
2207 			&txr->txdma, BUS_DMA_NOWAIT)) {
2208 			device_printf(dev,
2209 			    "Unable to allocate TX Descriptor memory\n");
2210 			error = ENOMEM;
2211 			goto err_tx_desc;
2212 		}
2213 		txr->tx_base = (union ixgbe_adv_tx_desc *)txr->txdma.dma_vaddr;
2214 		bzero((void *)txr->tx_base, tsize);
2215 
2216         	/* Now allocate transmit buffers for the ring */
2217         	if (ixgbe_allocate_transmit_buffers(txr)) {
2218 			device_printf(dev,
2219 			    "Critical Failure setting up transmit buffers\n");
2220 			error = ENOMEM;
2221 			goto err_tx_desc;
2222         	}
2223 #ifndef IXGBE_LEGACY_TX
2224 		/* Allocate a buf ring */
2225 		txr->br = buf_ring_alloc(IXGBE_BR_SIZE, M_DEVBUF,
2226 		    M_WAITOK, &txr->tx_mtx);
2227 		if (txr->br == NULL) {
2228 			device_printf(dev,
2229 			    "Critical Failure setting up buf ring\n");
2230 			error = ENOMEM;
2231 			goto err_tx_desc;
2232         	}
2233 #endif
2234 	}
2235 
2236 	/*
2237 	 * Next the RX queues...
2238 	 */
2239 	rsize = roundup2(adapter->num_rx_desc *
2240 	    sizeof(union ixgbe_adv_rx_desc), DBA_ALIGN);
2241 	for (int i = 0; i < adapter->num_queues; i++, rxconf++) {
2242 		rxr = &adapter->rx_rings[i];
2243 		/* Set up some basics */
2244 		rxr->adapter = adapter;
2245 #ifdef PCI_IOV
2246 		rxr->me = ixgbe_pf_que_index(iov_mode, i);
2247 #else
2248 		rxr->me = i;
2249 #endif
2250 		rxr->num_desc = adapter->num_rx_desc;
2251 
2252 		/* Initialize the RX side lock */
2253 		snprintf(rxr->mtx_name, sizeof(rxr->mtx_name), "%s:rx(%d)",
2254 		    device_get_nameunit(dev), rxr->me);
2255 		mtx_init(&rxr->rx_mtx, rxr->mtx_name, NULL, MTX_DEF);
2256 
2257 		if (ixgbe_dma_malloc(adapter, rsize,
2258 			&rxr->rxdma, BUS_DMA_NOWAIT)) {
2259 			device_printf(dev,
2260 			    "Unable to allocate RX Descriptor memory\n");
2261 			error = ENOMEM;
2262 			goto err_rx_desc;
2263 		}
2264 		rxr->rx_base = (union ixgbe_adv_rx_desc *)rxr->rxdma.dma_vaddr;
2265 		bzero((void *)rxr->rx_base, rsize);
2266 
2267         	/* Allocate receive buffers for the ring */
2268 		if (ixgbe_allocate_receive_buffers(rxr)) {
2269 			device_printf(dev,
2270 			    "Critical Failure setting up receive buffers\n");
2271 			error = ENOMEM;
2272 			goto err_rx_desc;
2273 		}
2274 	}
2275 
2276 	/*
2277 	** Finally set up the queue holding structs
2278 	*/
2279 	for (int i = 0; i < adapter->num_queues; i++) {
2280 		que = &adapter->queues[i];
2281 		que->adapter = adapter;
2282 		que->me = i;
2283 		que->txr = &adapter->tx_rings[i];
2284 		que->rxr = &adapter->rx_rings[i];
2285 	}
2286 
2287 	return (0);
2288 
2289 err_rx_desc:
2290 	for (rxr = adapter->rx_rings; rxconf > 0; rxr++, rxconf--)
2291 		ixgbe_dma_free(adapter, &rxr->rxdma);
2292 err_tx_desc:
2293 	for (txr = adapter->tx_rings; txconf > 0; txr++, txconf--)
2294 		ixgbe_dma_free(adapter, &txr->txdma);
2295 	free(adapter->rx_rings, M_DEVBUF);
2296 rx_fail:
2297 	free(adapter->tx_rings, M_DEVBUF);
2298 tx_fail:
2299 	free(adapter->queues, M_DEVBUF);
2300 fail:
2301 	return (error);
2302 }
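/*
 * The error path above uses a counter-based unwind: txconf and rxconf
 * record how many rings were fully set up, so the error labels free
 * exactly that many.  Condensed to its essentials (illustrative only,
 * not driver code):
 *
 *	for (i = 0; i < n; i++, nconf++)
 *		if (setup(&ring[i]) != 0)
 *			goto err;
 *	return (0);
 * err:
 *	while (nconf-- > 0)
 *		teardown(&ring[nconf]);
 *	return (ENOMEM);
 */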
2303