xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision b5864e6de2f3aa8eb9bb269ec86282598b5201b1)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was separated to allow usage by
38 ** 	    both the PF and VF drivers.
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static inline void ixl_rx_discard(struct rx_ring *, int);
62 static inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 static inline bool ixl_tso_detect_sparse(struct mbuf *mp);
68 static inline u32 ixl_get_tx_head(struct ixl_queue *que);
69 
70 #ifdef DEV_NETMAP
71 #include <dev/netmap/if_ixl_netmap.h>
72 #endif /* DEV_NETMAP */
73 
74 /*
75  * ixl_get_default_rss_key - copies the default RSS key into @key
76  */
77 void
78 ixl_get_default_rss_key(u32 *key)
79 {
80 	MPASS(key != NULL);
81 
82 	u32 rss_seed[IXL_RSS_KEY_SIZE_REG] = {0x41b01687,
83 	    0x183cfd8c, 0xce880440, 0x580cbc3c,
84 	    0x35897377, 0x328b25e1, 0x4fa98922,
85 	    0xb7d90c14, 0xd5bad70d, 0xcd15a2c1,
86 	    0x0, 0x0, 0x0};
87 
88 	bcopy(rss_seed, key, IXL_RSS_KEY_SIZE);
89 }
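
/*
 * The table above supplies IXL_RSS_KEY_SIZE_REG (13) 32-bit words, of which
 * the last three are zero.  A minimal usage sketch, assuming a hypothetical
 * write_key_reg() helper that stores one word per hardware key register:
 *
 *	u32 rss_key[IXL_RSS_KEY_SIZE_REG];
 *
 *	ixl_get_default_rss_key(rss_key);
 *	for (int i = 0; i < IXL_RSS_KEY_SIZE_REG; i++)
 *		write_key_reg(vsi->hw, i, rss_key[i]);
 */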
90 
91 /*
92 ** Multiqueue Transmit driver
93 */
94 int
95 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
96 {
97 	struct ixl_vsi		*vsi = ifp->if_softc;
98 	struct ixl_queue	*que;
99 	struct tx_ring		*txr;
100 	int 			err, i;
101 #ifdef RSS
102 	u32			bucket_id;
103 #endif
104 
105 	/*
106 	** Which queue to use:
107 	**
108 	** When doing RSS, map it to the same outbound
109 	** queue as the incoming flow would be mapped to.
110 	** If everything is set up correctly, it should be
111 	** the same bucket that the current CPU maps to.
112 	*/
113 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
114 #ifdef  RSS
115 		if (rss_hash2bucket(m->m_pkthdr.flowid,
116 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
117 			i = bucket_id % vsi->num_queues;
118                 } else
119 #endif
120                         i = m->m_pkthdr.flowid % vsi->num_queues;
121         } else
122 		i = curcpu % vsi->num_queues;
123 
124 	que = &vsi->queues[i];
125 	txr = &que->txr;
126 
127 	err = drbr_enqueue(ifp, txr->br, m);
128 	if (err)
129 		return (err);
130 	if (IXL_TX_TRYLOCK(txr)) {
131 		ixl_mq_start_locked(ifp, txr);
132 		IXL_TX_UNLOCK(txr);
133 	} else
134 		taskqueue_enqueue(que->tq, &que->tx_task);
135 
136 	return (0);
137 }
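
/*
 * Queue selection sketch: with vsi->num_queues == 8, for example, a flow
 * whose RSS hash maps to bucket 5 is sent on queue 5 % 8 == 5, while a
 * packet with no hash submitted from CPU 11 lands on queue 11 % 8 == 3;
 * the modulo keeps the index in range when there are more buckets or CPUs
 * than queues.
 */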
138 
139 int
140 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
141 {
142 	struct ixl_queue	*que = txr->que;
143 	struct ixl_vsi		*vsi = que->vsi;
144         struct mbuf		*next;
145         int			err = 0;
146 
147 
148 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
149 	    vsi->link_active == 0)
150 		return (ENETDOWN);
151 
152 	/* Process the transmit queue */
153 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
154 		if ((err = ixl_xmit(que, &next)) != 0) {
155 			if (next == NULL)
156 				drbr_advance(ifp, txr->br);
157 			else
158 				drbr_putback(ifp, txr->br, next);
159 			break;
160 		}
161 		drbr_advance(ifp, txr->br);
162 		/* Send a copy of the frame to the BPF listener */
163 		ETHER_BPF_MTAP(ifp, next);
164 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
165 			break;
166 	}
167 
168 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
169 		ixl_txeof(que);
170 
171 	return (err);
172 }
173 
174 /*
175  * Called from a taskqueue to drain queued transmit packets.
176  */
177 void
178 ixl_deferred_mq_start(void *arg, int pending)
179 {
180 	struct ixl_queue	*que = arg;
181         struct tx_ring		*txr = &que->txr;
182 	struct ixl_vsi		*vsi = que->vsi;
183         struct ifnet		*ifp = vsi->ifp;
184 
185 	IXL_TX_LOCK(txr);
186 	if (!drbr_empty(ifp, txr->br))
187 		ixl_mq_start_locked(ifp, txr);
188 	IXL_TX_UNLOCK(txr);
189 }
190 
191 /*
192 ** Flush all queue ring buffers
193 */
194 void
195 ixl_qflush(struct ifnet *ifp)
196 {
197 	struct ixl_vsi	*vsi = ifp->if_softc;
198 
199         for (int i = 0; i < vsi->num_queues; i++) {
200 		struct ixl_queue *que = &vsi->queues[i];
201 		struct tx_ring	*txr = &que->txr;
202 		struct mbuf	*m;
203 		IXL_TX_LOCK(txr);
204 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
205 			m_freem(m);
206 		IXL_TX_UNLOCK(txr);
207 	}
208 	if_qflush(ifp);
209 }
210 
211 /*
212 ** Find mbuf chains passed to the driver
213 ** that are 'sparse', using more than 8
214 ** mbufs to deliver an mss-size chunk of data
215 */
216 static inline bool
217 ixl_tso_detect_sparse(struct mbuf *mp)
218 {
219 	struct mbuf	*m;
220 	int		num = 0, mss;
221 	bool		ret = FALSE;
222 
223 	mss = mp->m_pkthdr.tso_segsz;
224 	for (m = mp->m_next; m != NULL; m = m->m_next) {
225 		num++;
226 		mss -= m->m_len;
227 		if (mss < 1)
228 			break;
229 		if (m->m_next == NULL)
230 			break;
231 	}
232 	if (num > IXL_SPARSE_CHAIN)
233 		ret = TRUE;
234 
235 	return (ret);
236 }
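
/*
 * Rationale: the hardware can consume only a limited number of data
 * descriptors for any one MSS worth of TSO payload (IXL_SPARSE_CHAIN is
 * presumably sized to that limit), so a chain needing more mbufs than that
 * per segment is defragmented by the caller before being mapped.
 */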
237 
238 
239 /*********************************************************************
240  *
241  *  This routine maps the mbufs to tx descriptors, allowing the
242  *  TX engine to transmit the packets.
243  *  	- return 0 on success, positive on failure
244  *
245  **********************************************************************/
246 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
247 
248 static int
249 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
250 {
251 	struct ixl_vsi		*vsi = que->vsi;
252 	struct i40e_hw		*hw = vsi->hw;
253 	struct tx_ring		*txr = &que->txr;
254 	struct ixl_tx_buf	*buf;
255 	struct i40e_tx_desc	*txd = NULL;
256 	struct mbuf		*m_head, *m;
257 	int             	i, j, error, nsegs;
258 	int			first, last = 0;
259 	u16			vtag = 0;
260 	u32			cmd, off;
261 	bus_dmamap_t		map;
262 	bus_dma_tag_t		tag;
263 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
264 
265 	cmd = off = 0;
266 	m_head = *m_headp;
267 
268         /*
269          * It is important to capture the first descriptor
270          * used, because its buffer will record the index of
271          * the one we tell the hardware to report back on.
272          */
273         first = txr->next_avail;
274 	buf = &txr->buffers[first];
275 	map = buf->map;
276 	tag = txr->tx_tag;
277 
278 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
279 		/* Use larger mapping for TSO */
280 		tag = txr->tso_tag;
281 		if (ixl_tso_detect_sparse(m_head)) {
282 			m = m_defrag(m_head, M_NOWAIT);
283 			if (m == NULL) {
284 				m_freem(*m_headp);
285 				*m_headp = NULL;
286 				return (ENOBUFS);
287 			}
288 			*m_headp = m;
289 		}
290 	}
291 
292 	/*
293 	 * Map the packet for DMA.
294 	 */
295 	error = bus_dmamap_load_mbuf_sg(tag, map,
296 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
297 
298 	if (error == EFBIG) {
299 		struct mbuf *m;
300 
301 		m = m_defrag(*m_headp, M_NOWAIT);
302 		if (m == NULL) {
303 			que->mbuf_defrag_failed++;
304 			m_freem(*m_headp);
305 			*m_headp = NULL;
306 			return (ENOBUFS);
307 		}
308 		*m_headp = m;
309 
310 		/* Try it again */
311 		error = bus_dmamap_load_mbuf_sg(tag, map,
312 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
313 
314 		if (error == ENOMEM) {
315 			que->tx_dmamap_failed++;
316 			return (error);
317 		} else if (error != 0) {
318 			que->tx_dmamap_failed++;
319 			m_freem(*m_headp);
320 			*m_headp = NULL;
321 			return (error);
322 		}
323 	} else if (error == ENOMEM) {
324 		que->tx_dmamap_failed++;
325 		return (error);
326 	} else if (error != 0) {
327 		que->tx_dmamap_failed++;
328 		m_freem(*m_headp);
329 		*m_headp = NULL;
330 		return (error);
331 	}
332 
333 	/* Make certain there are enough descriptors */
334 	if (nsegs > txr->avail - 2) {
335 		txr->no_desc++;
336 		error = ENOBUFS;
337 		goto xmit_fail;
338 	}
339 	m_head = *m_headp;
340 
341 	/* Set up the TSO/CSUM offload */
342 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
343 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
344 		if (error)
345 			goto xmit_fail;
346 	}
347 
348 	cmd |= I40E_TX_DESC_CMD_ICRC;
349 	/* Grab the VLAN tag */
350 	if (m_head->m_flags & M_VLANTAG) {
351 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
352 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
353 	}
354 
355 	i = txr->next_avail;
356 	for (j = 0; j < nsegs; j++) {
357 		bus_size_t seglen;
358 
359 		buf = &txr->buffers[i];
360 		buf->tag = tag; /* Keep track of the type tag */
361 		txd = &txr->base[i];
362 		seglen = segs[j].ds_len;
363 
364 		txd->buffer_addr = htole64(segs[j].ds_addr);
365 		txd->cmd_type_offset_bsz =
366 		    htole64(I40E_TX_DESC_DTYPE_DATA
367 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
368 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
369 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
370 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
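		/*
		 * cmd_type_offset_bsz packs the descriptor type, command
		 * flags, header-length offsets, buffer size and L2 tag into
		 * the single 64-bit QW1 word; for an untagged frame with no
		 * checksum offload, cmd carries only ICRC at this point, and
		 * the EOP/RS bits are OR'd into the last descriptor below.
		 */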
371 
372 		last = i; /* descriptor that will get completion IRQ */
373 
374 		if (++i == que->num_desc)
375 			i = 0;
376 
377 		buf->m_head = NULL;
378 		buf->eop_index = -1;
379 	}
380 	/* Set the last descriptor for report */
381 	txd->cmd_type_offset_bsz |=
382 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
383 	txr->avail -= nsegs;
384 	txr->next_avail = i;
385 
386 	buf->m_head = m_head;
387 	/* Swap the dma map between the first and last descriptor */
388 	txr->buffers[first].map = buf->map;
389 	buf->map = map;
390 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
391 
392         /* Set the index of the descriptor that will be marked done */
393         buf = &txr->buffers[first];
394 	buf->eop_index = last;
395 
396         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
397             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
398 	/*
399 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
400 	 * hardware that this frame is available to transmit.
401 	 */
402 	++txr->total_packets;
403 	wr32(hw, txr->tail, i);
404 
405 	/* Mark outstanding work */
406 	if (que->busy == 0)
407 		que->busy = 1;
408 	return (0);
409 
410 xmit_fail:
411 	bus_dmamap_unload(tag, buf->map);
412 	return (error);
413 }
414 
415 
416 /*********************************************************************
417  *
418  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
419  *  the information needed to transmit a packet on the wire. This is
420  *  called only once at attach; setup is done on every reset.
421  *
422  **********************************************************************/
423 int
424 ixl_allocate_tx_data(struct ixl_queue *que)
425 {
426 	struct tx_ring		*txr = &que->txr;
427 	struct ixl_vsi		*vsi = que->vsi;
428 	device_t		dev = vsi->dev;
429 	struct ixl_tx_buf	*buf;
430 	int			error = 0;
431 
432 	/*
433 	 * Setup DMA descriptor areas.
434 	 */
435 	if ((error = bus_dma_tag_create(NULL,		/* parent */
436 			       1, 0,			/* alignment, bounds */
437 			       BUS_SPACE_MAXADDR,	/* lowaddr */
438 			       BUS_SPACE_MAXADDR,	/* highaddr */
439 			       NULL, NULL,		/* filter, filterarg */
440 			       IXL_TSO_SIZE,		/* maxsize */
441 			       IXL_MAX_TX_SEGS,		/* nsegments */
442 			       PAGE_SIZE,		/* maxsegsize */
443 			       0,			/* flags */
444 			       NULL,			/* lockfunc */
445 			       NULL,			/* lockfuncarg */
446 			       &txr->tx_tag))) {
447 		device_printf(dev,"Unable to allocate TX DMA tag\n");
448 		goto fail;
449 	}
450 
451 	/* Make a special tag for TSO */
452 	if ((error = bus_dma_tag_create(NULL,		/* parent */
453 			       1, 0,			/* alignment, bounds */
454 			       BUS_SPACE_MAXADDR,	/* lowaddr */
455 			       BUS_SPACE_MAXADDR,	/* highaddr */
456 			       NULL, NULL,		/* filter, filterarg */
457 			       IXL_TSO_SIZE,		/* maxsize */
458 			       IXL_MAX_TSO_SEGS,	/* nsegments */
459 			       PAGE_SIZE,		/* maxsegsize */
460 			       0,			/* flags */
461 			       NULL,			/* lockfunc */
462 			       NULL,			/* lockfuncarg */
463 			       &txr->tso_tag))) {
464 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
465 		goto fail;
466 	}
467 
468 	if (!(txr->buffers =
469 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
470 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
471 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
472 		error = ENOMEM;
473 		goto fail;
474 	}
475 
476         /* Create the descriptor buffer default dma maps */
477 	buf = txr->buffers;
478 	for (int i = 0; i < que->num_desc; i++, buf++) {
479 		buf->tag = txr->tx_tag;
480 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
481 		if (error != 0) {
482 			device_printf(dev, "Unable to create TX DMA map\n");
483 			goto fail;
484 		}
485 	}
486 fail:
487 	return (error);
488 }
489 
490 
491 /*********************************************************************
492  *
493  *  (Re)Initialize a queue transmit ring.
494  *	- called by init; it clears the descriptor ring
495  *	  and frees any stale mbufs
496  *
497  **********************************************************************/
498 void
499 ixl_init_tx_ring(struct ixl_queue *que)
500 {
501 #ifdef DEV_NETMAP
502 	struct netmap_adapter *na = NA(que->vsi->ifp);
503 	struct netmap_slot *slot;
504 #endif /* DEV_NETMAP */
505 	struct tx_ring		*txr = &que->txr;
506 	struct ixl_tx_buf	*buf;
507 
508 	/* Clear the old ring contents */
509 	IXL_TX_LOCK(txr);
510 
511 #ifdef DEV_NETMAP
512 	/*
513 	 * (under lock): if in netmap mode, do some consistency
514 	 * checks and set slot to entry 0 of the netmap ring.
515 	 */
516 	slot = netmap_reset(na, NR_TX, que->me, 0);
517 #endif /* DEV_NETMAP */
518 
519 	bzero((void *)txr->base,
520 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
521 
522 	/* Reset indices */
523 	txr->next_avail = 0;
524 	txr->next_to_clean = 0;
525 
526 #ifdef IXL_FDIR
527 	/* Initialize flow director */
528 	txr->atr_rate = ixl_atr_rate;
529 	txr->atr_count = 0;
530 #endif
531 
532 	/* Free any existing tx mbufs. */
533         buf = txr->buffers;
534 	for (int i = 0; i < que->num_desc; i++, buf++) {
535 		if (buf->m_head != NULL) {
536 			bus_dmamap_sync(buf->tag, buf->map,
537 			    BUS_DMASYNC_POSTWRITE);
538 			bus_dmamap_unload(buf->tag, buf->map);
539 			m_freem(buf->m_head);
540 			buf->m_head = NULL;
541 		}
542 #ifdef DEV_NETMAP
543 		/*
544 		 * In netmap mode, set the map for the packet buffer.
545 		 * NOTE: Some drivers (not this one) also need to set
546 		 * the physical buffer address in the NIC ring.
547 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
548 		 * netmap slot index, si
549 		 */
550 		if (slot) {
551 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
552 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
553 		}
554 #endif /* DEV_NETMAP */
555 		/* Clear the EOP index */
556 		buf->eop_index = -1;
557         }
558 
559 	/* Set number of descriptors available */
560 	txr->avail = que->num_desc;
561 
562 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
563 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
564 	IXL_TX_UNLOCK(txr);
565 }
566 
567 
568 /*********************************************************************
569  *
570  *  Free transmit ring related data structures.
571  *
572  **********************************************************************/
573 void
574 ixl_free_que_tx(struct ixl_queue *que)
575 {
576 	struct tx_ring *txr = &que->txr;
577 	struct ixl_tx_buf *buf;
578 
579 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
580 
581 	for (int i = 0; i < que->num_desc; i++) {
582 		buf = &txr->buffers[i];
583 		if (buf->m_head != NULL) {
584 			bus_dmamap_sync(buf->tag, buf->map,
585 			    BUS_DMASYNC_POSTWRITE);
586 			bus_dmamap_unload(buf->tag,
587 			    buf->map);
588 			m_freem(buf->m_head);
589 			buf->m_head = NULL;
590 			if (buf->map != NULL) {
591 				bus_dmamap_destroy(buf->tag,
592 				    buf->map);
593 				buf->map = NULL;
594 			}
595 		} else if (buf->map != NULL) {
596 			bus_dmamap_unload(buf->tag,
597 			    buf->map);
598 			bus_dmamap_destroy(buf->tag,
599 			    buf->map);
600 			buf->map = NULL;
601 		}
602 	}
603 	if (txr->br != NULL)
604 		buf_ring_free(txr->br, M_DEVBUF);
605 	if (txr->buffers != NULL) {
606 		free(txr->buffers, M_DEVBUF);
607 		txr->buffers = NULL;
608 	}
609 	if (txr->tx_tag != NULL) {
610 		bus_dma_tag_destroy(txr->tx_tag);
611 		txr->tx_tag = NULL;
612 	}
613 	if (txr->tso_tag != NULL) {
614 		bus_dma_tag_destroy(txr->tso_tag);
615 		txr->tso_tag = NULL;
616 	}
617 
618 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
619 	return;
620 }
621 
622 /*********************************************************************
623  *
624  *  Setup descriptor for hw offloads
625  *
626  **********************************************************************/
627 
628 static int
629 ixl_tx_setup_offload(struct ixl_queue *que,
630     struct mbuf *mp, u32 *cmd, u32 *off)
631 {
632 	struct ether_vlan_header	*eh;
633 #ifdef INET
634 	struct ip			*ip = NULL;
635 #endif
636 	struct tcphdr			*th = NULL;
637 #ifdef INET6
638 	struct ip6_hdr			*ip6;
639 #endif
640 	int				elen, ip_hlen = 0, tcp_hlen;
641 	u16				etype;
642 	u8				ipproto = 0;
643 	bool				tso = FALSE;
644 
645 	/* Set up the TSO context descriptor if required */
646 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
647 		tso = ixl_tso_setup(que, mp);
648 		if (tso)
649 			++que->tso;
650 		else
651 			return (ENXIO);
652 	}
653 
654 	/*
655 	 * Determine where frame payload starts.
656 	 * Jump over vlan headers if already present,
657 	 * helpful for QinQ too.
658 	 */
659 	eh = mtod(mp, struct ether_vlan_header *);
660 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
661 		etype = ntohs(eh->evl_proto);
662 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
663 	} else {
664 		etype = ntohs(eh->evl_encap_proto);
665 		elen = ETHER_HDR_LEN;
666 	}
667 
668 	switch (etype) {
669 #ifdef INET
670 		case ETHERTYPE_IP:
671 			ip = (struct ip *)(mp->m_data + elen);
672 			ip_hlen = ip->ip_hl << 2;
673 			ipproto = ip->ip_p;
674 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
675 			/* The IP checksum must be recalculated with TSO */
676 			if (tso)
677 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
678 			else
679 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
680 			break;
681 #endif
682 #ifdef INET6
683 		case ETHERTYPE_IPV6:
684 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
685 			ip_hlen = sizeof(struct ip6_hdr);
686 			ipproto = ip6->ip6_nxt;
687 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
688 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
689 			break;
690 #endif
691 		default:
692 			break;
693 	}
694 
695 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
696 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
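	/*
	 * The offset fields are expressed in words: MACLEN in 2-byte units
	 * (hence elen >> 1), IPLEN in 4-byte units (ip_hlen >> 2), as is the
	 * L4 length programmed below.  For a plain IPv4/TCP frame this works
	 * out to MACLEN = 14 >> 1 = 7 and IPLEN = 20 >> 2 = 5.
	 */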
697 
698 	switch (ipproto) {
699 		case IPPROTO_TCP:
700 			tcp_hlen = th->th_off << 2;
701 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
702 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
703 				*off |= (tcp_hlen >> 2) <<
704 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
705 			}
706 #ifdef IXL_FDIR
707 			ixl_atr(que, th, etype);
708 #endif
709 			break;
710 		case IPPROTO_UDP:
711 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
712 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
713 				*off |= (sizeof(struct udphdr) >> 2) <<
714 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
715 			}
716 			break;
717 
718 		case IPPROTO_SCTP:
719 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
720 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
721 				*off |= (sizeof(struct sctphdr) >> 2) <<
722 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
723 			}
724 			/* Fall Thru */
725 		default:
726 			break;
727 	}
728 
729         return (0);
730 }
731 
732 
733 /**********************************************************************
734  *
735  *  Setup context for hardware segmentation offload (TSO)
736  *
737  **********************************************************************/
738 static bool
739 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
740 {
741 	struct tx_ring			*txr = &que->txr;
742 	struct i40e_tx_context_desc	*TXD;
743 	struct ixl_tx_buf		*buf;
744 	u32				cmd, mss, type, tsolen;
745 	u16				etype;
746 	int				idx, elen, ip_hlen, tcp_hlen;
747 	struct ether_vlan_header	*eh;
748 #ifdef INET
749 	struct ip			*ip;
750 #endif
751 #ifdef INET6
752 	struct ip6_hdr			*ip6;
753 #endif
754 #if defined(INET6) || defined(INET)
755 	struct tcphdr			*th;
756 #endif
757 	u64				type_cmd_tso_mss;
758 
759 	/*
760 	 * Determine where frame payload starts.
761 	 * Jump over vlan headers if already present
762 	 */
763 	eh = mtod(mp, struct ether_vlan_header *);
764 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
765 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
766 		etype = eh->evl_proto;
767 	} else {
768 		elen = ETHER_HDR_LEN;
769 		etype = eh->evl_encap_proto;
770 	}
771 
772         switch (ntohs(etype)) {
773 #ifdef INET6
774 	case ETHERTYPE_IPV6:
775 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
776 		if (ip6->ip6_nxt != IPPROTO_TCP)
777 			return (FALSE);
778 		ip_hlen = sizeof(struct ip6_hdr);
779 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
780 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
781 		tcp_hlen = th->th_off << 2;
782 		/*
783 		 * The corresponding flag is set by the stack in the IPv4
784 		 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
785 		 * So, set it here because the rest of the flow requires it.
786 		 */
787 		mp->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
788 		break;
789 #endif
790 #ifdef INET
791 	case ETHERTYPE_IP:
792 		ip = (struct ip *)(mp->m_data + elen);
793 		if (ip->ip_p != IPPROTO_TCP)
794 			return (FALSE);
795 		ip->ip_sum = 0;
796 		ip_hlen = ip->ip_hl << 2;
797 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
798 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
799 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
800 		tcp_hlen = th->th_off << 2;
801 		break;
802 #endif
803 	default:
804 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)\n",
805 		    __func__, ntohs(etype));
806 		return FALSE;
807         }
808 
809         /* Ensure we have at least the IP+TCP header in the first mbuf. */
810         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
811 		return FALSE;
812 
813 	idx = txr->next_avail;
814 	buf = &txr->buffers[idx];
815 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
816 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
817 
818 	type = I40E_TX_DESC_DTYPE_CONTEXT;
819 	cmd = I40E_TX_CTX_DESC_TSO;
820 	/* ERJ: this must not be less than 64 */
821 	mss = mp->m_pkthdr.tso_segsz;
822 
823 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
824 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
825 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
826 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
827 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
828 
829 	TXD->tunneling_params = htole32(0);
830 	buf->m_head = NULL;
831 	buf->eop_index = -1;
832 
833 	if (++idx == que->num_desc)
834 		idx = 0;
835 
836 	txr->avail--;
837 	txr->next_avail = idx;
838 
839 	return TRUE;
840 }
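
/*
 * Note that the context descriptor built above consumes one slot in the
 * same ring as the data descriptors (txr->avail is decremented and
 * next_avail advanced), so ixl_xmit() places the data descriptors for the
 * TSO payload immediately after it.
 */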
841 
842 /*
843 ** ixl_get_tx_head - Retrieve the value from the
844 **    location where the HW records its HEAD index
845 */
846 static inline u32
847 ixl_get_tx_head(struct ixl_queue *que)
848 {
849 	struct tx_ring  *txr = &que->txr;
850 	void *head = &txr->base[que->num_desc];
851 	return LE32_TO_CPU(*(volatile __le32 *)head);
852 }
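
/*
 * With head write-back enabled (as the queue setup code is expected to
 * arrange), the hardware periodically DMAs its current head index into host
 * memory just past the last descriptor, i.e. at &txr->base[que->num_desc],
 * which is the location read above; ixl_txeof() compares against this value
 * instead of polling a "done" bit in each descriptor.
 */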
853 
854 /**********************************************************************
855  *
856  *  Examine each tx_buffer in the used queue. If the hardware is done
857  *  processing the packet then free associated resources. The
858  *  tx_buffer is put back on the free queue.
859  *
860  **********************************************************************/
861 bool
862 ixl_txeof(struct ixl_queue *que)
863 {
864 	struct tx_ring		*txr = &que->txr;
865 	u32			first, last, head, done, processed;
866 	struct ixl_tx_buf	*buf;
867 	struct i40e_tx_desc	*tx_desc, *eop_desc;
868 
869 
870 	mtx_assert(&txr->mtx, MA_OWNED);
871 
872 #ifdef DEV_NETMAP
873 	// XXX todo: implement moderation
874 	if (netmap_tx_irq(que->vsi->ifp, que->me))
875 		return FALSE;
876 #endif /* DEV_NETMAP */
877 
878 	/* These are not the descriptors you seek, move along :) */
879 	if (txr->avail == que->num_desc) {
880 		que->busy = 0;
881 		return FALSE;
882 	}
883 
884 	processed = 0;
885 	first = txr->next_to_clean;
886 	buf = &txr->buffers[first];
887 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
888 	last = buf->eop_index;
889 	if (last == -1)
890 		return FALSE;
891 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
892 
893 	/* Get the Head WB value */
894 	head = ixl_get_tx_head(que);
895 
896 	/*
897 	** Get the index of the first descriptor
898 	** BEYOND the EOP and call that 'done'.
899 	** I do this so the comparison in the
900 	** inner while loop below can be simple
901 	*/
902 	if (++last == que->num_desc) last = 0;
903 	done = last;
904 
905         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
906             BUS_DMASYNC_POSTREAD);
907 	/*
908 	** The HEAD index of the ring is written to a
909 	** defined location; this, rather than a done bit,
910 	** is what is used to keep track of what must be
911 	** 'cleaned'.
912 	*/
913 	while (first != head) {
914 		/* We clean the range of the packet */
915 		while (first != done) {
916 			++txr->avail;
917 			++processed;
918 
919 			if (buf->m_head) {
920 				txr->bytes += /* for ITR adjustment */
921 				    buf->m_head->m_pkthdr.len;
922 				txr->tx_bytes += /* for TX stats */
923 				    buf->m_head->m_pkthdr.len;
924 				bus_dmamap_sync(buf->tag,
925 				    buf->map,
926 				    BUS_DMASYNC_POSTWRITE);
927 				bus_dmamap_unload(buf->tag,
928 				    buf->map);
929 				m_freem(buf->m_head);
930 				buf->m_head = NULL;
931 				buf->map = NULL;
932 			}
933 			buf->eop_index = -1;
934 
935 			if (++first == que->num_desc)
936 				first = 0;
937 
938 			buf = &txr->buffers[first];
939 			tx_desc = &txr->base[first];
940 		}
941 		++txr->packets;
942 		/* See if there is more work now */
943 		last = buf->eop_index;
944 		if (last != -1) {
945 			eop_desc = &txr->base[last];
946 			/* Get next done point */
947 			if (++last == que->num_desc) last = 0;
948 			done = last;
949 		} else
950 			break;
951 	}
952 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
953 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
954 
955 	txr->next_to_clean = first;
956 
957 
958 	/*
959 	** Hang detection: we know there's work
960 	** outstanding, or the first return above
961 	** would have been taken, so indicate an
962 	** unsuccessful pass. In local_timer, if
963 	** the value grows too large the queue will
964 	** be considered hung. If anything has been
965 	** cleaned, reset the state.
966 	*/
967 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
968 		++que->busy;
969 
970 	if (processed)
971 		que->busy = 1; /* Note this turns off HUNG */
972 
973 	/*
974 	 * If there are no pending descriptors, clear the timeout.
975 	 */
976 	if (txr->avail == que->num_desc) {
977 		que->busy = 0;
978 		return FALSE;
979 	}
980 
981 	return TRUE;
982 }
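
/*
 * que->busy acts as a coarse watchdog counter: 0 means the ring is idle,
 * 1 means work is outstanding and still progressing, and each pass above
 * that cleans nothing increments it; the timer routine is assumed to
 * compare it against IXL_QUEUE_HUNG before declaring the queue hung.
 */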
983 
984 /*********************************************************************
985  *
986  *  Refresh mbuf buffers for RX descriptor rings
987  *   - now keeps its own state, so discards due to resource
988  *     exhaustion are unnecessary; if an mbuf cannot be obtained
989  *     it just returns, keeping its placeholder, so it can simply
990  *     be called again later to retry.
991  *
992  **********************************************************************/
993 static void
994 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
995 {
996 	struct ixl_vsi		*vsi = que->vsi;
997 	struct rx_ring		*rxr = &que->rxr;
998 	bus_dma_segment_t	hseg[1];
999 	bus_dma_segment_t	pseg[1];
1000 	struct ixl_rx_buf	*buf;
1001 	struct mbuf		*mh, *mp;
1002 	int			i, j, nsegs, error;
1003 	bool			refreshed = FALSE;
1004 
1005 	i = j = rxr->next_refresh;
1006 	/* Control the loop with one beyond */
1007 	if (++j == que->num_desc)
1008 		j = 0;
1009 
1010 	while (j != limit) {
1011 		buf = &rxr->buffers[i];
1012 		if (rxr->hdr_split == FALSE)
1013 			goto no_split;
1014 
1015 		if (buf->m_head == NULL) {
1016 			mh = m_gethdr(M_NOWAIT, MT_DATA);
1017 			if (mh == NULL)
1018 				goto update;
1019 		} else
1020 			mh = buf->m_head;
1021 
1022 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1024 		mh->m_flags |= M_PKTHDR;
1025 		/* Get the memory mapping */
1026 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1027 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1028 		if (error != 0) {
1029 			printf("Refresh mbufs: hdr dmamap load"
1030 			    " failure - %d\n", error);
1031 			m_free(mh);
1032 			buf->m_head = NULL;
1033 			goto update;
1034 		}
1035 		buf->m_head = mh;
1036 		bus_dmamap_sync(rxr->htag, buf->hmap,
1037 		    BUS_DMASYNC_PREREAD);
1038 		rxr->base[i].read.hdr_addr =
1039 		   htole64(hseg[0].ds_addr);
1040 
1041 no_split:
1042 		if (buf->m_pack == NULL) {
1043 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1044 			    M_PKTHDR, rxr->mbuf_sz);
1045 			if (mp == NULL)
1046 				goto update;
1047 		} else
1048 			mp = buf->m_pack;
1049 
1050 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1051 		/* Get the memory mapping */
1052 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1053 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1054 		if (error != 0) {
1055 			printf("Refresh mbufs: payload dmamap load"
1056 			    " failure - %d\n", error);
1057 			m_free(mp);
1058 			buf->m_pack = NULL;
1059 			goto update;
1060 		}
1061 		buf->m_pack = mp;
1062 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1063 		    BUS_DMASYNC_PREREAD);
1064 		rxr->base[i].read.pkt_addr =
1065 		   htole64(pseg[0].ds_addr);
1066 		/* Used only when doing header split */
1067 		rxr->base[i].read.hdr_addr = 0;
1068 
1069 		refreshed = TRUE;
1070 		/* Next is precalculated */
1071 		i = j;
1072 		rxr->next_refresh = i;
1073 		if (++j == que->num_desc)
1074 			j = 0;
1075 	}
1076 update:
1077 	if (refreshed) /* Update hardware tail index */
1078 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1079 	return;
1080 }
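
/*
 * The loop above runs 'j' one slot ahead of 'i': descriptors are refreshed
 * starting at next_refresh, but next_refresh is never advanced onto 'limit'
 * (the caller's current ring position), and the final value is what gets
 * written to the hardware tail register.
 */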
1081 
1082 
1083 /*********************************************************************
1084  *
1085  *  Allocate memory for rx_buffer structures. Since we use one
1086  *  rx_buffer per descriptor, the maximum number of rx_buffers
1087  *  that we'll need is equal to the number of receive descriptors
1088  *  that we've defined.
1089  *
1090  **********************************************************************/
1091 int
1092 ixl_allocate_rx_data(struct ixl_queue *que)
1093 {
1094 	struct rx_ring		*rxr = &que->rxr;
1095 	struct ixl_vsi		*vsi = que->vsi;
1096 	device_t 		dev = vsi->dev;
1097 	struct ixl_rx_buf 	*buf;
1098 	int             	i, bsize, error;
1099 
1100 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1101 	if (!(rxr->buffers =
1102 	    (struct ixl_rx_buf *) malloc(bsize,
1103 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1104 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1105 		error = ENOMEM;
1106 		return (error);
1107 	}
1108 
1109 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1110 				   1, 0,	/* alignment, bounds */
1111 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1112 				   BUS_SPACE_MAXADDR,	/* highaddr */
1113 				   NULL, NULL,		/* filter, filterarg */
1114 				   MSIZE,		/* maxsize */
1115 				   1,			/* nsegments */
1116 				   MSIZE,		/* maxsegsize */
1117 				   0,			/* flags */
1118 				   NULL,		/* lockfunc */
1119 				   NULL,		/* lockfuncarg */
1120 				   &rxr->htag))) {
1121 		device_printf(dev, "Unable to create RX DMA htag\n");
1122 		return (error);
1123 	}
1124 
1125 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1126 				   1, 0,	/* alignment, bounds */
1127 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1128 				   BUS_SPACE_MAXADDR,	/* highaddr */
1129 				   NULL, NULL,		/* filter, filterarg */
1130 				   MJUM16BYTES,		/* maxsize */
1131 				   1,			/* nsegments */
1132 				   MJUM16BYTES,		/* maxsegsize */
1133 				   0,			/* flags */
1134 				   NULL,		/* lockfunc */
1135 				   NULL,		/* lockfuncarg */
1136 				   &rxr->ptag))) {
1137 		device_printf(dev, "Unable to create RX DMA ptag\n");
1138 		return (error);
1139 	}
1140 
1141 	for (i = 0; i < que->num_desc; i++) {
1142 		buf = &rxr->buffers[i];
1143 		error = bus_dmamap_create(rxr->htag,
1144 		    BUS_DMA_NOWAIT, &buf->hmap);
1145 		if (error) {
1146 			device_printf(dev, "Unable to create RX head map\n");
1147 			break;
1148 		}
1149 		error = bus_dmamap_create(rxr->ptag,
1150 		    BUS_DMA_NOWAIT, &buf->pmap);
1151 		if (error) {
1152 			device_printf(dev, "Unable to create RX pkt map\n");
1153 			break;
1154 		}
1155 	}
1156 
1157 	return (error);
1158 }
1159 
1160 
1161 /*********************************************************************
1162  *
1163  *  (Re)Initialize the queue receive ring and its buffers.
1164  *
1165  **********************************************************************/
1166 int
1167 ixl_init_rx_ring(struct ixl_queue *que)
1168 {
1169 	struct	rx_ring 	*rxr = &que->rxr;
1170 	struct ixl_vsi		*vsi = que->vsi;
1171 #if defined(INET6) || defined(INET)
1172 	struct ifnet		*ifp = vsi->ifp;
1173 	struct lro_ctrl		*lro = &rxr->lro;
1174 #endif
1175 	struct ixl_rx_buf	*buf;
1176 	bus_dma_segment_t	pseg[1], hseg[1];
1177 	int			rsize, nsegs, error = 0;
1178 #ifdef DEV_NETMAP
1179 	struct netmap_adapter *na = NA(que->vsi->ifp);
1180 	struct netmap_slot *slot;
1181 #endif /* DEV_NETMAP */
1182 
1183 	IXL_RX_LOCK(rxr);
1184 #ifdef DEV_NETMAP
1185 	/* same as in ixl_init_tx_ring() */
1186 	slot = netmap_reset(na, NR_RX, que->me, 0);
1187 #endif /* DEV_NETMAP */
1188 	/* Clear the ring contents */
1189 	rsize = roundup2(que->num_desc *
1190 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1191 	bzero((void *)rxr->base, rsize);
1192 	/* Cleanup any existing buffers */
1193 	for (int i = 0; i < que->num_desc; i++) {
1194 		buf = &rxr->buffers[i];
1195 		if (buf->m_head != NULL) {
1196 			bus_dmamap_sync(rxr->htag, buf->hmap,
1197 			    BUS_DMASYNC_POSTREAD);
1198 			bus_dmamap_unload(rxr->htag, buf->hmap);
1199 			buf->m_head->m_flags |= M_PKTHDR;
1200 			m_freem(buf->m_head);
1201 		}
1202 		if (buf->m_pack != NULL) {
1203 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1204 			    BUS_DMASYNC_POSTREAD);
1205 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1206 			buf->m_pack->m_flags |= M_PKTHDR;
1207 			m_freem(buf->m_pack);
1208 		}
1209 		buf->m_head = NULL;
1210 		buf->m_pack = NULL;
1211 	}
1212 
1213 	/* header split is off */
1214 	rxr->hdr_split = FALSE;
1215 
1216 	/* Now replenish the mbufs */
1217 	for (int j = 0; j != que->num_desc; ++j) {
1218 		struct mbuf	*mh, *mp;
1219 
1220 		buf = &rxr->buffers[j];
1221 #ifdef DEV_NETMAP
1222 		/*
1223 		 * In netmap mode, fill the map and set the buffer
1224 		 * address in the NIC ring, considering the offset
1225 		 * between the netmap and NIC rings (see comment in
1226 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1227 		 * an mbuf, so end the block with a continue;
1228 		 */
1229 		if (slot) {
1230 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1231 			uint64_t paddr;
1232 			void *addr;
1233 
1234 			addr = PNMB(na, slot + sj, &paddr);
1235 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1236 			/* Update descriptor and the cached value */
1237 			rxr->base[j].read.pkt_addr = htole64(paddr);
1238 			rxr->base[j].read.hdr_addr = 0;
1239 			continue;
1240 		}
1241 #endif /* DEV_NETMAP */
1242 		/*
1243 		** Don't allocate mbufs if not
1244 		** doing header split; it's wasteful
1245 		*/
1246 		if (rxr->hdr_split == FALSE)
1247 			goto skip_head;
1248 
1249 		/* First the header */
1250 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1251 		if (buf->m_head == NULL) {
1252 			error = ENOBUFS;
1253 			goto fail;
1254 		}
1255 		m_adj(buf->m_head, ETHER_ALIGN);
1256 		mh = buf->m_head;
1257 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1258 		mh->m_flags |= M_PKTHDR;
1259 		/* Get the memory mapping */
1260 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1261 		    buf->hmap, buf->m_head, hseg,
1262 		    &nsegs, BUS_DMA_NOWAIT);
1263 		if (error != 0) /* Nothing elegant to do here */
1264 			goto fail;
1265 		bus_dmamap_sync(rxr->htag,
1266 		    buf->hmap, BUS_DMASYNC_PREREAD);
1267 		/* Update descriptor */
1268 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1269 
1270 skip_head:
1271 		/* Now the payload cluster */
1272 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1273 		    M_PKTHDR, rxr->mbuf_sz);
1274 		if (buf->m_pack == NULL) {
1275 			error = ENOBUFS;
1276                         goto fail;
1277 		}
1278 		mp = buf->m_pack;
1279 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1280 		/* Get the memory mapping */
1281 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1282 		    buf->pmap, mp, pseg,
1283 		    &nsegs, BUS_DMA_NOWAIT);
1284 		if (error != 0)
1285                         goto fail;
1286 		bus_dmamap_sync(rxr->ptag,
1287 		    buf->pmap, BUS_DMASYNC_PREREAD);
1288 		/* Update descriptor */
1289 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1290 		rxr->base[j].read.hdr_addr = 0;
1291 	}
1292 
1293 
1294 	/* Setup our descriptor indices */
1295 	rxr->next_check = 0;
1296 	rxr->next_refresh = 0;
1297 	rxr->lro_enabled = FALSE;
1298 	rxr->split = 0;
1299 	rxr->bytes = 0;
1300 	rxr->discard = FALSE;
1301 
1302 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1303 	ixl_flush(vsi->hw);
1304 
1305 #if defined(INET6) || defined(INET)
1306 	/*
1307 	** Now set up the LRO interface:
1308 	*/
1309 	if (ifp->if_capenable & IFCAP_LRO) {
1310 		int err = tcp_lro_init(lro);
1311 		if (err) {
1312 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1313 			goto fail;
1314 		}
1315 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1316 		rxr->lro_enabled = TRUE;
1317 		lro->ifp = vsi->ifp;
1318 	}
1319 #endif
1320 
1321 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1322 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1323 
1324 fail:
1325 	IXL_RX_UNLOCK(rxr);
1326 	return (error);
1327 }
1328 
1329 
1330 /*********************************************************************
1331  *
1332  *  Free station receive ring data structures
1333  *
1334  **********************************************************************/
1335 void
1336 ixl_free_que_rx(struct ixl_queue *que)
1337 {
1338 	struct rx_ring		*rxr = &que->rxr;
1339 	struct ixl_rx_buf	*buf;
1340 
1341 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1342 
1343 	/* Cleanup any existing buffers */
1344 	if (rxr->buffers != NULL) {
1345 		for (int i = 0; i < que->num_desc; i++) {
1346 			buf = &rxr->buffers[i];
1347 			if (buf->m_head != NULL) {
1348 				bus_dmamap_sync(rxr->htag, buf->hmap,
1349 				    BUS_DMASYNC_POSTREAD);
1350 				bus_dmamap_unload(rxr->htag, buf->hmap);
1351 				buf->m_head->m_flags |= M_PKTHDR;
1352 				m_freem(buf->m_head);
1353 			}
1354 			if (buf->m_pack != NULL) {
1355 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1356 				    BUS_DMASYNC_POSTREAD);
1357 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1358 				buf->m_pack->m_flags |= M_PKTHDR;
1359 				m_freem(buf->m_pack);
1360 			}
1361 			buf->m_head = NULL;
1362 			buf->m_pack = NULL;
1363 			if (buf->hmap != NULL) {
1364 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1365 				buf->hmap = NULL;
1366 			}
1367 			if (buf->pmap != NULL) {
1368 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1369 				buf->pmap = NULL;
1370 			}
1371 		}
1372 		if (rxr->buffers != NULL) {
1373 			free(rxr->buffers, M_DEVBUF);
1374 			rxr->buffers = NULL;
1375 		}
1376 	}
1377 
1378 	if (rxr->htag != NULL) {
1379 		bus_dma_tag_destroy(rxr->htag);
1380 		rxr->htag = NULL;
1381 	}
1382 	if (rxr->ptag != NULL) {
1383 		bus_dma_tag_destroy(rxr->ptag);
1384 		rxr->ptag = NULL;
1385 	}
1386 
1387 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1388 	return;
1389 }
1390 
1391 static inline void
1392 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1393 {
1394 
1395 #if defined(INET6) || defined(INET)
1396         /*
1397          * At the moment, LRO is only done for IPv4/TCP packets whose TCP
1398          * checksum has been verified by the hardware, and the packet must
1399          * not carry a VLAN tag in its ethernet header.
1400          */
1401         if (rxr->lro_enabled &&
1402             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1403             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1404             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1405                 /*
1406                  * Send to the stack if:
1407                  **  - LRO not enabled, or
1408                  **  - no LRO resources, or
1409                  **  - lro enqueue fails
1410                  */
1411                 if (rxr->lro.lro_cnt != 0)
1412                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1413                                 return;
1414         }
1415 #endif
1416 	IXL_RX_UNLOCK(rxr);
1417         (*ifp->if_input)(ifp, m);
1418 	IXL_RX_LOCK(rxr);
1419 }
1420 
1421 
1422 static inline void
1423 ixl_rx_discard(struct rx_ring *rxr, int i)
1424 {
1425 	struct ixl_rx_buf	*rbuf;
1426 
1427 	rbuf = &rxr->buffers[i];
1428 
1429         if (rbuf->fmp != NULL) {/* Partial chain ? */
1430 		rbuf->fmp->m_flags |= M_PKTHDR;
1431                 m_freem(rbuf->fmp);
1432                 rbuf->fmp = NULL;
1433 	}
1434 
1435 	/*
1436 	** With advanced descriptors the writeback
1437 	** clobbers the buffer addrs, so it's easier
1438 	** to just free the existing mbufs and take
1439 	** the normal refresh path to get new buffers
1440 	** and mapping.
1441 	*/
1442 	if (rbuf->m_head) {
1443 		m_free(rbuf->m_head);
1444 		rbuf->m_head = NULL;
1445 	}
1446 
1447 	if (rbuf->m_pack) {
1448 		m_free(rbuf->m_pack);
1449 		rbuf->m_pack = NULL;
1450 	}
1451 
1452 	return;
1453 }
1454 
1455 #ifdef RSS
1456 /*
1457 ** ixl_ptype_to_hash: parse the packet type
1458 ** to determine the appropriate hash.
1459 */
1460 static inline int
1461 ixl_ptype_to_hash(u8 ptype)
1462 {
1463         struct i40e_rx_ptype_decoded	decoded;
1464 	u8				ex = 0;
1465 
1466 	decoded = decode_rx_desc_ptype(ptype);
1467 	ex = decoded.outer_frag;
1468 
1469 	if (!decoded.known)
1470 		return M_HASHTYPE_OPAQUE_HASH;
1471 
1472 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_L2)
1473 		return M_HASHTYPE_OPAQUE_HASH;
1474 
1475 	/* Note: anything that gets to this point is IP */
1476         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
1477 		switch (decoded.inner_prot) {
1478 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1479 				if (ex)
1480 					return M_HASHTYPE_RSS_TCP_IPV6_EX;
1481 				else
1482 					return M_HASHTYPE_RSS_TCP_IPV6;
1483 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1484 				if (ex)
1485 					return M_HASHTYPE_RSS_UDP_IPV6_EX;
1486 				else
1487 					return M_HASHTYPE_RSS_UDP_IPV6;
1488 			default:
1489 				if (ex)
1490 					return M_HASHTYPE_RSS_IPV6_EX;
1491 				else
1492 					return M_HASHTYPE_RSS_IPV6;
1493 		}
1494 	}
1495         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1496 		switch (decoded.inner_prot) {
1497 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1498 					return M_HASHTYPE_RSS_TCP_IPV4;
1499 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1500 				if (ex)
1501 					return M_HASHTYPE_RSS_UDP_IPV4_EX;
1502 				else
1503 					return M_HASHTYPE_RSS_UDP_IPV4;
1504 			default:
1505 					return M_HASHTYPE_RSS_IPV4;
1506 		}
1507 	}
1508 	/* We should never get here!! */
1509 	return M_HASHTYPE_OPAQUE_HASH;
1510 }
1511 #endif /* RSS */
1512 
1513 /*********************************************************************
1514  *
1515  *  This routine executes in interrupt context. It replenishes
1516  *  the mbufs in the descriptor ring and sends data which has been
1517  *  DMA'd into host memory to the upper layer.
1518  *
1519  *  We loop at most count times if count is > 0, or until done if
1520  *  count < 0.
1521  *
1522  *  Return TRUE for more work, FALSE for all clean.
1523  *********************************************************************/
1524 bool
1525 ixl_rxeof(struct ixl_queue *que, int count)
1526 {
1527 	struct ixl_vsi		*vsi = que->vsi;
1528 	struct rx_ring		*rxr = &que->rxr;
1529 	struct ifnet		*ifp = vsi->ifp;
1530 #if defined(INET6) || defined(INET)
1531 	struct lro_ctrl		*lro = &rxr->lro;
1532 #endif
1533 	int			i, nextp, processed = 0;
1534 	union i40e_rx_desc	*cur;
1535 	struct ixl_rx_buf	*rbuf, *nbuf;
1536 
1537 
1538 	IXL_RX_LOCK(rxr);
1539 
1540 #ifdef DEV_NETMAP
1541 	if (netmap_rx_irq(ifp, que->me, &count)) {
1542 		IXL_RX_UNLOCK(rxr);
1543 		return (FALSE);
1544 	}
1545 #endif /* DEV_NETMAP */
1546 
1547 	for (i = rxr->next_check; count != 0;) {
1548 		struct mbuf	*sendmp, *mh, *mp;
1549 		u32		status, error;
1550 		u16		hlen, plen, vtag;
1551 		u64		qword;
1552 		u8		ptype;
1553 		bool		eop;
1554 
1555 		/* Sync the ring. */
1556 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1557 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1558 
1559 		cur = &rxr->base[i];
1560 		qword = le64toh(cur->wb.qword1.status_error_len);
1561 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1562 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1563 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1564 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1565 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1566 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1567 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1568 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1569 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1570 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1571 
1572 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1573 			++rxr->not_done;
1574 			break;
1575 		}
1576 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1577 			break;
1578 
1579 		count--;
1580 		sendmp = NULL;
1581 		nbuf = NULL;
1582 		cur->wb.qword1.status_error_len = 0;
1583 		rbuf = &rxr->buffers[i];
1584 		mh = rbuf->m_head;
1585 		mp = rbuf->m_pack;
1586 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1587 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1588 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1589 		else
1590 			vtag = 0;
1591 
1592 		/*
1593 		** Make sure bad packets are discarded;
1594 		** note that only the EOP descriptor has
1595 		** valid error results.
1596 		*/
1597                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1598 			rxr->desc_errs++;
1599 			ixl_rx_discard(rxr, i);
1600 			goto next_desc;
1601 		}
1602 
1603 		/* Prefetch the next buffer */
1604 		if (!eop) {
1605 			nextp = i + 1;
1606 			if (nextp == que->num_desc)
1607 				nextp = 0;
1608 			nbuf = &rxr->buffers[nextp];
1609 			prefetch(nbuf);
1610 		}
1611 
1612 		/*
1613 		** The header mbuf is ONLY used when header
1614 		** split is enabled, otherwise we get normal
1615 		** behavior, ie, both header and payload
1616 		** are DMA'd into the payload buffer.
1617 		**
1618 		** Rather than using the fmp/lmp global pointers
1619 		** we now keep the head of a packet chain in the
1620 		** buffer struct and pass this along from one
1621 		** descriptor to the next, until we get EOP.
1622 		*/
1623 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1624 			if (hlen > IXL_RX_HDR)
1625 				hlen = IXL_RX_HDR;
1626 			mh->m_len = hlen;
1627 			mh->m_flags |= M_PKTHDR;
1628 			mh->m_next = NULL;
1629 			mh->m_pkthdr.len = mh->m_len;
1630 			/* Null buf pointer so it is refreshed */
1631 			rbuf->m_head = NULL;
1632 			/*
1633 			** Check the payload length; this
1634 			** could be zero if it's a small
1635 			** packet.
1636 			*/
1637 			if (plen > 0) {
1638 				mp->m_len = plen;
1639 				mp->m_next = NULL;
1640 				mp->m_flags &= ~M_PKTHDR;
1641 				mh->m_next = mp;
1642 				mh->m_pkthdr.len += mp->m_len;
1643 				/* Null buf pointer so it is refreshed */
1644 				rbuf->m_pack = NULL;
1645 				rxr->split++;
1646 			}
1647 			/*
1648 			** Now create the forward
1649 			** chain, so that when the packet
1650 			** completes we won't have to.
1651 			*/
1652                         if (eop == 0) {
1653 				/* stash the chain head */
1654                                 nbuf->fmp = mh;
1655 				/* Make forward chain */
1656                                 if (plen)
1657                                         mp->m_next = nbuf->m_pack;
1658                                 else
1659                                         mh->m_next = nbuf->m_pack;
1660                         } else {
1661 				/* Singlet, prepare to send */
1662                                 sendmp = mh;
1663                                 if (vtag) {
1664                                         sendmp->m_pkthdr.ether_vtag = vtag;
1665                                         sendmp->m_flags |= M_VLANTAG;
1666                                 }
1667                         }
1668 		} else {
1669 			/*
1670 			** Either no header split, or a
1671 			** secondary piece of a fragmented
1672 			** split packet.
1673 			*/
1674 			mp->m_len = plen;
1675 			/*
1676 			** See if there is a stored chain head
1677 			** that determines what this descriptor is.
1678 			*/
1679 			sendmp = rbuf->fmp;
1680 			rbuf->m_pack = rbuf->fmp = NULL;
1681 
1682 			if (sendmp != NULL) /* secondary frag */
1683 				sendmp->m_pkthdr.len += mp->m_len;
1684 			else {
1685 				/* first desc of a non-ps chain */
1686 				sendmp = mp;
1687 				sendmp->m_flags |= M_PKTHDR;
1688 				sendmp->m_pkthdr.len = mp->m_len;
1689                         }
1690 			/* Pass the head pointer on */
1691 			if (eop == 0) {
1692 				nbuf->fmp = sendmp;
1693 				sendmp = NULL;
1694 				mp->m_next = nbuf->m_pack;
1695 			}
1696 		}
1697 		++processed;
1698 		/* Sending this frame? */
1699 		if (eop) {
1700 			sendmp->m_pkthdr.rcvif = ifp;
1701 			/* gather stats */
1702 			rxr->rx_packets++;
1703 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1704 			/* capture data for dynamic ITR adjustment */
1705 			rxr->packets++;
1706 			rxr->bytes += sendmp->m_pkthdr.len;
1707 			/* Set VLAN tag (field only valid in eop desc) */
1708 			if (vtag) {
1709 				sendmp->m_pkthdr.ether_vtag = vtag;
1710 				sendmp->m_flags |= M_VLANTAG;
1711 			}
1712 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1713 				ixl_rx_checksum(sendmp, status, error, ptype);
1714 #ifdef RSS
1715 			sendmp->m_pkthdr.flowid =
1716 			    le32toh(cur->wb.qword0.hi_dword.rss);
1717 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1718 #else
1719 			sendmp->m_pkthdr.flowid = que->msix;
1720 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1721 #endif
1722 		}
1723 next_desc:
1724 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1725 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1726 
1727 		/* Advance our pointers to the next descriptor. */
1728 		if (++i == que->num_desc)
1729 			i = 0;
1730 
1731 		/* Now send to the stack or do LRO */
1732 		if (sendmp != NULL) {
1733 			rxr->next_check = i;
1734 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1735 			i = rxr->next_check;
1736 		}
1737 
1738                /* Every 8 descriptors we go to refresh mbufs */
1739 		if (processed == 8) {
1740 			ixl_refresh_mbufs(que, i);
1741 			processed = 0;
1742 		}
1743 	}
1744 
1745 	/* Refresh any remaining buf structs */
1746 	if (ixl_rx_unrefreshed(que))
1747 		ixl_refresh_mbufs(que, i);
1748 
1749 	rxr->next_check = i;
1750 
1751 #if defined(INET6) || defined(INET)
1752 	/*
1753 	 * Flush any outstanding LRO work
1754 	 */
1755 	tcp_lro_flush_all(lro);
1756 #endif
1757 
1758 	IXL_RX_UNLOCK(rxr);
1759 	return (FALSE);
1760 }
1761 
1762 
1763 /*********************************************************************
1764  *
1765  *  Verify that the hardware indicated that the checksum is valid.
1766  *  Inform the stack about the status of the checksum so that the stack
1767  *  doesn't spend time verifying the checksum.
1768  *
1769  *********************************************************************/
1770 static void
1771 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1772 {
1773 	struct i40e_rx_ptype_decoded decoded;
1774 
1775 	decoded = decode_rx_desc_ptype(ptype);
1776 
1777 	/* Errors? */
1778  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1779 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1780 		mp->m_pkthdr.csum_flags = 0;
1781 		return;
1782 	}
1783 
1784 	/* IPv6 with extension headers likely have bad csum */
1785 	/* IPv6 packets with extension headers likely have a bad csum */
1786 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1787 		if (status &
1788 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1789 			mp->m_pkthdr.csum_flags = 0;
1790 			return;
1791 		}
1792 
1793 
1794 	/* IP Checksum Good */
1795 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1796 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1797 
1798 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1799 		mp->m_pkthdr.csum_flags |=
1800 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1801 		mp->m_pkthdr.csum_data |= htons(0xffff);
1802 	}
1803 	return;
1804 }
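
/*
 * Setting CSUM_DATA_VALID | CSUM_PSEUDO_HDR with csum_data == 0xffff is the
 * conventional mbuf way of telling the stack that the L4 checksum has
 * already been fully verified, so no software checksum pass is needed.
 */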
1805 
1806 #if __FreeBSD_version >= 1100000
1807 uint64_t
1808 ixl_get_counter(if_t ifp, ift_counter cnt)
1809 {
1810 	struct ixl_vsi *vsi;
1811 
1812 	vsi = if_getsoftc(ifp);
1813 
1814 	switch (cnt) {
1815 	case IFCOUNTER_IPACKETS:
1816 		return (vsi->ipackets);
1817 	case IFCOUNTER_IERRORS:
1818 		return (vsi->ierrors);
1819 	case IFCOUNTER_OPACKETS:
1820 		return (vsi->opackets);
1821 	case IFCOUNTER_OERRORS:
1822 		return (vsi->oerrors);
1823 	case IFCOUNTER_COLLISIONS:
1824 		/* Collisions are by standard impossible in 40G/10G Ethernet */
1825 		return (0);
1826 	case IFCOUNTER_IBYTES:
1827 		return (vsi->ibytes);
1828 	case IFCOUNTER_OBYTES:
1829 		return (vsi->obytes);
1830 	case IFCOUNTER_IMCASTS:
1831 		return (vsi->imcasts);
1832 	case IFCOUNTER_OMCASTS:
1833 		return (vsi->omcasts);
1834 	case IFCOUNTER_IQDROPS:
1835 		return (vsi->iqdrops);
1836 	case IFCOUNTER_OQDROPS:
1837 		return (vsi->oqdrops);
1838 	case IFCOUNTER_NOPROTO:
1839 		return (vsi->noproto);
1840 	default:
1841 		return (if_get_counter_default(ifp, cnt));
1842 	}
1843 }
1844 #endif
1845 
1846