xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision 884d26c84cba3ffc3d4e626306098fcdfe6a0c2b)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was separated to allow usage by
38 ** 	    both the BASE and the VF drivers.
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static __inline void ixl_rx_discard(struct rx_ring *, int);
62 static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 #ifdef DEV_NETMAP
66 #include <dev/netmap/if_ixl_netmap.h>
67 #endif /* DEV_NETMAP */
68 
69 /*
70 ** Multiqueue Transmit driver
71 */
72 int
73 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
74 {
75 	struct ixl_vsi		*vsi = ifp->if_softc;
76 	struct ixl_queue	*que;
77 	struct tx_ring		*txr;
78 	int 			err, i;
79 #ifdef RSS
80 	u32			bucket_id;
81 #endif
82 
83 	/*
84 	** Which queue to use:
85 	**
86 	** When doing RSS, map it to the same outbound
87 	** queue as the incoming flow would be mapped to.
88 	** If everything is setup correctly, it should be
89 	** the same bucket that the current CPU we're on is.
90 	*/
91 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
92 #ifdef  RSS
93 		if (rss_hash2bucket(m->m_pkthdr.flowid,
94 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
95 			i = bucket_id % vsi->num_queues;
96                 } else
97 #endif
98                         i = m->m_pkthdr.flowid % vsi->num_queues;
99         } else
100 		i = curcpu % vsi->num_queues;
101 	/*
102 	** This may not be perfect, but until something
103 	** better comes along it will keep from scheduling
104 	** on stalled queues.
105 	*/
106 	if (((1 << i) & vsi->active_queues) == 0)
107 		i = ffsl(vsi->active_queues);
108 
109 	que = &vsi->queues[i];
110 	txr = &que->txr;
111 
112 	err = drbr_enqueue(ifp, txr->br, m);
113 	if (err)
114 		return (err);
115 	if (IXL_TX_TRYLOCK(txr)) {
116 		ixl_mq_start_locked(ifp, txr);
117 		IXL_TX_UNLOCK(txr);
118 	} else
119 		taskqueue_enqueue(que->tq, &que->tx_task);
120 
121 	return (0);
122 }
123 
124 int
125 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
126 {
127 	struct ixl_queue	*que = txr->que;
128 	struct ixl_vsi		*vsi = que->vsi;
129         struct mbuf		*next;
130         int			err = 0;
131 
132 
133 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
134 	    vsi->link_active == 0)
135 		return (ENETDOWN);
136 
137 	/* Process the transmit queue */
138 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
139 		if ((err = ixl_xmit(que, &next)) != 0) {
140 			if (next == NULL)
141 				drbr_advance(ifp, txr->br);
142 			else
143 				drbr_putback(ifp, txr->br, next);
144 			break;
145 		}
146 		drbr_advance(ifp, txr->br);
147 		/* Send a copy of the frame to the BPF listener */
148 		ETHER_BPF_MTAP(ifp, next);
149 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
150 			break;
151 	}
152 
153 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
154 		ixl_txeof(que);
155 
156 	return (err);
157 }
158 
159 /*
160  * Called from a taskqueue to drain queued transmit packets.
161  */
162 void
163 ixl_deferred_mq_start(void *arg, int pending)
164 {
165 	struct ixl_queue	*que = arg;
166         struct tx_ring		*txr = &que->txr;
167 	struct ixl_vsi		*vsi = que->vsi;
168         struct ifnet		*ifp = vsi->ifp;
169 
170 	IXL_TX_LOCK(txr);
171 	if (!drbr_empty(ifp, txr->br))
172 		ixl_mq_start_locked(ifp, txr);
173 	IXL_TX_UNLOCK(txr);
174 }
175 
176 /*
177 ** Flush all queue ring buffers
178 */
179 void
180 ixl_qflush(struct ifnet *ifp)
181 {
182 	struct ixl_vsi	*vsi = ifp->if_softc;
183 
184         for (int i = 0; i < vsi->num_queues; i++) {
185 		struct ixl_queue *que = &vsi->queues[i];
186 		struct tx_ring	*txr = &que->txr;
187 		struct mbuf	*m;
188 		IXL_TX_LOCK(txr);
189 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
190 			m_freem(m);
191 		IXL_TX_UNLOCK(txr);
192 	}
193 	if_qflush(ifp);
194 }
195 
196 /*
197 ** Find mbuf chains passed to the driver
198 ** that are 'sparse', using more than 8
199 ** mbufs to deliver an mss-size chunk of data
200 */
201 static inline bool
202 ixl_tso_detect_sparse(struct mbuf *mp)
203 {
204 	struct mbuf	*m;
205 	int		num = 0, mss;
206 	bool		ret = FALSE;
207 
208 	mss = mp->m_pkthdr.tso_segsz;
209 	for (m = mp->m_next; m != NULL; m = m->m_next) {
210 		num++;
211 		mss -= m->m_len;
212 		if (mss < 1)
213 			break;
214 		if (m->m_next == NULL)
215 			break;
216 	}
217 	if (num > IXL_SPARSE_CHAIN)
218 		ret = TRUE;
219 
220 	return (ret);
221 }
222 
223 
224 /*********************************************************************
225  *
226  *  This routine maps the mbufs to tx descriptors, allowing the
227  *  TX engine to transmit the packets.
228  *  	- return 0 on success, positive on failure
229  *
230  **********************************************************************/
231 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
232 
233 static int
234 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
235 {
236 	struct ixl_vsi		*vsi = que->vsi;
237 	struct i40e_hw		*hw = vsi->hw;
238 	struct tx_ring		*txr = &que->txr;
239 	struct ixl_tx_buf	*buf;
240 	struct i40e_tx_desc	*txd = NULL;
241 	struct mbuf		*m_head, *m;
242 	int             	i, j, error, nsegs, maxsegs;
243 	int			first, last = 0;
244 	u16			vtag = 0;
245 	u32			cmd, off;
246 	bus_dmamap_t		map;
247 	bus_dma_tag_t		tag;
248 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
249 
250 	cmd = off = 0;
251 	m_head = *m_headp;
252 
253         /*
254          * Important to capture the first descriptor
255          * used because it will contain the index of
256          * the one we tell the hardware to report back
257          */
258         first = txr->next_avail;
259 	buf = &txr->buffers[first];
260 	map = buf->map;
261 	tag = txr->tx_tag;
262 	maxsegs = IXL_MAX_TX_SEGS;
263 
264 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
265 		/* Use larger mapping for TSO */
266 		tag = txr->tso_tag;
267 		maxsegs = IXL_MAX_TSO_SEGS;
268 		if (ixl_tso_detect_sparse(m_head)) {
269 			m = m_defrag(m_head, M_NOWAIT);
270 			if (m == NULL) {
271 				m_freem(*m_headp);
272 				*m_headp = NULL;
273 				return (ENOBUFS);
274 			}
275 			*m_headp = m;
276 		}
277 	}
278 
279 	/*
280 	 * Map the packet for DMA.
281 	 */
282 	error = bus_dmamap_load_mbuf_sg(tag, map,
283 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
284 
285 	if (error == EFBIG) {
286 		struct mbuf *m;
287 
288 		m = m_defrag(*m_headp, M_NOWAIT);
289 		if (m == NULL) {
290 			que->mbuf_defrag_failed++;
291 			m_freem(*m_headp);
292 			*m_headp = NULL;
293 			return (ENOBUFS);
294 		}
295 		*m_headp = m;
296 
297 		/* Try it again */
298 		error = bus_dmamap_load_mbuf_sg(tag, map,
299 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
300 
301 		if (error == ENOMEM) {
302 			que->tx_dma_setup++;
303 			return (error);
304 		} else if (error != 0) {
305 			que->tx_dma_setup++;
306 			m_freem(*m_headp);
307 			*m_headp = NULL;
308 			return (error);
309 		}
310 	} else if (error == ENOMEM) {
311 		que->tx_dma_setup++;
312 		return (error);
313 	} else if (error != 0) {
314 		que->tx_dma_setup++;
315 		m_freem(*m_headp);
316 		*m_headp = NULL;
317 		return (error);
318 	}
319 
320 	/* Make certain there are enough descriptors */
321 	if (nsegs > txr->avail - 2) {
322 		txr->no_desc++;
323 		error = ENOBUFS;
324 		goto xmit_fail;
325 	}
326 	m_head = *m_headp;
327 
328 	/* Set up the TSO/CSUM offload */
329 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
330 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
331 		if (error)
332 			goto xmit_fail;
333 	}
334 
335 	cmd |= I40E_TX_DESC_CMD_ICRC;
336 	/* Grab the VLAN tag */
337 	if (m_head->m_flags & M_VLANTAG) {
338 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
339 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
340 	}
341 
342 	i = txr->next_avail;
343 	for (j = 0; j < nsegs; j++) {
344 		bus_size_t seglen;
345 
346 		buf = &txr->buffers[i];
347 		buf->tag = tag; /* Keep track of the type tag */
348 		txd = &txr->base[i];
349 		seglen = segs[j].ds_len;
350 
351 		txd->buffer_addr = htole64(segs[j].ds_addr);
352 		txd->cmd_type_offset_bsz =
353 		    htole64(I40E_TX_DESC_DTYPE_DATA
354 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
355 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
356 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
357 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
358 
359 		last = i; /* descriptor that will get completion IRQ */
360 
361 		if (++i == que->num_desc)
362 			i = 0;
363 
364 		buf->m_head = NULL;
365 		buf->eop_index = -1;
366 	}
367 	/* Set the last descriptor for report */
368 	txd->cmd_type_offset_bsz |=
369 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
370 	txr->avail -= nsegs;
371 	txr->next_avail = i;
372 
373 	buf->m_head = m_head;
374 	/* Swap the dma map between the first and last descriptor */
375 	txr->buffers[first].map = buf->map;
376 	buf->map = map;
377 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
378 
379         /* Set the index of the descriptor that will be marked done */
380         buf = &txr->buffers[first];
381 	buf->eop_index = last;
382 
383         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
384             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
385 	/*
386 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
387 	 * hardware that this frame is available to transmit.
388 	 */
389 	++txr->total_packets;
390 	wr32(hw, txr->tail, i);
391 
392 	/* Mark outstanding work */
393 	if (que->busy == 0)
394 		que->busy = 1;
395 	return (0);
396 
397 xmit_fail:
398 	bus_dmamap_unload(tag, buf->map);
399 	return (error);
400 }
401 
402 
403 /*********************************************************************
404  *
405  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
406  *  the information needed to transmit a packet on the wire. This is
407  *  called only once at attach, setup is done every reset.
408  *
409  **********************************************************************/
410 int
411 ixl_allocate_tx_data(struct ixl_queue *que)
412 {
413 	struct tx_ring		*txr = &que->txr;
414 	struct ixl_vsi		*vsi = que->vsi;
415 	device_t		dev = vsi->dev;
416 	struct ixl_tx_buf	*buf;
417 	int			error = 0;
418 
419 	/*
420 	 * Setup DMA descriptor areas.
421 	 */
422 	if ((error = bus_dma_tag_create(NULL,		/* parent */
423 			       1, 0,			/* alignment, bounds */
424 			       BUS_SPACE_MAXADDR,	/* lowaddr */
425 			       BUS_SPACE_MAXADDR,	/* highaddr */
426 			       NULL, NULL,		/* filter, filterarg */
427 			       IXL_TSO_SIZE,		/* maxsize */
428 			       IXL_MAX_TX_SEGS,		/* nsegments */
429 			       PAGE_SIZE,		/* maxsegsize */
430 			       0,			/* flags */
431 			       NULL,			/* lockfunc */
432 			       NULL,			/* lockfuncarg */
433 			       &txr->tx_tag))) {
434 		device_printf(dev,"Unable to allocate TX DMA tag\n");
435 		goto fail;
436 	}
437 
438 	/* Make a special tag for TSO */
439 	if ((error = bus_dma_tag_create(NULL,		/* parent */
440 			       1, 0,			/* alignment, bounds */
441 			       BUS_SPACE_MAXADDR,	/* lowaddr */
442 			       BUS_SPACE_MAXADDR,	/* highaddr */
443 			       NULL, NULL,		/* filter, filterarg */
444 			       IXL_TSO_SIZE,		/* maxsize */
445 			       IXL_MAX_TSO_SEGS,	/* nsegments */
446 			       PAGE_SIZE,		/* maxsegsize */
447 			       0,			/* flags */
448 			       NULL,			/* lockfunc */
449 			       NULL,			/* lockfuncarg */
450 			       &txr->tso_tag))) {
451 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
452 		goto fail;
453 	}
454 
455 	if (!(txr->buffers =
456 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
457 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
458 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
459 		error = ENOMEM;
460 		goto fail;
461 	}
462 
463         /* Create the descriptor buffer default dma maps */
464 	buf = txr->buffers;
465 	for (int i = 0; i < que->num_desc; i++, buf++) {
466 		buf->tag = txr->tx_tag;
467 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
468 		if (error != 0) {
469 			device_printf(dev, "Unable to create TX DMA map\n");
470 			goto fail;
471 		}
472 	}
473 fail:
474 	return (error);
475 }
476 
477 
478 /*********************************************************************
479  *
480  *  (Re)Initialize a queue transmit ring.
481  *	- called by init, it clears the descriptor ring,
482  *	  and frees any stale mbufs
483  *
484  **********************************************************************/
485 void
486 ixl_init_tx_ring(struct ixl_queue *que)
487 {
488 #ifdef DEV_NETMAP
489 	struct netmap_adapter *na = NA(que->vsi->ifp);
490 	struct netmap_slot *slot;
491 #endif /* DEV_NETMAP */
492 	struct tx_ring		*txr = &que->txr;
493 	struct ixl_tx_buf	*buf;
494 
495 	/* Clear the old ring contents */
496 	IXL_TX_LOCK(txr);
497 
498 #ifdef DEV_NETMAP
499 	/*
500 	 * (under lock): if in netmap mode, do some consistency
501 	 * checks and set slot to entry 0 of the netmap ring.
502 	 */
503 	slot = netmap_reset(na, NR_TX, que->me, 0);
504 #endif /* DEV_NETMAP */
505 
506 	bzero((void *)txr->base,
507 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
508 
509 	/* Reset indices */
510 	txr->next_avail = 0;
511 	txr->next_to_clean = 0;
512 
513 #ifdef IXL_FDIR
514 	/* Initialize flow director */
515 	txr->atr_rate = ixl_atr_rate;
516 	txr->atr_count = 0;
517 #endif
518 
519 	/* Free any existing tx mbufs. */
520         buf = txr->buffers;
521 	for (int i = 0; i < que->num_desc; i++, buf++) {
522 		if (buf->m_head != NULL) {
523 			bus_dmamap_sync(buf->tag, buf->map,
524 			    BUS_DMASYNC_POSTWRITE);
525 			bus_dmamap_unload(buf->tag, buf->map);
526 			m_freem(buf->m_head);
527 			buf->m_head = NULL;
528 		}
529 #ifdef DEV_NETMAP
530 		/*
531 		 * In netmap mode, set the map for the packet buffer.
532 		 * NOTE: Some drivers (not this one) also need to set
533 		 * the physical buffer address in the NIC ring.
534 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
535 		 * netmap slot index, si
536 		 */
537 		if (slot) {
538 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
539 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
540 		}
541 #endif /* DEV_NETMAP */
542 		/* Clear the EOP index */
543 		buf->eop_index = -1;
544         }
545 
546 	/* Set number of descriptors available */
547 	txr->avail = que->num_desc;
548 
549 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
550 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
551 	IXL_TX_UNLOCK(txr);
552 }
553 
554 
555 /*********************************************************************
556  *
557  *  Free transmit ring related data structures.
558  *
559  **********************************************************************/
560 void
561 ixl_free_que_tx(struct ixl_queue *que)
562 {
563 	struct tx_ring *txr = &que->txr;
564 	struct ixl_tx_buf *buf;
565 
566 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
567 
568 	for (int i = 0; i < que->num_desc; i++) {
569 		buf = &txr->buffers[i];
570 		if (buf->m_head != NULL) {
571 			bus_dmamap_sync(buf->tag, buf->map,
572 			    BUS_DMASYNC_POSTWRITE);
573 			bus_dmamap_unload(buf->tag,
574 			    buf->map);
575 			m_freem(buf->m_head);
576 			buf->m_head = NULL;
577 			if (buf->map != NULL) {
578 				bus_dmamap_destroy(buf->tag,
579 				    buf->map);
580 				buf->map = NULL;
581 			}
582 		} else if (buf->map != NULL) {
583 			bus_dmamap_unload(buf->tag,
584 			    buf->map);
585 			bus_dmamap_destroy(buf->tag,
586 			    buf->map);
587 			buf->map = NULL;
588 		}
589 	}
590 	if (txr->br != NULL)
591 		buf_ring_free(txr->br, M_DEVBUF);
592 	if (txr->buffers != NULL) {
593 		free(txr->buffers, M_DEVBUF);
594 		txr->buffers = NULL;
595 	}
596 	if (txr->tx_tag != NULL) {
597 		bus_dma_tag_destroy(txr->tx_tag);
598 		txr->tx_tag = NULL;
599 	}
600 	if (txr->tso_tag != NULL) {
601 		bus_dma_tag_destroy(txr->tso_tag);
602 		txr->tso_tag = NULL;
603 	}
604 
605 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
606 	return;
607 }
608 
609 /*********************************************************************
610  *
611  *  Setup descriptor for hw offloads
612  *
613  **********************************************************************/
614 
615 static int
616 ixl_tx_setup_offload(struct ixl_queue *que,
617     struct mbuf *mp, u32 *cmd, u32 *off)
618 {
619 	struct ether_vlan_header	*eh;
620 #ifdef INET
621 	struct ip			*ip = NULL;
622 #endif
623 	struct tcphdr			*th = NULL;
624 #ifdef INET6
625 	struct ip6_hdr			*ip6;
626 #endif
627 	int				elen, ip_hlen = 0, tcp_hlen;
628 	u16				etype;
629 	u8				ipproto = 0;
630 	bool				tso = FALSE;
631 
632 	/* Set up the TSO context descriptor if required */
633 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
634 		tso = ixl_tso_setup(que, mp);
635 		if (tso)
636 			++que->tso;
637 		else
638 			return (ENXIO);
639 	}
640 
641 	/*
642 	 * Determine where frame payload starts.
643 	 * Jump over vlan headers if already present,
644 	 * helpful for QinQ too.
645 	 */
646 	eh = mtod(mp, struct ether_vlan_header *);
647 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
648 		etype = ntohs(eh->evl_proto);
649 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
650 	} else {
651 		etype = ntohs(eh->evl_encap_proto);
652 		elen = ETHER_HDR_LEN;
653 	}
654 
655 	switch (etype) {
656 #ifdef INET
657 		case ETHERTYPE_IP:
658 			ip = (struct ip *)(mp->m_data + elen);
659 			ip_hlen = ip->ip_hl << 2;
660 			ipproto = ip->ip_p;
661 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
662 			/* The IP checksum must be recalculated with TSO */
663 			if (tso)
664 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
665 			else
666 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
667 			break;
668 #endif
669 #ifdef INET6
670 		case ETHERTYPE_IPV6:
671 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
672 			ip_hlen = sizeof(struct ip6_hdr);
673 			ipproto = ip6->ip6_nxt;
674 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
675 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
676 			break;
677 #endif
678 		default:
679 			break;
680 	}
681 
682 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
683 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
684 
685 	switch (ipproto) {
686 		case IPPROTO_TCP:
687 			tcp_hlen = th->th_off << 2;
688 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
689 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
690 				*off |= (tcp_hlen >> 2) <<
691 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
692 			}
693 #ifdef IXL_FDIR
694 			ixl_atr(que, th, etype);
695 #endif
696 			break;
697 		case IPPROTO_UDP:
698 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
699 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
700 				*off |= (sizeof(struct udphdr) >> 2) <<
701 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
702 			}
703 			break;
704 
705 		case IPPROTO_SCTP:
706 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
707 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
708 				*off |= (sizeof(struct sctphdr) >> 2) <<
709 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
710 			}
711 			/* Fall Thru */
712 		default:
713 			break;
714 	}
715 
716         return (0);
717 }
718 
719 
720 /**********************************************************************
721  *
722  *  Setup context for hardware segmentation offload (TSO)
723  *
724  **********************************************************************/
725 static bool
726 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
727 {
728 	struct tx_ring			*txr = &que->txr;
729 	struct i40e_tx_context_desc	*TXD;
730 	struct ixl_tx_buf		*buf;
731 	u32				cmd, mss, type, tsolen;
732 	u16				etype;
733 	int				idx, elen, ip_hlen, tcp_hlen;
734 	struct ether_vlan_header	*eh;
735 #ifdef INET
736 	struct ip			*ip;
737 #endif
738 #ifdef INET6
739 	struct ip6_hdr			*ip6;
740 #endif
741 #if defined(INET6) || defined(INET)
742 	struct tcphdr			*th;
743 #endif
744 	u64				type_cmd_tso_mss;
745 
746 	/*
747 	 * Determine where frame payload starts.
748 	 * Jump over vlan headers if already present
749 	 */
750 	eh = mtod(mp, struct ether_vlan_header *);
751 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
752 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
753 		etype = eh->evl_proto;
754 	} else {
755 		elen = ETHER_HDR_LEN;
756 		etype = eh->evl_encap_proto;
757 	}
758 
759         switch (ntohs(etype)) {
760 #ifdef INET6
761 	case ETHERTYPE_IPV6:
762 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
763 		if (ip6->ip6_nxt != IPPROTO_TCP)
764 			return (ENXIO);
765 		ip_hlen = sizeof(struct ip6_hdr);
766 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
767 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
768 		tcp_hlen = th->th_off << 2;
769 		/*
770 		 * The corresponding flag is set by the stack in the IPv4
771 		 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
772 		 * So, set it here because the rest of the flow requires it.
773 		 */
774 		mp->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
775 		break;
776 #endif
777 #ifdef INET
778 	case ETHERTYPE_IP:
779 		ip = (struct ip *)(mp->m_data + elen);
780 		if (ip->ip_p != IPPROTO_TCP)
781 			return (ENXIO);
782 		ip->ip_sum = 0;
783 		ip_hlen = ip->ip_hl << 2;
784 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
785 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
786 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
787 		tcp_hlen = th->th_off << 2;
788 		break;
789 #endif
790 	default:
791 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)",
792 		    __func__, ntohs(etype));
793 		return FALSE;
794         }
795 
796         /* Ensure we have at least the IP+TCP header in the first mbuf. */
797         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
798 		return FALSE;
799 
800 	idx = txr->next_avail;
801 	buf = &txr->buffers[idx];
802 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
803 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
804 
805 	type = I40E_TX_DESC_DTYPE_CONTEXT;
806 	cmd = I40E_TX_CTX_DESC_TSO;
807 	mss = mp->m_pkthdr.tso_segsz;
808 
809 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
810 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
811 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
812 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
813 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
814 
815 	TXD->tunneling_params = htole32(0);
816 	buf->m_head = NULL;
817 	buf->eop_index = -1;
818 
819 	if (++idx == que->num_desc)
820 		idx = 0;
821 
822 	txr->avail--;
823 	txr->next_avail = idx;
824 
825 	return TRUE;
826 }
827 
828 /*
829 ** ixl_get_tx_head - Retrieve the value from the
830 **    location the HW records its HEAD index
831 */
832 static inline u32
833 ixl_get_tx_head(struct ixl_queue *que)
834 {
835 	struct tx_ring  *txr = &que->txr;
836 	void *head = &txr->base[que->num_desc];
837 	return LE32_TO_CPU(*(volatile __le32 *)head);
838 }
839 
840 /**********************************************************************
841  *
842  *  Examine each tx_buffer in the used queue. If the hardware is done
843  *  processing the packet then free associated resources. The
844  *  tx_buffer is put back on the free queue.
845  *
846  **********************************************************************/
847 bool
848 ixl_txeof(struct ixl_queue *que)
849 {
850 	struct tx_ring		*txr = &que->txr;
851 	u32			first, last, head, done, processed;
852 	struct ixl_tx_buf	*buf;
853 	struct i40e_tx_desc	*tx_desc, *eop_desc;
854 
855 
856 	mtx_assert(&txr->mtx, MA_OWNED);
857 
858 #ifdef DEV_NETMAP
859 	// XXX todo: implement moderation
860 	if (netmap_tx_irq(que->vsi->ifp, que->me))
861 		return FALSE;
862 #endif /* DEF_NETMAP */
863 
864 	/* These are not the descriptors you seek, move along :) */
865 	if (txr->avail == que->num_desc) {
866 		que->busy = 0;
867 		return FALSE;
868 	}
869 
870 	processed = 0;
871 	first = txr->next_to_clean;
872 	buf = &txr->buffers[first];
873 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
874 	last = buf->eop_index;
875 	if (last == -1)
876 		return FALSE;
877 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
878 
879 	/* Get the Head WB value */
880 	head = ixl_get_tx_head(que);
881 
882 	/*
883 	** Get the index of the first descriptor
884 	** BEYOND the EOP and call that 'done'.
885 	** I do this so the comparison in the
886 	** inner while loop below can be simple
887 	*/
888 	if (++last == que->num_desc) last = 0;
889 	done = last;
890 
891         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
892             BUS_DMASYNC_POSTREAD);
893 	/*
894 	** The HEAD index of the ring is written in a
895 	** defined location, this rather than a done bit
896 	** is what is used to keep track of what must be
897 	** 'cleaned'.
898 	*/
899 	while (first != head) {
900 		/* We clean the range of the packet */
901 		while (first != done) {
902 			++txr->avail;
903 			++processed;
904 
905 			if (buf->m_head) {
906 				txr->bytes += /* for ITR adjustment */
907 				    buf->m_head->m_pkthdr.len;
908 				txr->tx_bytes += /* for TX stats */
909 				    buf->m_head->m_pkthdr.len;
910 				bus_dmamap_sync(buf->tag,
911 				    buf->map,
912 				    BUS_DMASYNC_POSTWRITE);
913 				bus_dmamap_unload(buf->tag,
914 				    buf->map);
915 				m_freem(buf->m_head);
916 				buf->m_head = NULL;
917 				buf->map = NULL;
918 			}
919 			buf->eop_index = -1;
920 
921 			if (++first == que->num_desc)
922 				first = 0;
923 
924 			buf = &txr->buffers[first];
925 			tx_desc = &txr->base[first];
926 		}
927 		++txr->packets;
928 		/* See if there is more work now */
929 		last = buf->eop_index;
930 		if (last != -1) {
931 			eop_desc = &txr->base[last];
932 			/* Get next done point */
933 			if (++last == que->num_desc) last = 0;
934 			done = last;
935 		} else
936 			break;
937 	}
938 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
939 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
940 
941 	txr->next_to_clean = first;
942 
943 
944 	/*
945 	** Hang detection, we know there's
946 	** work outstanding or the first return
947 	** would have been taken, so indicate an
948 	** unsuccessful pass, in local_timer if
949 	** the value is too great the queue will
950 	** be considered hung. If anything has been
951 	** cleaned then reset the state.
952 	*/
953 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
954 		++que->busy;
955 
956 	if (processed)
957 		que->busy = 1; /* Note this turns off HUNG */
958 
959 	/*
960 	 * If there are no pending descriptors, clear the timeout.
961 	 */
962 	if (txr->avail == que->num_desc) {
963 		que->busy = 0;
964 		return FALSE;
965 	}
966 
967 	return TRUE;
968 }
969 
970 /*********************************************************************
971  *
972  *  Refresh mbuf buffers for RX descriptor rings
973  *   - now keeps its own state so discards due to resource
974  *     exhaustion are unnecessary, if an mbuf cannot be obtained
975  *     it just returns, keeping its placeholder, thus it can simply
976  *     be recalled to try again.
977  *
978  **********************************************************************/
979 static void
980 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
981 {
982 	struct ixl_vsi		*vsi = que->vsi;
983 	struct rx_ring		*rxr = &que->rxr;
984 	bus_dma_segment_t	hseg[1];
985 	bus_dma_segment_t	pseg[1];
986 	struct ixl_rx_buf	*buf;
987 	struct mbuf		*mh, *mp;
988 	int			i, j, nsegs, error;
989 	bool			refreshed = FALSE;
990 
991 	i = j = rxr->next_refresh;
992 	/* Control the loop with one beyond */
993 	if (++j == que->num_desc)
994 		j = 0;
995 
996 	while (j != limit) {
997 		buf = &rxr->buffers[i];
998 		if (rxr->hdr_split == FALSE)
999 			goto no_split;
1000 
1001 		if (buf->m_head == NULL) {
1002 			mh = m_gethdr(M_NOWAIT, MT_DATA);
1003 			if (mh == NULL)
1004 				goto update;
1005 		} else
1006 			mh = buf->m_head;
1007 
1008 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1009 		mh->m_len = MHLEN;
1010 		mh->m_flags |= M_PKTHDR;
1011 		/* Get the memory mapping */
1012 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1013 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1014 		if (error != 0) {
1015 			printf("Refresh mbufs: hdr dmamap load"
1016 			    " failure - %d\n", error);
1017 			m_free(mh);
1018 			buf->m_head = NULL;
1019 			goto update;
1020 		}
1021 		buf->m_head = mh;
1022 		bus_dmamap_sync(rxr->htag, buf->hmap,
1023 		    BUS_DMASYNC_PREREAD);
1024 		rxr->base[i].read.hdr_addr =
1025 		   htole64(hseg[0].ds_addr);
1026 
1027 no_split:
1028 		if (buf->m_pack == NULL) {
1029 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1030 			    M_PKTHDR, rxr->mbuf_sz);
1031 			if (mp == NULL)
1032 				goto update;
1033 		} else
1034 			mp = buf->m_pack;
1035 
1036 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1037 		/* Get the memory mapping */
1038 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1039 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1040 		if (error != 0) {
1041 			printf("Refresh mbufs: payload dmamap load"
1042 			    " failure - %d\n", error);
1043 			m_free(mp);
1044 			buf->m_pack = NULL;
1045 			goto update;
1046 		}
1047 		buf->m_pack = mp;
1048 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1049 		    BUS_DMASYNC_PREREAD);
1050 		rxr->base[i].read.pkt_addr =
1051 		   htole64(pseg[0].ds_addr);
1052 		/* Used only when doing header split */
1053 		rxr->base[i].read.hdr_addr = 0;
1054 
1055 		refreshed = TRUE;
1056 		/* Next is precalculated */
1057 		i = j;
1058 		rxr->next_refresh = i;
1059 		if (++j == que->num_desc)
1060 			j = 0;
1061 	}
1062 update:
1063 	if (refreshed) /* Update hardware tail index */
1064 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1065 	return;
1066 }
1067 
1068 
1069 /*********************************************************************
1070  *
1071  *  Allocate memory for rx_buffer structures. Since we use one
1072  *  rx_buffer per descriptor, the maximum number of rx_buffer's
1073  *  that we'll need is equal to the number of receive descriptors
1074  *  that we've defined.
1075  *
1076  **********************************************************************/
1077 int
1078 ixl_allocate_rx_data(struct ixl_queue *que)
1079 {
1080 	struct rx_ring		*rxr = &que->rxr;
1081 	struct ixl_vsi		*vsi = que->vsi;
1082 	device_t 		dev = vsi->dev;
1083 	struct ixl_rx_buf 	*buf;
1084 	int             	i, bsize, error;
1085 
1086 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1087 	if (!(rxr->buffers =
1088 	    (struct ixl_rx_buf *) malloc(bsize,
1089 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1090 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1091 		error = ENOMEM;
1092 		return (error);
1093 	}
1094 
1095 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1096 				   1, 0,	/* alignment, bounds */
1097 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1098 				   BUS_SPACE_MAXADDR,	/* highaddr */
1099 				   NULL, NULL,		/* filter, filterarg */
1100 				   MSIZE,		/* maxsize */
1101 				   1,			/* nsegments */
1102 				   MSIZE,		/* maxsegsize */
1103 				   0,			/* flags */
1104 				   NULL,		/* lockfunc */
1105 				   NULL,		/* lockfuncarg */
1106 				   &rxr->htag))) {
1107 		device_printf(dev, "Unable to create RX DMA htag\n");
1108 		return (error);
1109 	}
1110 
1111 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1112 				   1, 0,	/* alignment, bounds */
1113 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1114 				   BUS_SPACE_MAXADDR,	/* highaddr */
1115 				   NULL, NULL,		/* filter, filterarg */
1116 				   MJUM16BYTES,		/* maxsize */
1117 				   1,			/* nsegments */
1118 				   MJUM16BYTES,		/* maxsegsize */
1119 				   0,			/* flags */
1120 				   NULL,		/* lockfunc */
1121 				   NULL,		/* lockfuncarg */
1122 				   &rxr->ptag))) {
1123 		device_printf(dev, "Unable to create RX DMA ptag\n");
1124 		return (error);
1125 	}
1126 
1127 	for (i = 0; i < que->num_desc; i++) {
1128 		buf = &rxr->buffers[i];
1129 		error = bus_dmamap_create(rxr->htag,
1130 		    BUS_DMA_NOWAIT, &buf->hmap);
1131 		if (error) {
1132 			device_printf(dev, "Unable to create RX head map\n");
1133 			break;
1134 		}
1135 		error = bus_dmamap_create(rxr->ptag,
1136 		    BUS_DMA_NOWAIT, &buf->pmap);
1137 		if (error) {
1138 			device_printf(dev, "Unable to create RX pkt map\n");
1139 			break;
1140 		}
1141 	}
1142 
1143 	return (error);
1144 }
1145 
1146 
1147 /*********************************************************************
1148  *
1149  *  (Re)Initialize the queue receive ring and its buffers.
1150  *
1151  **********************************************************************/
1152 int
1153 ixl_init_rx_ring(struct ixl_queue *que)
1154 {
1155 	struct	rx_ring 	*rxr = &que->rxr;
1156 	struct ixl_vsi		*vsi = que->vsi;
1157 #if defined(INET6) || defined(INET)
1158 	struct ifnet		*ifp = vsi->ifp;
1159 	struct lro_ctrl		*lro = &rxr->lro;
1160 #endif
1161 	struct ixl_rx_buf	*buf;
1162 	bus_dma_segment_t	pseg[1], hseg[1];
1163 	int			rsize, nsegs, error = 0;
1164 #ifdef DEV_NETMAP
1165 	struct netmap_adapter *na = NA(que->vsi->ifp);
1166 	struct netmap_slot *slot;
1167 #endif /* DEV_NETMAP */
1168 
1169 	IXL_RX_LOCK(rxr);
1170 #ifdef DEV_NETMAP
1171 	/* same as in ixl_init_tx_ring() */
1172 	slot = netmap_reset(na, NR_RX, que->me, 0);
1173 #endif /* DEV_NETMAP */
1174 	/* Clear the ring contents */
1175 	rsize = roundup2(que->num_desc *
1176 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1177 	bzero((void *)rxr->base, rsize);
1178 	/* Cleanup any existing buffers */
1179 	for (int i = 0; i < que->num_desc; i++) {
1180 		buf = &rxr->buffers[i];
1181 		if (buf->m_head != NULL) {
1182 			bus_dmamap_sync(rxr->htag, buf->hmap,
1183 			    BUS_DMASYNC_POSTREAD);
1184 			bus_dmamap_unload(rxr->htag, buf->hmap);
1185 			buf->m_head->m_flags |= M_PKTHDR;
1186 			m_freem(buf->m_head);
1187 		}
1188 		if (buf->m_pack != NULL) {
1189 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1190 			    BUS_DMASYNC_POSTREAD);
1191 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1192 			buf->m_pack->m_flags |= M_PKTHDR;
1193 			m_freem(buf->m_pack);
1194 		}
1195 		buf->m_head = NULL;
1196 		buf->m_pack = NULL;
1197 	}
1198 
1199 	/* header split is off */
1200 	rxr->hdr_split = FALSE;
1201 
1202 	/* Now replenish the mbufs */
1203 	for (int j = 0; j != que->num_desc; ++j) {
1204 		struct mbuf	*mh, *mp;
1205 
1206 		buf = &rxr->buffers[j];
1207 #ifdef DEV_NETMAP
1208 		/*
1209 		 * In netmap mode, fill the map and set the buffer
1210 		 * address in the NIC ring, considering the offset
1211 		 * between the netmap and NIC rings (see comment in
1212 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1213 		 * an mbuf, so end the block with a continue;
1214 		 */
1215 		if (slot) {
1216 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1217 			uint64_t paddr;
1218 			void *addr;
1219 
1220 			addr = PNMB(na, slot + sj, &paddr);
1221 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1222 			/* Update descriptor and the cached value */
1223 			rxr->base[j].read.pkt_addr = htole64(paddr);
1224 			rxr->base[j].read.hdr_addr = 0;
1225 			continue;
1226 		}
1227 #endif /* DEV_NETMAP */
1228 		/*
1229 		** Don't allocate mbufs if not
1230 		** doing header split, its wasteful
1231 		*/
1232 		if (rxr->hdr_split == FALSE)
1233 			goto skip_head;
1234 
1235 		/* First the header */
1236 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1237 		if (buf->m_head == NULL) {
1238 			error = ENOBUFS;
1239 			goto fail;
1240 		}
1241 		m_adj(buf->m_head, ETHER_ALIGN);
1242 		mh = buf->m_head;
1243 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1244 		mh->m_flags |= M_PKTHDR;
1245 		/* Get the memory mapping */
1246 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1247 		    buf->hmap, buf->m_head, hseg,
1248 		    &nsegs, BUS_DMA_NOWAIT);
1249 		if (error != 0) /* Nothing elegant to do here */
1250 			goto fail;
1251 		bus_dmamap_sync(rxr->htag,
1252 		    buf->hmap, BUS_DMASYNC_PREREAD);
1253 		/* Update descriptor */
1254 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1255 
1256 skip_head:
1257 		/* Now the payload cluster */
1258 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1259 		    M_PKTHDR, rxr->mbuf_sz);
1260 		if (buf->m_pack == NULL) {
1261 			error = ENOBUFS;
1262                         goto fail;
1263 		}
1264 		mp = buf->m_pack;
1265 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1266 		/* Get the memory mapping */
1267 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1268 		    buf->pmap, mp, pseg,
1269 		    &nsegs, BUS_DMA_NOWAIT);
1270 		if (error != 0)
1271                         goto fail;
1272 		bus_dmamap_sync(rxr->ptag,
1273 		    buf->pmap, BUS_DMASYNC_PREREAD);
1274 		/* Update descriptor */
1275 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1276 		rxr->base[j].read.hdr_addr = 0;
1277 	}
1278 
1279 
1280 	/* Setup our descriptor indices */
1281 	rxr->next_check = 0;
1282 	rxr->next_refresh = 0;
1283 	rxr->lro_enabled = FALSE;
1284 	rxr->split = 0;
1285 	rxr->bytes = 0;
1286 	rxr->discard = FALSE;
1287 
1288 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1289 	ixl_flush(vsi->hw);
1290 
1291 #if defined(INET6) || defined(INET)
1292 	/*
1293 	** Now set up the LRO interface:
1294 	*/
1295 	if (ifp->if_capenable & IFCAP_LRO) {
1296 		int err = tcp_lro_init(lro);
1297 		if (err) {
1298 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1299 			goto fail;
1300 		}
1301 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1302 		rxr->lro_enabled = TRUE;
1303 		lro->ifp = vsi->ifp;
1304 	}
1305 #endif
1306 
1307 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1308 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1309 
1310 fail:
1311 	IXL_RX_UNLOCK(rxr);
1312 	return (error);
1313 }
1314 
1315 
1316 /*********************************************************************
1317  *
1318  *  Free station receive ring data structures
1319  *
1320  **********************************************************************/
1321 void
1322 ixl_free_que_rx(struct ixl_queue *que)
1323 {
1324 	struct rx_ring		*rxr = &que->rxr;
1325 	struct ixl_rx_buf	*buf;
1326 
1327 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1328 
1329 	/* Cleanup any existing buffers */
1330 	if (rxr->buffers != NULL) {
1331 		for (int i = 0; i < que->num_desc; i++) {
1332 			buf = &rxr->buffers[i];
1333 			if (buf->m_head != NULL) {
1334 				bus_dmamap_sync(rxr->htag, buf->hmap,
1335 				    BUS_DMASYNC_POSTREAD);
1336 				bus_dmamap_unload(rxr->htag, buf->hmap);
1337 				buf->m_head->m_flags |= M_PKTHDR;
1338 				m_freem(buf->m_head);
1339 			}
1340 			if (buf->m_pack != NULL) {
1341 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1342 				    BUS_DMASYNC_POSTREAD);
1343 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1344 				buf->m_pack->m_flags |= M_PKTHDR;
1345 				m_freem(buf->m_pack);
1346 			}
1347 			buf->m_head = NULL;
1348 			buf->m_pack = NULL;
1349 			if (buf->hmap != NULL) {
1350 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1351 				buf->hmap = NULL;
1352 			}
1353 			if (buf->pmap != NULL) {
1354 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1355 				buf->pmap = NULL;
1356 			}
1357 		}
1358 		if (rxr->buffers != NULL) {
1359 			free(rxr->buffers, M_DEVBUF);
1360 			rxr->buffers = NULL;
1361 		}
1362 	}
1363 
1364 	if (rxr->htag != NULL) {
1365 		bus_dma_tag_destroy(rxr->htag);
1366 		rxr->htag = NULL;
1367 	}
1368 	if (rxr->ptag != NULL) {
1369 		bus_dma_tag_destroy(rxr->ptag);
1370 		rxr->ptag = NULL;
1371 	}
1372 
1373 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1374 	return;
1375 }
1376 
1377 static __inline void
1378 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1379 {
1380 
1381 #if defined(INET6) || defined(INET)
1382         /*
1383          * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet
1384          * should be computed by hardware. Also it should not have VLAN tag in
1385          * ethernet header.
1386          */
1387         if (rxr->lro_enabled &&
1388             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1389             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1390             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1391                 /*
1392                  * Send to the stack if:
1393                  **  - LRO not enabled, or
1394                  **  - no LRO resources, or
1395                  **  - lro enqueue fails
1396                  */
1397                 if (rxr->lro.lro_cnt != 0)
1398                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1399                                 return;
1400         }
1401 #endif
1402 	IXL_RX_UNLOCK(rxr);
1403         (*ifp->if_input)(ifp, m);
1404 	IXL_RX_LOCK(rxr);
1405 }
1406 
1407 
1408 static __inline void
1409 ixl_rx_discard(struct rx_ring *rxr, int i)
1410 {
1411 	struct ixl_rx_buf	*rbuf;
1412 
1413 	rbuf = &rxr->buffers[i];
1414 
1415         if (rbuf->fmp != NULL) {/* Partial chain ? */
1416 		rbuf->fmp->m_flags |= M_PKTHDR;
1417                 m_freem(rbuf->fmp);
1418                 rbuf->fmp = NULL;
1419 	}
1420 
1421 	/*
1422 	** With advanced descriptors the writeback
1423 	** clobbers the buffer addrs, so its easier
1424 	** to just free the existing mbufs and take
1425 	** the normal refresh path to get new buffers
1426 	** and mapping.
1427 	*/
1428 	if (rbuf->m_head) {
1429 		m_free(rbuf->m_head);
1430 		rbuf->m_head = NULL;
1431 	}
1432 
1433 	if (rbuf->m_pack) {
1434 		m_free(rbuf->m_pack);
1435 		rbuf->m_pack = NULL;
1436 	}
1437 
1438 	return;
1439 }
1440 
#ifdef RSS
/*
** ixl_ptype_to_hash: decode the hardware packet type and
** return the matching M_HASHTYPE_* value for the mbuf.
**
** Unknown and L2-only packet types map to the opaque hash type.
** For IP packets the result depends on the outer IP version and
** the inner protocol; where an _EX variant is selectable it is
** chosen when the decoded outer_frag field is non-zero.
*/
static inline int
ixl_ptype_to_hash(u8 ptype)
{
	struct i40e_rx_ptype_decoded	info;
	u8				frag;

	info = decode_rx_desc_ptype(ptype);
	frag = info.outer_frag;

	if (!info.known || info.outer_ip == I40E_RX_PTYPE_OUTER_L2)
		return M_HASHTYPE_OPAQUE_HASH;

	/* Note: anything that gets to this point is IP */
	if (info.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
		if (info.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP)
			return (frag ? M_HASHTYPE_RSS_TCP_IPV6_EX :
			    M_HASHTYPE_RSS_TCP_IPV6);
		if (info.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP)
			return (frag ? M_HASHTYPE_RSS_UDP_IPV6_EX :
			    M_HASHTYPE_RSS_UDP_IPV6);
		return (frag ? M_HASHTYPE_RSS_IPV6_EX :
		    M_HASHTYPE_RSS_IPV6);
	}

	if (info.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
		if (info.inner_prot == I40E_RX_PTYPE_INNER_PROT_TCP)
			return M_HASHTYPE_RSS_TCP_IPV4;
		if (info.inner_prot == I40E_RX_PTYPE_INNER_PROT_UDP)
			return (frag ? M_HASHTYPE_RSS_UDP_IPV4_EX :
			    M_HASHTYPE_RSS_UDP_IPV4);
		return M_HASHTYPE_RSS_IPV4;
	}

	/* We should never get here!! */
	return M_HASHTYPE_OPAQUE_HASH;
}
#endif /* RSS */
1498 
1499 /*********************************************************************
1500  *
1501  *  This routine executes in interrupt context. It replenishes
1502  *  the mbufs in the descriptor and sends data which has been
1503  *  dma'ed into host memory to upper layer.
1504  *
1505  *  We loop at most count times if count is > 0, or until done if
1506  *  count < 0.
1507  *
1508  *  Return TRUE for more work, FALSE for all clean.
1509  *********************************************************************/
1510 bool
1511 ixl_rxeof(struct ixl_queue *que, int count)
1512 {
1513 	struct ixl_vsi		*vsi = que->vsi;
1514 	struct rx_ring		*rxr = &que->rxr;
1515 	struct ifnet		*ifp = vsi->ifp;
1516 #if defined(INET6) || defined(INET)
1517 	struct lro_ctrl		*lro = &rxr->lro;
1518 #endif
1519 	int			i, nextp, processed = 0;
1520 	union i40e_rx_desc	*cur;
1521 	struct ixl_rx_buf	*rbuf, *nbuf;
1522 
1523 
1524 	IXL_RX_LOCK(rxr);
1525 
1526 #ifdef DEV_NETMAP
1527 	if (netmap_rx_irq(ifp, que->me, &count)) {
1528 		IXL_RX_UNLOCK(rxr);
1529 		return (FALSE);
1530 	}
1531 #endif /* DEV_NETMAP */
1532 
1533 	for (i = rxr->next_check; count != 0;) {
1534 		struct mbuf	*sendmp, *mh, *mp;
1535 		u32		rsc, status, error;
1536 		u16		hlen, plen, vtag;
1537 		u64		qword;
1538 		u8		ptype;
1539 		bool		eop;
1540 
1541 		/* Sync the ring. */
1542 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1543 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1544 
1545 		cur = &rxr->base[i];
1546 		qword = le64toh(cur->wb.qword1.status_error_len);
1547 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1548 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1549 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1550 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1551 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1552 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1553 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1554 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1555 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1556 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1557 
1558 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1559 			++rxr->not_done;
1560 			break;
1561 		}
1562 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1563 			break;
1564 
1565 		count--;
1566 		sendmp = NULL;
1567 		nbuf = NULL;
1568 		rsc = 0;
1569 		cur->wb.qword1.status_error_len = 0;
1570 		rbuf = &rxr->buffers[i];
1571 		mh = rbuf->m_head;
1572 		mp = rbuf->m_pack;
1573 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1574 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1575 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1576 		else
1577 			vtag = 0;
1578 
1579 		/*
1580 		** Make sure bad packets are discarded,
1581 		** note that only EOP descriptor has valid
1582 		** error results.
1583 		*/
1584                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1585 			rxr->desc_errs++;
1586 			ixl_rx_discard(rxr, i);
1587 			goto next_desc;
1588 		}
1589 
1590 		/* Prefetch the next buffer */
1591 		if (!eop) {
1592 			nextp = i + 1;
1593 			if (nextp == que->num_desc)
1594 				nextp = 0;
1595 			nbuf = &rxr->buffers[nextp];
1596 			prefetch(nbuf);
1597 		}
1598 
1599 		/*
1600 		** The header mbuf is ONLY used when header
1601 		** split is enabled, otherwise we get normal
1602 		** behavior, ie, both header and payload
1603 		** are DMA'd into the payload buffer.
1604 		**
1605 		** Rather than using the fmp/lmp global pointers
1606 		** we now keep the head of a packet chain in the
1607 		** buffer struct and pass this along from one
1608 		** descriptor to the next, until we get EOP.
1609 		*/
1610 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1611 			if (hlen > IXL_RX_HDR)
1612 				hlen = IXL_RX_HDR;
1613 			mh->m_len = hlen;
1614 			mh->m_flags |= M_PKTHDR;
1615 			mh->m_next = NULL;
1616 			mh->m_pkthdr.len = mh->m_len;
1617 			/* Null buf pointer so it is refreshed */
1618 			rbuf->m_head = NULL;
1619 			/*
1620 			** Check the payload length, this
1621 			** could be zero if its a small
1622 			** packet.
1623 			*/
1624 			if (plen > 0) {
1625 				mp->m_len = plen;
1626 				mp->m_next = NULL;
1627 				mp->m_flags &= ~M_PKTHDR;
1628 				mh->m_next = mp;
1629 				mh->m_pkthdr.len += mp->m_len;
1630 				/* Null buf pointer so it is refreshed */
1631 				rbuf->m_pack = NULL;
1632 				rxr->split++;
1633 			}
1634 			/*
1635 			** Now create the forward
1636 			** chain so when complete
1637 			** we wont have to.
1638 			*/
1639                         if (eop == 0) {
1640 				/* stash the chain head */
1641                                 nbuf->fmp = mh;
1642 				/* Make forward chain */
1643                                 if (plen)
1644                                         mp->m_next = nbuf->m_pack;
1645                                 else
1646                                         mh->m_next = nbuf->m_pack;
1647                         } else {
1648 				/* Singlet, prepare to send */
1649                                 sendmp = mh;
1650                                 if (vtag) {
1651                                         sendmp->m_pkthdr.ether_vtag = vtag;
1652                                         sendmp->m_flags |= M_VLANTAG;
1653                                 }
1654                         }
1655 		} else {
1656 			/*
1657 			** Either no header split, or a
1658 			** secondary piece of a fragmented
1659 			** split packet.
1660 			*/
1661 			mp->m_len = plen;
1662 			/*
1663 			** See if there is a stored head
1664 			** that determines what we are
1665 			*/
1666 			sendmp = rbuf->fmp;
1667 			rbuf->m_pack = rbuf->fmp = NULL;
1668 
1669 			if (sendmp != NULL) /* secondary frag */
1670 				sendmp->m_pkthdr.len += mp->m_len;
1671 			else {
1672 				/* first desc of a non-ps chain */
1673 				sendmp = mp;
1674 				sendmp->m_flags |= M_PKTHDR;
1675 				sendmp->m_pkthdr.len = mp->m_len;
1676 				if (vtag) {
1677 					sendmp->m_pkthdr.ether_vtag = vtag;
1678 					sendmp->m_flags |= M_VLANTAG;
1679 				}
1680                         }
1681 			/* Pass the head pointer on */
1682 			if (eop == 0) {
1683 				nbuf->fmp = sendmp;
1684 				sendmp = NULL;
1685 				mp->m_next = nbuf->m_pack;
1686 			}
1687 		}
1688 		++processed;
1689 		/* Sending this frame? */
1690 		if (eop) {
1691 			sendmp->m_pkthdr.rcvif = ifp;
1692 			/* gather stats */
1693 			rxr->rx_packets++;
1694 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1695 			/* capture data for dynamic ITR adjustment */
1696 			rxr->packets++;
1697 			rxr->bytes += sendmp->m_pkthdr.len;
1698 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1699 				ixl_rx_checksum(sendmp, status, error, ptype);
1700 #ifdef RSS
1701 			sendmp->m_pkthdr.flowid =
1702 			    le32toh(cur->wb.qword0.hi_dword.rss);
1703 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1704 #else
1705 			sendmp->m_pkthdr.flowid = que->msix;
1706 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1707 #endif
1708 		}
1709 next_desc:
1710 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1711 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1712 
1713 		/* Advance our pointers to the next descriptor. */
1714 		if (++i == que->num_desc)
1715 			i = 0;
1716 
1717 		/* Now send to the stack or do LRO */
1718 		if (sendmp != NULL) {
1719 			rxr->next_check = i;
1720 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1721 			i = rxr->next_check;
1722 		}
1723 
1724                /* Every 8 descriptors we go to refresh mbufs */
1725 		if (processed == 8) {
1726 			ixl_refresh_mbufs(que, i);
1727 			processed = 0;
1728 		}
1729 	}
1730 
1731 	/* Refresh any remaining buf structs */
1732 	if (ixl_rx_unrefreshed(que))
1733 		ixl_refresh_mbufs(que, i);
1734 
1735 	rxr->next_check = i;
1736 
1737 #if defined(INET6) || defined(INET)
1738 	/*
1739 	 * Flush any outstanding LRO work
1740 	 */
1741 	tcp_lro_flush_all(lro);
1742 #endif
1743 
1744 	IXL_RX_UNLOCK(rxr);
1745 	return (FALSE);
1746 }
1747 
1748 
1749 /*********************************************************************
1750  *
1751  *  Verify that the hardware indicated that the checksum is valid.
1752  *  Inform the stack about the status of checksum so that stack
1753  *  doesn't spend time verifying the checksum.
1754  *
1755  *********************************************************************/
1756 static void
1757 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1758 {
1759 	struct i40e_rx_ptype_decoded decoded;
1760 
1761 	decoded = decode_rx_desc_ptype(ptype);
1762 
1763 	/* Errors? */
1764  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1765 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1766 		mp->m_pkthdr.csum_flags = 0;
1767 		return;
1768 	}
1769 
1770 	/* IPv6 with extension headers likely have bad csum */
1771 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1772 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1773 		if (status &
1774 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1775 			mp->m_pkthdr.csum_flags = 0;
1776 			return;
1777 		}
1778 
1779 
1780 	/* IP Checksum Good */
1781 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1782 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1783 
1784 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1785 		mp->m_pkthdr.csum_flags |=
1786 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1787 		mp->m_pkthdr.csum_data |= htons(0xffff);
1788 	}
1789 	return;
1790 }
1791 
#if __FreeBSD_version >= 1100000
/*
 * Interface counter callback: return the value the VSI keeps for the
 * requested counter.  Counters without a VSI field are forwarded to
 * if_get_counter_default().
 */
uint64_t
ixl_get_counter(if_t ifp, ift_counter cnt)
{
	struct ixl_vsi *vsi = if_getsoftc(ifp);
	uint64_t value;

	switch (cnt) {
	case IFCOUNTER_IPACKETS:
		value = vsi->ipackets;
		break;
	case IFCOUNTER_IERRORS:
		value = vsi->ierrors;
		break;
	case IFCOUNTER_OPACKETS:
		value = vsi->opackets;
		break;
	case IFCOUNTER_OERRORS:
		value = vsi->oerrors;
		break;
	case IFCOUNTER_COLLISIONS:
		/* Collisions are by standard impossible in 40G/10G Ethernet */
		value = 0;
		break;
	case IFCOUNTER_IBYTES:
		value = vsi->ibytes;
		break;
	case IFCOUNTER_OBYTES:
		value = vsi->obytes;
		break;
	case IFCOUNTER_IMCASTS:
		value = vsi->imcasts;
		break;
	case IFCOUNTER_OMCASTS:
		value = vsi->omcasts;
		break;
	case IFCOUNTER_IQDROPS:
		value = vsi->iqdrops;
		break;
	case IFCOUNTER_OQDROPS:
		value = vsi->oqdrops;
		break;
	case IFCOUNTER_NOPROTO:
		value = vsi->noproto;
		break;
	default:
		value = if_get_counter_default(ifp, cnt);
		break;
	}

	return (value);
}
#endif
1831 
1832