xref: /freebsd/sys/dev/ixl/ixl_txrx.c (revision 39ee7a7a6bdd1557b1c3532abf60d139798ac88b)
1 /******************************************************************************
2 
3   Copyright (c) 2013-2015, Intel Corporation
4   All rights reserved.
5 
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8 
9    1. Redistributions of source code must retain the above copyright notice,
10       this list of conditions and the following disclaimer.
11 
12    2. Redistributions in binary form must reproduce the above copyright
13       notice, this list of conditions and the following disclaimer in the
14       documentation and/or other materials provided with the distribution.
15 
16    3. Neither the name of the Intel Corporation nor the names of its
17       contributors may be used to endorse or promote products derived from
18       this software without specific prior written permission.
19 
20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30   POSSIBILITY OF SUCH DAMAGE.
31 
32 ******************************************************************************/
33 /*$FreeBSD$*/
34 
35 /*
36 **	IXL driver TX/RX Routines:
37 **	    This was separated out to allow use by
38 **	    both the BASE and the VF drivers.
39 */
40 
41 #ifndef IXL_STANDALONE_BUILD
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_rss.h"
45 #endif
46 
47 #include "ixl.h"
48 
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 /* Local Prototypes */
54 static void	ixl_rx_checksum(struct mbuf *, u32, u32, u8);
55 static void	ixl_refresh_mbufs(struct ixl_queue *, int);
56 static int      ixl_xmit(struct ixl_queue *, struct mbuf **);
57 static int	ixl_tx_setup_offload(struct ixl_queue *,
58 		    struct mbuf *, u32 *, u32 *);
59 static bool	ixl_tso_setup(struct ixl_queue *, struct mbuf *);
60 
61 static __inline void ixl_rx_discard(struct rx_ring *, int);
62 static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *,
63 		    struct mbuf *, u8);
64 
65 #ifdef DEV_NETMAP
66 #include <dev/netmap/if_ixl_netmap.h>
67 #endif /* DEV_NETMAP */
68 
69 /*
70 ** Multiqueue Transmit driver
71 */
72 int
73 ixl_mq_start(struct ifnet *ifp, struct mbuf *m)
74 {
75 	struct ixl_vsi		*vsi = ifp->if_softc;
76 	struct ixl_queue	*que;
77 	struct tx_ring		*txr;
78 	int 			err, i;
79 #ifdef RSS
80 	u32			bucket_id;
81 #endif
82 
83 	/*
84 	** Which queue to use:
85 	**
86 	** When doing RSS, map it to the same outbound
87 	** queue as the incoming flow would be mapped to.
88 	** If everything is set up correctly, it should be
89 	** the same bucket as the CPU we are currently on.
90 	*/
91 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
92 #ifdef  RSS
93 		if (rss_hash2bucket(m->m_pkthdr.flowid,
94 		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
95 			i = bucket_id % vsi->num_queues;
96                 } else
97 #endif
98                         i = m->m_pkthdr.flowid % vsi->num_queues;
99         } else
100 		i = curcpu % vsi->num_queues;
101 	/*
102 	** This may not be perfect, but until something
103 	** better comes along it will keep us from scheduling
104 	** on stalled queues.
105 	*/
106 	if (((1 << i) & vsi->active_queues) == 0)
107 		i = ffsl(vsi->active_queues);
108 
109 	que = &vsi->queues[i];
110 	txr = &que->txr;
111 
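	/*
	** Enqueue on the selected queue's buf_ring; if the TX lock
	** cannot be taken right away, the queue's taskqueue drains
	** it later via ixl_deferred_mq_start().
	*/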
112 	err = drbr_enqueue(ifp, txr->br, m);
113 	if (err)
114 		return (err);
115 	if (IXL_TX_TRYLOCK(txr)) {
116 		ixl_mq_start_locked(ifp, txr);
117 		IXL_TX_UNLOCK(txr);
118 	} else
119 		taskqueue_enqueue(que->tq, &que->tx_task);
120 
121 	return (0);
122 }
123 
124 int
125 ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr)
126 {
127 	struct ixl_queue	*que = txr->que;
128 	struct ixl_vsi		*vsi = que->vsi;
129         struct mbuf		*next;
130         int			err = 0;
131 
132 
133 	if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) ||
134 	    vsi->link_active == 0)
135 		return (ENETDOWN);
136 
137 	/* Process the transmit queue */
138 	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
139 		if ((err = ixl_xmit(que, &next)) != 0) {
140 			if (next == NULL)
141 				drbr_advance(ifp, txr->br);
142 			else
143 				drbr_putback(ifp, txr->br, next);
144 			break;
145 		}
146 		drbr_advance(ifp, txr->br);
147 		/* Send a copy of the frame to the BPF listener */
148 		ETHER_BPF_MTAP(ifp, next);
149 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
150 			break;
151 	}
152 
153 	if (txr->avail < IXL_TX_CLEANUP_THRESHOLD)
154 		ixl_txeof(que);
155 
156 	return (err);
157 }
158 
159 /*
160  * Called from a taskqueue to drain queued transmit packets.
161  */
162 void
163 ixl_deferred_mq_start(void *arg, int pending)
164 {
165 	struct ixl_queue	*que = arg;
166         struct tx_ring		*txr = &que->txr;
167 	struct ixl_vsi		*vsi = que->vsi;
168         struct ifnet		*ifp = vsi->ifp;
169 
170 	IXL_TX_LOCK(txr);
171 	if (!drbr_empty(ifp, txr->br))
172 		ixl_mq_start_locked(ifp, txr);
173 	IXL_TX_UNLOCK(txr);
174 }
175 
176 /*
177 ** Flush all queue ring buffers
178 */
179 void
180 ixl_qflush(struct ifnet *ifp)
181 {
182 	struct ixl_vsi	*vsi = ifp->if_softc;
183 
184         for (int i = 0; i < vsi->num_queues; i++) {
185 		struct ixl_queue *que = &vsi->queues[i];
186 		struct tx_ring	*txr = &que->txr;
187 		struct mbuf	*m;
188 		IXL_TX_LOCK(txr);
189 		while ((m = buf_ring_dequeue_sc(txr->br)) != NULL)
190 			m_freem(m);
191 		IXL_TX_UNLOCK(txr);
192 	}
193 	if_qflush(ifp);
194 }
195 
196 /*
197 ** Find mbuf chains passed to the driver
198 ** that are 'sparse', using more than 8
199 ** mbufs to deliver an mss-size chunk of data
200 */
201 static inline bool
202 ixl_tso_detect_sparse(struct mbuf *mp)
203 {
204 	struct mbuf	*m;
205 	int		num = 0, mss;
206 	bool		ret = FALSE;
207 
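	/*
	** Count how many mbufs it takes to cover one MSS of payload;
	** a chain needing more than IXL_SPARSE_CHAIN buffers is too
	** fragmented and the caller will m_defrag() it.
	*/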
208 	mss = mp->m_pkthdr.tso_segsz;
209 	for (m = mp->m_next; m != NULL; m = m->m_next) {
210 		num++;
211 		mss -= m->m_len;
212 		if (mss < 1)
213 			break;
214 		if (m->m_next == NULL)
215 			break;
216 	}
217 	if (num > IXL_SPARSE_CHAIN)
218 		ret = TRUE;
219 
220 	return (ret);
221 }
222 
223 
224 /*********************************************************************
225  *
226  *  This routine maps the mbufs to tx descriptors, allowing the
227  *  TX engine to transmit the packets.
228  *  	- return 0 on success, positive on failure
229  *
230  **********************************************************************/
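/*
** EOP marks a frame's final descriptor; RS requests a completion
** report, which this driver tracks through the head write-back
** value read in ixl_get_tx_head().
*/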
231 #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS)
232 
233 static int
234 ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp)
235 {
236 	struct ixl_vsi		*vsi = que->vsi;
237 	struct i40e_hw		*hw = vsi->hw;
238 	struct tx_ring		*txr = &que->txr;
239 	struct ixl_tx_buf	*buf;
240 	struct i40e_tx_desc	*txd = NULL;
241 	struct mbuf		*m_head, *m;
242 	int             	i, j, error, nsegs, maxsegs;
243 	int			first, last = 0;
244 	u16			vtag = 0;
245 	u32			cmd, off;
246 	bus_dmamap_t		map;
247 	bus_dma_tag_t		tag;
248 	bus_dma_segment_t	segs[IXL_MAX_TSO_SEGS];
249 
250 
251 	cmd = off = 0;
252 	m_head = *m_headp;
253 
254         /*
255          * Capture the index of the first descriptor used, because its
256          * buffer entry will record the index of the descriptor we tell
257          * the hardware to report back on (the EOP index).
258          */
259         first = txr->next_avail;
260 	buf = &txr->buffers[first];
261 	map = buf->map;
262 	tag = txr->tx_tag;
263 	maxsegs = IXL_MAX_TX_SEGS;
264 
265 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
266 		/* Use larger mapping for TSO */
267 		tag = txr->tso_tag;
268 		maxsegs = IXL_MAX_TSO_SEGS;
269 		if (ixl_tso_detect_sparse(m_head)) {
270 			m = m_defrag(m_head, M_NOWAIT);
271 			if (m == NULL) {
272 				m_freem(*m_headp);
273 				*m_headp = NULL;
274 				return (ENOBUFS);
275 			}
276 			*m_headp = m;
277 		}
278 	}
279 
280 	/*
281 	 * Map the packet for DMA.
282 	 */
283 	error = bus_dmamap_load_mbuf_sg(tag, map,
284 	    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
285 
286 	if (error == EFBIG) {
287 		struct mbuf *m;
288 
289 		m = m_collapse(*m_headp, M_NOWAIT, maxsegs);
290 		if (m == NULL) {
291 			que->mbuf_defrag_failed++;
292 			m_freem(*m_headp);
293 			*m_headp = NULL;
294 			return (ENOBUFS);
295 		}
296 		*m_headp = m;
297 
298 		/* Try it again */
299 		error = bus_dmamap_load_mbuf_sg(tag, map,
300 		    *m_headp, segs, &nsegs, BUS_DMA_NOWAIT);
301 
302 		if (error == ENOMEM) {
303 			que->tx_dma_setup++;
304 			return (error);
305 		} else if (error != 0) {
306 			que->tx_dma_setup++;
307 			m_freem(*m_headp);
308 			*m_headp = NULL;
309 			return (error);
310 		}
311 	} else if (error == ENOMEM) {
312 		que->tx_dma_setup++;
313 		return (error);
314 	} else if (error != 0) {
315 		que->tx_dma_setup++;
316 		m_freem(*m_headp);
317 		*m_headp = NULL;
318 		return (error);
319 	}
320 
321 	/* Make certain there are enough descriptors */
322 	if (nsegs > txr->avail - 2) {
323 		txr->no_desc++;
324 		error = ENOBUFS;
325 		goto xmit_fail;
326 	}
327 	m_head = *m_headp;
328 
329 	/* Set up the TSO/CSUM offload */
330 	if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) {
331 		error = ixl_tx_setup_offload(que, m_head, &cmd, &off);
332 		if (error)
333 			goto xmit_fail;
334 	}
335 
336 	cmd |= I40E_TX_DESC_CMD_ICRC;
337 	/* Grab the VLAN tag */
338 	if (m_head->m_flags & M_VLANTAG) {
339 		cmd |= I40E_TX_DESC_CMD_IL2TAG1;
340 		vtag = htole16(m_head->m_pkthdr.ether_vtag);
341 	}
342 
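	/*
	** Fill one data descriptor per DMA segment. Each descriptor's
	** second quadword packs the descriptor type, command bits,
	** header offsets, buffer length and VLAN tag (the
	** I40E_TXD_QW1_* fields below).
	*/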
343 	i = txr->next_avail;
344 	for (j = 0; j < nsegs; j++) {
345 		bus_size_t seglen;
346 
347 		buf = &txr->buffers[i];
348 		buf->tag = tag; /* Keep track of the type tag */
349 		txd = &txr->base[i];
350 		seglen = segs[j].ds_len;
351 
352 		txd->buffer_addr = htole64(segs[j].ds_addr);
353 		txd->cmd_type_offset_bsz =
354 		    htole64(I40E_TX_DESC_DTYPE_DATA
355 		    | ((u64)cmd  << I40E_TXD_QW1_CMD_SHIFT)
356 		    | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT)
357 		    | ((u64)seglen  << I40E_TXD_QW1_TX_BUF_SZ_SHIFT)
358 		    | ((u64)vtag  << I40E_TXD_QW1_L2TAG1_SHIFT));
359 
360 		last = i; /* descriptor that will get completion IRQ */
361 
362 		if (++i == que->num_desc)
363 			i = 0;
364 
365 		buf->m_head = NULL;
366 		buf->eop_index = -1;
367 	}
368 	/* Set the last descriptor for report */
369 	txd->cmd_type_offset_bsz |=
370 	    htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT));
371 	txr->avail -= nsegs;
372 	txr->next_avail = i;
373 
374 	buf->m_head = m_head;
375 	/* Swap the dma map between the first and last descriptor */
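	/* The loaded map must end up on the buffer that owns m_head,
	** so ixl_txeof() unloads the correct mapping at completion. */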
376 	txr->buffers[first].map = buf->map;
377 	buf->map = map;
378 	bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE);
379 
380         /* Set the index of the descriptor that will be marked done */
381         buf = &txr->buffers[first];
382 	buf->eop_index = last;
383 
384         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
385             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
386 	/*
387 	 * Advance the Transmit Descriptor Tail (TDT); this tells the
388 	 * hardware that this frame is available to transmit.
389 	 */
390 	++txr->total_packets;
391 	wr32(hw, txr->tail, i);
392 
393 	ixl_flush(hw);
394 	/* Mark outstanding work */
395 	if (que->busy == 0)
396 		que->busy = 1;
397 	return (0);
398 
399 xmit_fail:
400 	bus_dmamap_unload(tag, buf->map);
401 	return (error);
402 }
403 
404 
405 /*********************************************************************
406  *
407  *  Allocate memory for tx_buffer structures. The tx_buffer stores all
408  *  the information needed to transmit a packet on the wire. This is
409  *  called only once at attach; setup is done on every reset.
410  *
411  **********************************************************************/
412 int
413 ixl_allocate_tx_data(struct ixl_queue *que)
414 {
415 	struct tx_ring		*txr = &que->txr;
416 	struct ixl_vsi		*vsi = que->vsi;
417 	device_t		dev = vsi->dev;
418 	struct ixl_tx_buf	*buf;
419 	int			error = 0;
420 
421 	/*
422 	 * Setup DMA descriptor areas.
423 	 */
424 	if ((error = bus_dma_tag_create(NULL,		/* parent */
425 			       1, 0,			/* alignment, bounds */
426 			       BUS_SPACE_MAXADDR,	/* lowaddr */
427 			       BUS_SPACE_MAXADDR,	/* highaddr */
428 			       NULL, NULL,		/* filter, filterarg */
429 			       IXL_TSO_SIZE,		/* maxsize */
430 			       IXL_MAX_TX_SEGS,		/* nsegments */
431 			       PAGE_SIZE,		/* maxsegsize */
432 			       0,			/* flags */
433 			       NULL,			/* lockfunc */
434 			       NULL,			/* lockfuncarg */
435 			       &txr->tx_tag))) {
436 		device_printf(dev,"Unable to allocate TX DMA tag\n");
437 		goto fail;
438 	}
439 
440 	/* Make a special tag for TSO */
441 	if ((error = bus_dma_tag_create(NULL,		/* parent */
442 			       1, 0,			/* alignment, bounds */
443 			       BUS_SPACE_MAXADDR,	/* lowaddr */
444 			       BUS_SPACE_MAXADDR,	/* highaddr */
445 			       NULL, NULL,		/* filter, filterarg */
446 			       IXL_TSO_SIZE,		/* maxsize */
447 			       IXL_MAX_TSO_SEGS,	/* nsegments */
448 			       PAGE_SIZE,		/* maxsegsize */
449 			       0,			/* flags */
450 			       NULL,			/* lockfunc */
451 			       NULL,			/* lockfuncarg */
452 			       &txr->tso_tag))) {
453 		device_printf(dev,"Unable to allocate TX TSO DMA tag\n");
454 		goto fail;
455 	}
456 
457 	if (!(txr->buffers =
458 	    (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) *
459 	    que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) {
460 		device_printf(dev, "Unable to allocate tx_buffer memory\n");
461 		error = ENOMEM;
462 		goto fail;
463 	}
464 
465         /* Create the descriptor buffer default dma maps */
466 	buf = txr->buffers;
467 	for (int i = 0; i < que->num_desc; i++, buf++) {
468 		buf->tag = txr->tx_tag;
469 		error = bus_dmamap_create(buf->tag, 0, &buf->map);
470 		if (error != 0) {
471 			device_printf(dev, "Unable to create TX DMA map\n");
472 			goto fail;
473 		}
474 	}
475 fail:
476 	return (error);
477 }
478 
479 
480 /*********************************************************************
481  *
482  *  (Re)Initialize a queue transmit ring.
483  *	- called by init, it clears the descriptor ring,
484  *	  and frees any stale mbufs
485  *
486  **********************************************************************/
487 void
488 ixl_init_tx_ring(struct ixl_queue *que)
489 {
490 #ifdef DEV_NETMAP
491 	struct netmap_adapter *na = NA(que->vsi->ifp);
492 	struct netmap_slot *slot;
493 #endif /* DEV_NETMAP */
494 	struct tx_ring		*txr = &que->txr;
495 	struct ixl_tx_buf	*buf;
496 
497 	/* Clear the old ring contents */
498 	IXL_TX_LOCK(txr);
499 
500 #ifdef DEV_NETMAP
501 	/*
502 	 * (under lock): if in netmap mode, do some consistency
503 	 * checks and set slot to entry 0 of the netmap ring.
504 	 */
505 	slot = netmap_reset(na, NR_TX, que->me, 0);
506 #endif /* DEV_NETMAP */
507 
508 	bzero((void *)txr->base,
509 	      (sizeof(struct i40e_tx_desc)) * que->num_desc);
510 
511 	/* Reset indices */
512 	txr->next_avail = 0;
513 	txr->next_to_clean = 0;
514 
515 #ifdef IXL_FDIR
516 	/* Initialize flow director */
517 	txr->atr_rate = ixl_atr_rate;
518 	txr->atr_count = 0;
519 #endif
520 
521 	/* Free any existing tx mbufs. */
522         buf = txr->buffers;
523 	for (int i = 0; i < que->num_desc; i++, buf++) {
524 		if (buf->m_head != NULL) {
525 			bus_dmamap_sync(buf->tag, buf->map,
526 			    BUS_DMASYNC_POSTWRITE);
527 			bus_dmamap_unload(buf->tag, buf->map);
528 			m_freem(buf->m_head);
529 			buf->m_head = NULL;
530 		}
531 #ifdef DEV_NETMAP
532 		/*
533 		 * In netmap mode, set the map for the packet buffer.
534 		 * NOTE: Some drivers (not this one) also need to set
535 		 * the physical buffer address in the NIC ring.
536 		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
537 		 * netmap slot index, si
538 		 */
539 		if (slot) {
540 			int si = netmap_idx_n2k(&na->tx_rings[que->me], i);
541 			netmap_load_map(na, buf->tag, buf->map, NMB(na, slot + si));
542 		}
543 #endif /* DEV_NETMAP */
544 		/* Clear the EOP index */
545 		buf->eop_index = -1;
546         }
547 
548 	/* Set number of descriptors available */
549 	txr->avail = que->num_desc;
550 
551 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
552 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
553 	IXL_TX_UNLOCK(txr);
554 }
555 
556 
557 /*********************************************************************
558  *
559  *  Free transmit ring related data structures.
560  *
561  **********************************************************************/
562 void
563 ixl_free_que_tx(struct ixl_queue *que)
564 {
565 	struct tx_ring *txr = &que->txr;
566 	struct ixl_tx_buf *buf;
567 
568 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
569 
570 	for (int i = 0; i < que->num_desc; i++) {
571 		buf = &txr->buffers[i];
572 		if (buf->m_head != NULL) {
573 			bus_dmamap_sync(buf->tag, buf->map,
574 			    BUS_DMASYNC_POSTWRITE);
575 			bus_dmamap_unload(buf->tag,
576 			    buf->map);
577 			m_freem(buf->m_head);
578 			buf->m_head = NULL;
579 			if (buf->map != NULL) {
580 				bus_dmamap_destroy(buf->tag,
581 				    buf->map);
582 				buf->map = NULL;
583 			}
584 		} else if (buf->map != NULL) {
585 			bus_dmamap_unload(buf->tag,
586 			    buf->map);
587 			bus_dmamap_destroy(buf->tag,
588 			    buf->map);
589 			buf->map = NULL;
590 		}
591 	}
592 	if (txr->br != NULL)
593 		buf_ring_free(txr->br, M_DEVBUF);
594 	if (txr->buffers != NULL) {
595 		free(txr->buffers, M_DEVBUF);
596 		txr->buffers = NULL;
597 	}
598 	if (txr->tx_tag != NULL) {
599 		bus_dma_tag_destroy(txr->tx_tag);
600 		txr->tx_tag = NULL;
601 	}
602 	if (txr->tso_tag != NULL) {
603 		bus_dma_tag_destroy(txr->tso_tag);
604 		txr->tso_tag = NULL;
605 	}
606 
607 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
608 	return;
609 }
610 
611 /*********************************************************************
612  *
613  *  Setup descriptor for hw offloads
614  *
615  **********************************************************************/
616 
617 static int
618 ixl_tx_setup_offload(struct ixl_queue *que,
619     struct mbuf *mp, u32 *cmd, u32 *off)
620 {
621 	struct ether_vlan_header	*eh;
622 #ifdef INET
623 	struct ip			*ip = NULL;
624 #endif
625 	struct tcphdr			*th = NULL;
626 #ifdef INET6
627 	struct ip6_hdr			*ip6;
628 #endif
629 	int				elen, ip_hlen = 0, tcp_hlen;
630 	u16				etype;
631 	u8				ipproto = 0;
632 	bool				tso = FALSE;
633 
634 
635 	/* Set up the TSO context descriptor if required */
636 	if (mp->m_pkthdr.csum_flags & CSUM_TSO) {
637 		tso = ixl_tso_setup(que, mp);
638 		if (tso)
639 			++que->tso;
640 		else
641 			return (ENXIO);
642 	}
643 
644 	/*
645 	 * Determine where frame payload starts.
646 	 * Jump over vlan headers if already present,
647 	 * helpful for QinQ too.
648 	 */
649 	eh = mtod(mp, struct ether_vlan_header *);
650 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
651 		etype = ntohs(eh->evl_proto);
652 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
653 	} else {
654 		etype = ntohs(eh->evl_encap_proto);
655 		elen = ETHER_HDR_LEN;
656 	}
657 
658 	switch (etype) {
659 #ifdef INET
660 		case ETHERTYPE_IP:
661 			ip = (struct ip *)(mp->m_data + elen);
662 			ip_hlen = ip->ip_hl << 2;
663 			ipproto = ip->ip_p;
664 			th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
665 			/* The IP checksum must be recalculated with TSO */
666 			if (tso)
667 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM;
668 			else
669 				*cmd |= I40E_TX_DESC_CMD_IIPT_IPV4;
670 			break;
671 #endif
672 #ifdef INET6
673 		case ETHERTYPE_IPV6:
674 			ip6 = (struct ip6_hdr *)(mp->m_data + elen);
675 			ip_hlen = sizeof(struct ip6_hdr);
676 			ipproto = ip6->ip6_nxt;
677 			th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
678 			*cmd |= I40E_TX_DESC_CMD_IIPT_IPV6;
679 			break;
680 #endif
681 		default:
682 			break;
683 	}
684 
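	/*
	** The descriptor length fields use hardware units: MACLEN is
	** in 2-byte words, IPLEN and L4LEN in 4-byte dwords, hence
	** the >> 1 and >> 2 shifts below.
	*/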
685 	*off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT;
686 	*off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT;
687 
688 	switch (ipproto) {
689 		case IPPROTO_TCP:
690 			tcp_hlen = th->th_off << 2;
691 			if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) {
692 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP;
693 				*off |= (tcp_hlen >> 2) <<
694 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
695 			}
696 #ifdef IXL_FDIR
697 			ixl_atr(que, th, etype);
698 #endif
699 			break;
700 		case IPPROTO_UDP:
701 			if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) {
702 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP;
703 				*off |= (sizeof(struct udphdr) >> 2) <<
704 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
705 			}
706 			break;
707 
708 		case IPPROTO_SCTP:
709 			if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) {
710 				*cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP;
711 				*off |= (sizeof(struct sctphdr) >> 2) <<
712 				    I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
713 			}
714 			/* Fall Thru */
715 		default:
716 			break;
717 	}
718 
719         return (0);
720 }
721 
722 
723 /**********************************************************************
724  *
725  *  Setup context for hardware segmentation offload (TSO)
726  *
727  **********************************************************************/
728 static bool
729 ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp)
730 {
731 	struct tx_ring			*txr = &que->txr;
732 	struct i40e_tx_context_desc	*TXD;
733 	struct ixl_tx_buf		*buf;
734 	u32				cmd, mss, type, tsolen;
735 	u16				etype;
736 	int				idx, elen, ip_hlen, tcp_hlen;
737 	struct ether_vlan_header	*eh;
738 #ifdef INET
739 	struct ip			*ip;
740 #endif
741 #ifdef INET6
742 	struct ip6_hdr			*ip6;
743 #endif
744 #if defined(INET6) || defined(INET)
745 	struct tcphdr			*th;
746 #endif
747 	u64				type_cmd_tso_mss;
748 
749 	/*
750 	 * Determine where frame payload starts.
751 	 * Jump over vlan headers if already present
752 	 */
753 	eh = mtod(mp, struct ether_vlan_header *);
754 	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
755 		elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
756 		etype = eh->evl_proto;
757 	} else {
758 		elen = ETHER_HDR_LEN;
759 		etype = eh->evl_encap_proto;
760 	}
761 
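	/*
	** For TSO the hardware rebuilds the checksums for each
	** segment, so the TCP checksum is pre-seeded below with just
	** the pseudo-header sum (zero length) and the IPv4 header
	** checksum is cleared so it can be recomputed.
	*/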
762         switch (ntohs(etype)) {
763 #ifdef INET6
764 	case ETHERTYPE_IPV6:
765 		ip6 = (struct ip6_hdr *)(mp->m_data + elen);
766 		if (ip6->ip6_nxt != IPPROTO_TCP)
767 			return (FALSE);
768 		ip_hlen = sizeof(struct ip6_hdr);
769 		th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen);
770 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
771 		tcp_hlen = th->th_off << 2;
772 		break;
773 #endif
774 #ifdef INET
775 	case ETHERTYPE_IP:
776 		ip = (struct ip *)(mp->m_data + elen);
777 		if (ip->ip_p != IPPROTO_TCP)
778 			return (FALSE);
779 		ip->ip_sum = 0;
780 		ip_hlen = ip->ip_hl << 2;
781 		th = (struct tcphdr *)((caddr_t)ip + ip_hlen);
782 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
783 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
784 		tcp_hlen = th->th_off << 2;
785 		break;
786 #endif
787 	default:
788 		printf("%s: CSUM_TSO but no supported IP version (0x%04x)\n",
789 		    __func__, ntohs(etype));
790 		return FALSE;
791         }
792 
793         /* Ensure we have at least the IP+TCP header in the first mbuf. */
794         if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr))
795 		return FALSE;
796 
797 	idx = txr->next_avail;
798 	buf = &txr->buffers[idx];
799 	TXD = (struct i40e_tx_context_desc *) &txr->base[idx];
800 	tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen);
801 
802 	type = I40E_TX_DESC_DTYPE_CONTEXT;
803 	cmd = I40E_TX_CTX_DESC_TSO;
804 	mss = mp->m_pkthdr.tso_segsz;
805 
806 	type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) |
807 	    ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) |
808 	    ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) |
809 	    ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT);
810 	TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss);
811 
812 	TXD->tunneling_params = htole32(0);
813 	buf->m_head = NULL;
814 	buf->eop_index = -1;
815 
816 	if (++idx == que->num_desc)
817 		idx = 0;
818 
819 	txr->avail--;
820 	txr->next_avail = idx;
821 
822 	return TRUE;
823 }
824 
825 /*
826 ** ixl_get_tx_head - Retrieve the value from the
827 **    location where the HW records its HEAD index
828 */
829 static inline u32
830 ixl_get_tx_head(struct ixl_queue *que)
831 {
832 	struct tx_ring  *txr = &que->txr;
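	/* The head write-back word lives just past the last descriptor */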
833 	void *head = &txr->base[que->num_desc];
834 	return LE32_TO_CPU(*(volatile __le32 *)head);
835 }
836 
837 /**********************************************************************
838  *
839  *  Examine each tx_buffer in the used queue. If the hardware is done
840  *  processing the packet then free associated resources. The
841  *  tx_buffer is put back on the free queue.
842  *
843  **********************************************************************/
844 bool
845 ixl_txeof(struct ixl_queue *que)
846 {
847 	struct tx_ring		*txr = &que->txr;
848 	u32			first, last, head, done, processed;
849 	struct ixl_tx_buf	*buf;
850 	struct i40e_tx_desc	*tx_desc, *eop_desc;
851 
852 
853 	mtx_assert(&txr->mtx, MA_OWNED);
854 
855 #ifdef DEV_NETMAP
856 	// XXX todo: implement moderation
857 	if (netmap_tx_irq(que->vsi->ifp, que->me))
858 		return FALSE;
859 #endif /* DEF_NETMAP */
860 
861 	/* These are not the descriptors you seek, move along :) */
862 	if (txr->avail == que->num_desc) {
863 		que->busy = 0;
864 		return FALSE;
865 	}
866 
867 	processed = 0;
868 	first = txr->next_to_clean;
869 	buf = &txr->buffers[first];
870 	tx_desc = (struct i40e_tx_desc *)&txr->base[first];
871 	last = buf->eop_index;
872 	if (last == -1)
873 		return FALSE;
874 	eop_desc = (struct i40e_tx_desc *)&txr->base[last];
875 
876 	/* Get the Head WB value */
877 	head = ixl_get_tx_head(que);
878 
879 	/*
880 	** Get the index of the first descriptor
881 	** BEYOND the EOP and call that 'done'.
882 	** I do this so the comparison in the
883 	** inner while loop below can be simple
884 	*/
885 	if (++last == que->num_desc) last = 0;
886 	done = last;
887 
888         bus_dmamap_sync(txr->dma.tag, txr->dma.map,
889             BUS_DMASYNC_POSTREAD);
890 	/*
891 	** The HEAD index of the ring is written to a
892 	** defined location; this, rather than a done bit,
893 	** is what is used to keep track of what must be
894 	** 'cleaned'.
895 	*/
896 	while (first != head) {
897 		/* We clean the range of the packet */
898 		while (first != done) {
899 			++txr->avail;
900 			++processed;
901 
902 			if (buf->m_head) {
903 				txr->bytes += /* for ITR adjustment */
904 				    buf->m_head->m_pkthdr.len;
905 				txr->tx_bytes += /* for TX stats */
906 				    buf->m_head->m_pkthdr.len;
907 				bus_dmamap_sync(buf->tag,
908 				    buf->map,
909 				    BUS_DMASYNC_POSTWRITE);
910 				bus_dmamap_unload(buf->tag,
911 				    buf->map);
912 				m_freem(buf->m_head);
913 				buf->m_head = NULL;
914 				buf->map = NULL;
915 			}
916 			buf->eop_index = -1;
917 
918 			if (++first == que->num_desc)
919 				first = 0;
920 
921 			buf = &txr->buffers[first];
922 			tx_desc = &txr->base[first];
923 		}
924 		++txr->packets;
925 		/* See if there is more work now */
926 		last = buf->eop_index;
927 		if (last != -1) {
928 			eop_desc = &txr->base[last];
929 			/* Get next done point */
930 			if (++last == que->num_desc) last = 0;
931 			done = last;
932 		} else
933 			break;
934 	}
935 	bus_dmamap_sync(txr->dma.tag, txr->dma.map,
936 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
937 
938 	txr->next_to_clean = first;
939 
940 
941 	/*
942 	** Hang detection: we know there is work
943 	** outstanding, or the early return above
944 	** would have been taken, so count this as
945 	** an unsuccessful pass; if the count grows
946 	** too large, the local timer will consider
947 	** the queue hung. If anything has been
948 	** cleaned, reset the state.
949 	*/
950 	if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG))
951 		++que->busy;
952 
953 	if (processed)
954 		que->busy = 1; /* Note this turns off HUNG */
955 
956 	/*
957 	 * If there are no pending descriptors, clear the timeout.
958 	 */
959 	if (txr->avail == que->num_desc) {
960 		que->busy = 0;
961 		return FALSE;
962 	}
963 
964 	return TRUE;
965 }
966 
967 /*********************************************************************
968  *
969  *  Refresh mbuf buffers for RX descriptor rings
970  *   - now keeps its own state so discards due to resource
971  *     exhaustion are unnecessary; if an mbuf cannot be obtained
972  *     it just returns, keeping its placeholder, so it can simply
973  *     be called again later to retry.
974  *
975  **********************************************************************/
976 static void
977 ixl_refresh_mbufs(struct ixl_queue *que, int limit)
978 {
979 	struct ixl_vsi		*vsi = que->vsi;
980 	struct rx_ring		*rxr = &que->rxr;
981 	bus_dma_segment_t	hseg[1];
982 	bus_dma_segment_t	pseg[1];
983 	struct ixl_rx_buf	*buf;
984 	struct mbuf		*mh, *mp;
985 	int			i, j, nsegs, error;
986 	bool			refreshed = FALSE;
987 
988 	i = j = rxr->next_refresh;
989 	/* Control the loop with one beyond */
990 	if (++j == que->num_desc)
991 		j = 0;
992 
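	/*
	** j runs one slot ahead of i, so the refresh never catches up
	** to 'limit', the next index the RX cleanup loop will examine.
	*/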
993 	while (j != limit) {
994 		buf = &rxr->buffers[i];
995 		if (rxr->hdr_split == FALSE)
996 			goto no_split;
997 
998 		if (buf->m_head == NULL) {
999 			mh = m_gethdr(M_NOWAIT, MT_DATA);
1000 			if (mh == NULL)
1001 				goto update;
1002 		} else
1003 			mh = buf->m_head;
1004 
1005 		mh->m_pkthdr.len = mh->m_len = MHLEN;
1007 		mh->m_flags |= M_PKTHDR;
1008 		/* Get the memory mapping */
1009 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1010 		    buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT);
1011 		if (error != 0) {
1012 			printf("Refresh mbufs: hdr dmamap load"
1013 			    " failure - %d\n", error);
1014 			m_free(mh);
1015 			buf->m_head = NULL;
1016 			goto update;
1017 		}
1018 		buf->m_head = mh;
1019 		bus_dmamap_sync(rxr->htag, buf->hmap,
1020 		    BUS_DMASYNC_PREREAD);
1021 		rxr->base[i].read.hdr_addr =
1022 		   htole64(hseg[0].ds_addr);
1023 
1024 no_split:
1025 		if (buf->m_pack == NULL) {
1026 			mp = m_getjcl(M_NOWAIT, MT_DATA,
1027 			    M_PKTHDR, rxr->mbuf_sz);
1028 			if (mp == NULL)
1029 				goto update;
1030 		} else
1031 			mp = buf->m_pack;
1032 
1033 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1034 		/* Get the memory mapping */
1035 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1036 		    buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT);
1037 		if (error != 0) {
1038 			printf("Refresh mbufs: payload dmamap load"
1039 			    " failure - %d\n", error);
1040 			m_free(mp);
1041 			buf->m_pack = NULL;
1042 			goto update;
1043 		}
1044 		buf->m_pack = mp;
1045 		bus_dmamap_sync(rxr->ptag, buf->pmap,
1046 		    BUS_DMASYNC_PREREAD);
1047 		rxr->base[i].read.pkt_addr =
1048 		   htole64(pseg[0].ds_addr);
1049 		/* The header address is used only when doing header split */
1050 		rxr->base[i].read.hdr_addr = 0;
1051 
1052 		refreshed = TRUE;
1053 		/* Next is precalculated */
1054 		i = j;
1055 		rxr->next_refresh = i;
1056 		if (++j == que->num_desc)
1057 			j = 0;
1058 	}
1059 update:
1060 	if (refreshed) /* Update hardware tail index */
1061 		wr32(vsi->hw, rxr->tail, rxr->next_refresh);
1062 	return;
1063 }
1064 
1065 
1066 /*********************************************************************
1067  *
1068  *  Allocate memory for rx_buffer structures. Since we use one
1069  *  rx_buffer per descriptor, the maximum number of rx_buffers
1070  *  that we'll need is equal to the number of receive descriptors
1071  *  that we've defined.
1072  *
1073  **********************************************************************/
1074 int
1075 ixl_allocate_rx_data(struct ixl_queue *que)
1076 {
1077 	struct rx_ring		*rxr = &que->rxr;
1078 	struct ixl_vsi		*vsi = que->vsi;
1079 	device_t 		dev = vsi->dev;
1080 	struct ixl_rx_buf 	*buf;
1081 	int             	i, bsize, error;
1082 
1083 	bsize = sizeof(struct ixl_rx_buf) * que->num_desc;
1084 	if (!(rxr->buffers =
1085 	    (struct ixl_rx_buf *) malloc(bsize,
1086 	    M_DEVBUF, M_NOWAIT | M_ZERO))) {
1087 		device_printf(dev, "Unable to allocate rx_buffer memory\n");
1088 		error = ENOMEM;
1089 		return (error);
1090 	}
1091 
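	/*
	** Two DMA tags: htag maps the small header mbufs (MSIZE) used
	** for header split, ptag maps the payload clusters (up to
	** MJUM16BYTES).
	*/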
1092 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1093 				   1, 0,	/* alignment, bounds */
1094 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1095 				   BUS_SPACE_MAXADDR,	/* highaddr */
1096 				   NULL, NULL,		/* filter, filterarg */
1097 				   MSIZE,		/* maxsize */
1098 				   1,			/* nsegments */
1099 				   MSIZE,		/* maxsegsize */
1100 				   0,			/* flags */
1101 				   NULL,		/* lockfunc */
1102 				   NULL,		/* lockfuncarg */
1103 				   &rxr->htag))) {
1104 		device_printf(dev, "Unable to create RX DMA htag\n");
1105 		return (error);
1106 	}
1107 
1108 	if ((error = bus_dma_tag_create(NULL,	/* parent */
1109 				   1, 0,	/* alignment, bounds */
1110 				   BUS_SPACE_MAXADDR,	/* lowaddr */
1111 				   BUS_SPACE_MAXADDR,	/* highaddr */
1112 				   NULL, NULL,		/* filter, filterarg */
1113 				   MJUM16BYTES,		/* maxsize */
1114 				   1,			/* nsegments */
1115 				   MJUM16BYTES,		/* maxsegsize */
1116 				   0,			/* flags */
1117 				   NULL,		/* lockfunc */
1118 				   NULL,		/* lockfuncarg */
1119 				   &rxr->ptag))) {
1120 		device_printf(dev, "Unable to create RX DMA ptag\n");
1121 		return (error);
1122 	}
1123 
1124 	for (i = 0; i < que->num_desc; i++) {
1125 		buf = &rxr->buffers[i];
1126 		error = bus_dmamap_create(rxr->htag,
1127 		    BUS_DMA_NOWAIT, &buf->hmap);
1128 		if (error) {
1129 			device_printf(dev, "Unable to create RX head map\n");
1130 			break;
1131 		}
1132 		error = bus_dmamap_create(rxr->ptag,
1133 		    BUS_DMA_NOWAIT, &buf->pmap);
1134 		if (error) {
1135 			device_printf(dev, "Unable to create RX pkt map\n");
1136 			break;
1137 		}
1138 	}
1139 
1140 	return (error);
1141 }
1142 
1143 
1144 /*********************************************************************
1145  *
1146  *  (Re)Initialize the queue receive ring and its buffers.
1147  *
1148  **********************************************************************/
1149 int
1150 ixl_init_rx_ring(struct ixl_queue *que)
1151 {
1152 	struct	rx_ring 	*rxr = &que->rxr;
1153 	struct ixl_vsi		*vsi = que->vsi;
1154 #if defined(INET6) || defined(INET)
1155 	struct ifnet		*ifp = vsi->ifp;
1156 	struct lro_ctrl		*lro = &rxr->lro;
1157 #endif
1158 	struct ixl_rx_buf	*buf;
1159 	bus_dma_segment_t	pseg[1], hseg[1];
1160 	int			rsize, nsegs, error = 0;
1161 #ifdef DEV_NETMAP
1162 	struct netmap_adapter *na = NA(que->vsi->ifp);
1163 	struct netmap_slot *slot;
1164 #endif /* DEV_NETMAP */
1165 
1166 	IXL_RX_LOCK(rxr);
1167 #ifdef DEV_NETMAP
1168 	/* same as in ixl_init_tx_ring() */
1169 	slot = netmap_reset(na, NR_RX, que->me, 0);
1170 #endif /* DEV_NETMAP */
1171 	/* Clear the ring contents */
1172 	rsize = roundup2(que->num_desc *
1173 	    sizeof(union i40e_rx_desc), DBA_ALIGN);
1174 	bzero((void *)rxr->base, rsize);
1175 	/* Cleanup any existing buffers */
1176 	for (int i = 0; i < que->num_desc; i++) {
1177 		buf = &rxr->buffers[i];
1178 		if (buf->m_head != NULL) {
1179 			bus_dmamap_sync(rxr->htag, buf->hmap,
1180 			    BUS_DMASYNC_POSTREAD);
1181 			bus_dmamap_unload(rxr->htag, buf->hmap);
1182 			buf->m_head->m_flags |= M_PKTHDR;
1183 			m_freem(buf->m_head);
1184 		}
1185 		if (buf->m_pack != NULL) {
1186 			bus_dmamap_sync(rxr->ptag, buf->pmap,
1187 			    BUS_DMASYNC_POSTREAD);
1188 			bus_dmamap_unload(rxr->ptag, buf->pmap);
1189 			buf->m_pack->m_flags |= M_PKTHDR;
1190 			m_freem(buf->m_pack);
1191 		}
1192 		buf->m_head = NULL;
1193 		buf->m_pack = NULL;
1194 	}
1195 
1196 	/* header split is off */
1197 	rxr->hdr_split = FALSE;
1198 
1199 	/* Now replenish the mbufs */
1200 	for (int j = 0; j != que->num_desc; ++j) {
1201 		struct mbuf	*mh, *mp;
1202 
1203 		buf = &rxr->buffers[j];
1204 #ifdef DEV_NETMAP
1205 		/*
1206 		 * In netmap mode, fill the map and set the buffer
1207 		 * address in the NIC ring, considering the offset
1208 		 * between the netmap and NIC rings (see comment in
1209 		 * ixgbe_setup_transmit_ring() ). No need to allocate
1210 		 * an mbuf, so end the block with a continue;
1211 		 */
1212 		if (slot) {
1213 			int sj = netmap_idx_n2k(&na->rx_rings[que->me], j);
1214 			uint64_t paddr;
1215 			void *addr;
1216 
1217 			addr = PNMB(na, slot + sj, &paddr);
1218 			netmap_load_map(na, rxr->dma.tag, buf->pmap, addr);
1219 			/* Update descriptor and the cached value */
1220 			rxr->base[j].read.pkt_addr = htole64(paddr);
1221 			rxr->base[j].read.hdr_addr = 0;
1222 			continue;
1223 		}
1224 #endif /* DEV_NETMAP */
1225 		/*
1226 		** Don't allocate mbufs if not
1227 		** doing header split; it's wasteful.
1228 		*/
1229 		if (rxr->hdr_split == FALSE)
1230 			goto skip_head;
1231 
1232 		/* First the header */
1233 		buf->m_head = m_gethdr(M_NOWAIT, MT_DATA);
1234 		if (buf->m_head == NULL) {
1235 			error = ENOBUFS;
1236 			goto fail;
1237 		}
1238 		m_adj(buf->m_head, ETHER_ALIGN);
1239 		mh = buf->m_head;
1240 		mh->m_len = mh->m_pkthdr.len = MHLEN;
1241 		mh->m_flags |= M_PKTHDR;
1242 		/* Get the memory mapping */
1243 		error = bus_dmamap_load_mbuf_sg(rxr->htag,
1244 		    buf->hmap, buf->m_head, hseg,
1245 		    &nsegs, BUS_DMA_NOWAIT);
1246 		if (error != 0) /* Nothing elegant to do here */
1247 			goto fail;
1248 		bus_dmamap_sync(rxr->htag,
1249 		    buf->hmap, BUS_DMASYNC_PREREAD);
1250 		/* Update descriptor */
1251 		rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr);
1252 
1253 skip_head:
1254 		/* Now the payload cluster */
1255 		buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA,
1256 		    M_PKTHDR, rxr->mbuf_sz);
1257 		if (buf->m_pack == NULL) {
1258 			error = ENOBUFS;
1259                         goto fail;
1260 		}
1261 		mp = buf->m_pack;
1262 		mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz;
1263 		/* Get the memory mapping */
1264 		error = bus_dmamap_load_mbuf_sg(rxr->ptag,
1265 		    buf->pmap, mp, pseg,
1266 		    &nsegs, BUS_DMA_NOWAIT);
1267 		if (error != 0)
1268                         goto fail;
1269 		bus_dmamap_sync(rxr->ptag,
1270 		    buf->pmap, BUS_DMASYNC_PREREAD);
1271 		/* Update descriptor */
1272 		rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr);
1273 		rxr->base[j].read.hdr_addr = 0;
1274 	}
1275 
1276 
1277 	/* Setup our descriptor indices */
1278 	rxr->next_check = 0;
1279 	rxr->next_refresh = 0;
1280 	rxr->lro_enabled = FALSE;
1281 	rxr->split = 0;
1282 	rxr->bytes = 0;
1283 	rxr->discard = FALSE;
1284 
1285 	wr32(vsi->hw, rxr->tail, que->num_desc - 1);
1286 	ixl_flush(vsi->hw);
1287 
1288 #if defined(INET6) || defined(INET)
1289 	/*
1290 	** Now set up the LRO interface:
1291 	*/
1292 	if (ifp->if_capenable & IFCAP_LRO) {
1293 		int err = tcp_lro_init(lro);
1294 		if (err) {
1295 			if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me);
1296 			goto fail;
1297 		}
1298 		INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me);
1299 		rxr->lro_enabled = TRUE;
1300 		lro->ifp = vsi->ifp;
1301 	}
1302 #endif
1303 
1304 	bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1305 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1306 
1307 fail:
1308 	IXL_RX_UNLOCK(rxr);
1309 	return (error);
1310 }
1311 
1312 
1313 /*********************************************************************
1314  *
1315  *  Free the receive ring data structures
1316  *
1317  **********************************************************************/
1318 void
1319 ixl_free_que_rx(struct ixl_queue *que)
1320 {
1321 	struct rx_ring		*rxr = &que->rxr;
1322 	struct ixl_rx_buf	*buf;
1323 
1324 	INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me);
1325 
1326 	/* Cleanup any existing buffers */
1327 	if (rxr->buffers != NULL) {
1328 		for (int i = 0; i < que->num_desc; i++) {
1329 			buf = &rxr->buffers[i];
1330 			if (buf->m_head != NULL) {
1331 				bus_dmamap_sync(rxr->htag, buf->hmap,
1332 				    BUS_DMASYNC_POSTREAD);
1333 				bus_dmamap_unload(rxr->htag, buf->hmap);
1334 				buf->m_head->m_flags |= M_PKTHDR;
1335 				m_freem(buf->m_head);
1336 			}
1337 			if (buf->m_pack != NULL) {
1338 				bus_dmamap_sync(rxr->ptag, buf->pmap,
1339 				    BUS_DMASYNC_POSTREAD);
1340 				bus_dmamap_unload(rxr->ptag, buf->pmap);
1341 				buf->m_pack->m_flags |= M_PKTHDR;
1342 				m_freem(buf->m_pack);
1343 			}
1344 			buf->m_head = NULL;
1345 			buf->m_pack = NULL;
1346 			if (buf->hmap != NULL) {
1347 				bus_dmamap_destroy(rxr->htag, buf->hmap);
1348 				buf->hmap = NULL;
1349 			}
1350 			if (buf->pmap != NULL) {
1351 				bus_dmamap_destroy(rxr->ptag, buf->pmap);
1352 				buf->pmap = NULL;
1353 			}
1354 		}
1355 		if (rxr->buffers != NULL) {
1356 			free(rxr->buffers, M_DEVBUF);
1357 			rxr->buffers = NULL;
1358 		}
1359 	}
1360 
1361 	if (rxr->htag != NULL) {
1362 		bus_dma_tag_destroy(rxr->htag);
1363 		rxr->htag = NULL;
1364 	}
1365 	if (rxr->ptag != NULL) {
1366 		bus_dma_tag_destroy(rxr->ptag);
1367 		rxr->ptag = NULL;
1368 	}
1369 
1370 	INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me);
1371 	return;
1372 }
1373 
1374 static __inline void
1375 ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype)
1376 {
1377 
1378 #if defined(INET6) || defined(INET)
1379         /*
1380          * At the moment LRO is only for IPv4/TCP packets whose TCP checksum
1381          * has been verified by the hardware, and which carry no VLAN tag in
1382          * the Ethernet header.
1383          */
1384         if (rxr->lro_enabled &&
1385             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 &&
1386             (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
1387             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) {
1388                 /*
1389                  * Send to the stack if:
1390                  **  - LRO not enabled, or
1391                  **  - no LRO resources, or
1392                  **  - lro enqueue fails
1393                  */
1394                 if (rxr->lro.lro_cnt != 0)
1395                         if (tcp_lro_rx(&rxr->lro, m, 0) == 0)
1396                                 return;
1397         }
1398 #endif
1399 	IXL_RX_UNLOCK(rxr);
1400         (*ifp->if_input)(ifp, m);
1401 	IXL_RX_LOCK(rxr);
1402 }
1403 
1404 
1405 static __inline void
1406 ixl_rx_discard(struct rx_ring *rxr, int i)
1407 {
1408 	struct ixl_rx_buf	*rbuf;
1409 
1410 	rbuf = &rxr->buffers[i];
1411 
1412         if (rbuf->fmp != NULL) {/* Partial chain ? */
1413 		rbuf->fmp->m_flags |= M_PKTHDR;
1414                 m_freem(rbuf->fmp);
1415                 rbuf->fmp = NULL;
1416 	}
1417 
1418 	/*
1419 	** With advanced descriptors the writeback
1420 	** clobbers the buffer addresses, so it's easier
1421 	** to just free the existing mbufs and take
1422 	** the normal refresh path to get new buffers
1423 	** and mappings.
1424 	*/
1425 	if (rbuf->m_head) {
1426 		m_free(rbuf->m_head);
1427 		rbuf->m_head = NULL;
1428 	}
1429 
1430 	if (rbuf->m_pack) {
1431 		m_free(rbuf->m_pack);
1432 		rbuf->m_pack = NULL;
1433 	}
1434 
1435 	return;
1436 }
1437 
1438 #ifdef RSS
1439 /*
1440 ** i40e_ptype_to_hash: parse the packet type
1441 ** to determine the appropriate hash.
1442 */
1443 static inline int
1444 ixl_ptype_to_hash(u8 ptype)
1445 {
1446         struct i40e_rx_ptype_decoded	decoded;
1447 	u8				ex = 0;
1448 
1449 	decoded = decode_rx_desc_ptype(ptype);
1450 	ex = decoded.outer_frag;
1451 
1452 	if (!decoded.known)
1453 		return M_HASHTYPE_OPAQUE;
1454 
1455 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_L2)
1456 		return M_HASHTYPE_OPAQUE;
1457 
1458 	/* Note: anything that gets to this point is IP */
1459         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) {
1460 		switch (decoded.inner_prot) {
1461 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1462 				if (ex)
1463 					return M_HASHTYPE_RSS_TCP_IPV6_EX;
1464 				else
1465 					return M_HASHTYPE_RSS_TCP_IPV6;
1466 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1467 				if (ex)
1468 					return M_HASHTYPE_RSS_UDP_IPV6_EX;
1469 				else
1470 					return M_HASHTYPE_RSS_UDP_IPV6;
1471 			default:
1472 				if (ex)
1473 					return M_HASHTYPE_RSS_IPV6_EX;
1474 				else
1475 					return M_HASHTYPE_RSS_IPV6;
1476 		}
1477 	}
1478         if (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4) {
1479 		switch (decoded.inner_prot) {
1480 			case I40E_RX_PTYPE_INNER_PROT_TCP:
1481 					return M_HASHTYPE_RSS_TCP_IPV4;
1482 			case I40E_RX_PTYPE_INNER_PROT_UDP:
1483 				if (ex)
1484 					return M_HASHTYPE_RSS_UDP_IPV4_EX;
1485 				else
1486 					return M_HASHTYPE_RSS_UDP_IPV4;
1487 			default:
1488 					return M_HASHTYPE_RSS_IPV4;
1489 		}
1490 	}
1491 	/* We should never get here!! */
1492 	return M_HASHTYPE_OPAQUE;
1493 }
1494 #endif /* RSS */
1495 
1496 /*********************************************************************
1497  *
1498  *  This routine executes in interrupt context. It replenishes
1499  *  the mbufs in the descriptor ring and sends data that has been
1500  *  DMA'd into host memory to the upper layers.
1501  *
1502  *  We loop at most count times if count is > 0, or until done if
1503  *  count < 0.
1504  *
1505  *  Return TRUE for more work, FALSE for all clean.
1506  *********************************************************************/
1507 bool
1508 ixl_rxeof(struct ixl_queue *que, int count)
1509 {
1510 	struct ixl_vsi		*vsi = que->vsi;
1511 	struct rx_ring		*rxr = &que->rxr;
1512 	struct ifnet		*ifp = vsi->ifp;
1513 #if defined(INET6) || defined(INET)
1514 	struct lro_ctrl		*lro = &rxr->lro;
1515 	struct lro_entry	*queued;
1516 #endif
1517 	int			i, nextp, processed = 0;
1518 	union i40e_rx_desc	*cur;
1519 	struct ixl_rx_buf	*rbuf, *nbuf;
1520 
1521 
1522 	IXL_RX_LOCK(rxr);
1523 
1524 #ifdef DEV_NETMAP
1525 	if (netmap_rx_irq(ifp, que->me, &count)) {
1526 		IXL_RX_UNLOCK(rxr);
1527 		return (FALSE);
1528 	}
1529 #endif /* DEV_NETMAP */
1530 
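	/*
	** Walk the ring from next_check, parsing each descriptor's
	** writeback qword for status, error, lengths and packet type,
	** until the DD bit is clear or 'count' is exhausted.
	*/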
1531 	for (i = rxr->next_check; count != 0;) {
1532 		struct mbuf	*sendmp, *mh, *mp;
1533 		u32		status, error;
1534 		u16		hlen, plen, vtag;
1535 		u64		qword;
1536 		u8		ptype;
1537 		bool		eop;
1538 
1539 		/* Sync the ring. */
1540 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1541 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1542 
1543 		cur = &rxr->base[i];
1544 		qword = le64toh(cur->wb.qword1.status_error_len);
1545 		status = (qword & I40E_RXD_QW1_STATUS_MASK)
1546 		    >> I40E_RXD_QW1_STATUS_SHIFT;
1547 		error = (qword & I40E_RXD_QW1_ERROR_MASK)
1548 		    >> I40E_RXD_QW1_ERROR_SHIFT;
1549 		plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK)
1550 		    >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
1551 		hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK)
1552 		    >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT;
1553 		ptype = (qword & I40E_RXD_QW1_PTYPE_MASK)
1554 		    >> I40E_RXD_QW1_PTYPE_SHIFT;
1555 
1556 		if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) {
1557 			++rxr->not_done;
1558 			break;
1559 		}
1560 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1561 			break;
1562 
1563 		count--;
1564 		sendmp = NULL;
1565 		nbuf = NULL;
1567 		cur->wb.qword1.status_error_len = 0;
1568 		rbuf = &rxr->buffers[i];
1569 		mh = rbuf->m_head;
1570 		mp = rbuf->m_pack;
1571 		eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT));
1572 		if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT))
1573 			vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1);
1574 		else
1575 			vtag = 0;
1576 
1577 		/*
1578 		** Make sure bad packets are discarded;
1579 		** note that only the EOP descriptor has valid
1580 		** error results.
1581 		*/
1582                 if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) {
1583 			rxr->discarded++;
1584 			ixl_rx_discard(rxr, i);
1585 			goto next_desc;
1586 		}
1587 
1588 		/* Prefetch the next buffer */
1589 		if (!eop) {
1590 			nextp = i + 1;
1591 			if (nextp == que->num_desc)
1592 				nextp = 0;
1593 			nbuf = &rxr->buffers[nextp];
1594 			prefetch(nbuf);
1595 		}
1596 
1597 		/*
1598 		** The header mbuf is ONLY used when header
1599 		** split is enabled, otherwise we get normal
1600 		** behavior, i.e., both header and payload
1601 		** are DMA'd into the payload buffer.
1602 		**
1603 		** Rather than using the fmp/lmp global pointers
1604 		** we now keep the head of a packet chain in the
1605 		** buffer struct and pass this along from one
1606 		** descriptor to the next, until we get EOP.
1607 		*/
1608 		if (rxr->hdr_split && (rbuf->fmp == NULL)) {
1609 			if (hlen > IXL_RX_HDR)
1610 				hlen = IXL_RX_HDR;
1611 			mh->m_len = hlen;
1612 			mh->m_flags |= M_PKTHDR;
1613 			mh->m_next = NULL;
1614 			mh->m_pkthdr.len = mh->m_len;
1615 			/* Null buf pointer so it is refreshed */
1616 			rbuf->m_head = NULL;
1617 			/*
1618 			** Check the payload length; this
1619 			** could be zero if it's a small
1620 			** packet.
1621 			*/
1622 			if (plen > 0) {
1623 				mp->m_len = plen;
1624 				mp->m_next = NULL;
1625 				mp->m_flags &= ~M_PKTHDR;
1626 				mh->m_next = mp;
1627 				mh->m_pkthdr.len += mp->m_len;
1628 				/* Null buf pointer so it is refreshed */
1629 				rbuf->m_pack = NULL;
1630 				rxr->split++;
1631 			}
1632 			/*
1633 			** Now create the forward
1634 			** chain so when complete
1635 			** we won't have to.
1636 			*/
1637                         if (eop == 0) {
1638 				/* stash the chain head */
1639                                 nbuf->fmp = mh;
1640 				/* Make forward chain */
1641                                 if (plen)
1642                                         mp->m_next = nbuf->m_pack;
1643                                 else
1644                                         mh->m_next = nbuf->m_pack;
1645                         } else {
1646 				/* Singlet, prepare to send */
1647                                 sendmp = mh;
1648                                 if (vtag) {
1649                                         sendmp->m_pkthdr.ether_vtag = vtag;
1650                                         sendmp->m_flags |= M_VLANTAG;
1651                                 }
1652                         }
1653 		} else {
1654 			/*
1655 			** Either no header split, or a
1656 			** secondary piece of a fragmented
1657 			** split packet.
1658 			*/
1659 			mp->m_len = plen;
1660 			/*
1661 			** See if there is a stored chain head that
1662 			** tells us if this continues an earlier packet.
1663 			*/
1664 			sendmp = rbuf->fmp;
1665 			rbuf->m_pack = rbuf->fmp = NULL;
1666 
1667 			if (sendmp != NULL) /* secondary frag */
1668 				sendmp->m_pkthdr.len += mp->m_len;
1669 			else {
1670 				/* first desc of a non-ps chain */
1671 				sendmp = mp;
1672 				sendmp->m_flags |= M_PKTHDR;
1673 				sendmp->m_pkthdr.len = mp->m_len;
1674 				if (vtag) {
1675 					sendmp->m_pkthdr.ether_vtag = vtag;
1676 					sendmp->m_flags |= M_VLANTAG;
1677 				}
1678                         }
1679 			/* Pass the head pointer on */
1680 			if (eop == 0) {
1681 				nbuf->fmp = sendmp;
1682 				sendmp = NULL;
1683 				mp->m_next = nbuf->m_pack;
1684 			}
1685 		}
1686 		++processed;
1687 		/* Sending this frame? */
1688 		if (eop) {
1689 			sendmp->m_pkthdr.rcvif = ifp;
1690 			/* gather stats */
1691 			rxr->rx_packets++;
1692 			rxr->rx_bytes += sendmp->m_pkthdr.len;
1693 			/* capture data for dynamic ITR adjustment */
1694 			rxr->packets++;
1695 			rxr->bytes += sendmp->m_pkthdr.len;
1696 			if ((ifp->if_capenable & IFCAP_RXCSUM) != 0)
1697 				ixl_rx_checksum(sendmp, status, error, ptype);
1698 #ifdef RSS
1699 			sendmp->m_pkthdr.flowid =
1700 			    le32toh(cur->wb.qword0.hi_dword.rss);
1701 			M_HASHTYPE_SET(sendmp, ixl_ptype_to_hash(ptype));
1702 #else
1703 			sendmp->m_pkthdr.flowid = que->msix;
1704 			M_HASHTYPE_SET(sendmp, M_HASHTYPE_OPAQUE);
1705 #endif
1706 		}
1707 next_desc:
1708 		bus_dmamap_sync(rxr->dma.tag, rxr->dma.map,
1709 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1710 
1711 		/* Advance our pointers to the next descriptor. */
1712 		if (++i == que->num_desc)
1713 			i = 0;
1714 
1715 		/* Now send to the stack or do LRO */
1716 		if (sendmp != NULL) {
1717 			rxr->next_check = i;
1718 			ixl_rx_input(rxr, ifp, sendmp, ptype);
1719 			i = rxr->next_check;
1720 		}
1721 
1722 		/* Every 8 descriptors we refresh the mbufs */
1723 		if (processed == 8) {
1724 			ixl_refresh_mbufs(que, i);
1725 			processed = 0;
1726 		}
1727 	}
1728 
1729 	/* Refresh any remaining buf structs */
1730 	if (ixl_rx_unrefreshed(que))
1731 		ixl_refresh_mbufs(que, i);
1732 
1733 	rxr->next_check = i;
1734 
1735 #if defined(INET6) || defined(INET)
1736 	/*
1737 	 * Flush any outstanding LRO work
1738 	 */
1739 	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1740 		SLIST_REMOVE_HEAD(&lro->lro_active, next);
1741 		tcp_lro_flush(lro, queued);
1742 	}
1743 #endif
1744 
1745 	IXL_RX_UNLOCK(rxr);
1746 	return (FALSE);
1747 }
1748 
1749 
1750 /*********************************************************************
1751  *
1752  *  Verify that the hardware indicated that the checksum is valid.
1753  *  Inform the stack about the status of the checksum so that the
1754  *  stack doesn't spend time verifying it.
1755  *
1756  *********************************************************************/
1757 static void
1758 ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype)
1759 {
1760 	struct i40e_rx_ptype_decoded decoded;
1761 
1762 	decoded = decode_rx_desc_ptype(ptype);
1763 
1764 	/* Errors? */
1765  	if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
1766 	    (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) {
1767 		mp->m_pkthdr.csum_flags = 0;
1768 		return;
1769 	}
1770 
1771 	/* IPv6 with extension headers likely has a bad csum */
1772 	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
1773 	    decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6)
1774 		if (status &
1775 		    (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) {
1776 			mp->m_pkthdr.csum_flags = 0;
1777 			return;
1778 		}
1779 
1780 
1781 	/* IP Checksum Good */
1782 	mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED;
1783 	mp->m_pkthdr.csum_flags |= CSUM_IP_VALID;
1784 
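	/*
	** L3L4P set (with no L4E error above) means the hardware also
	** verified the L4 checksum; CSUM_DATA_VALID | CSUM_PSEUDO_HDR
	** with csum_data 0xffff tells the stack to skip its own check.
	*/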
1785 	if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) {
1786 		mp->m_pkthdr.csum_flags |=
1787 		    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1788 		mp->m_pkthdr.csum_data |= htons(0xffff);
1789 	}
1790 	return;
1791 }
1792 
1793 #if __FreeBSD_version >= 1100000
1794 uint64_t
1795 ixl_get_counter(if_t ifp, ift_counter cnt)
1796 {
1797 	struct ixl_vsi *vsi;
1798 
1799 	vsi = if_getsoftc(ifp);
1800 
1801 	switch (cnt) {
1802 	case IFCOUNTER_IPACKETS:
1803 		return (vsi->ipackets);
1804 	case IFCOUNTER_IERRORS:
1805 		return (vsi->ierrors);
1806 	case IFCOUNTER_OPACKETS:
1807 		return (vsi->opackets);
1808 	case IFCOUNTER_OERRORS:
1809 		return (vsi->oerrors);
1810 	case IFCOUNTER_COLLISIONS:
1811 		/* Collisions are impossible in full-duplex 40G/10G Ethernet */
1812 		return (0);
1813 	case IFCOUNTER_IBYTES:
1814 		return (vsi->ibytes);
1815 	case IFCOUNTER_OBYTES:
1816 		return (vsi->obytes);
1817 	case IFCOUNTER_IMCASTS:
1818 		return (vsi->imcasts);
1819 	case IFCOUNTER_OMCASTS:
1820 		return (vsi->omcasts);
1821 	case IFCOUNTER_IQDROPS:
1822 		return (vsi->iqdrops);
1823 	case IFCOUNTER_OQDROPS:
1824 		return (vsi->oqdrops);
1825 	case IFCOUNTER_NOPROTO:
1826 		return (vsi->noproto);
1827 	default:
1828 		return (if_get_counter_default(ifp, cnt));
1829 	}
1830 }
1831 #endif
1832 
1833