xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision a491581f3f8df07cdff0236bd556895205929af4)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 
87 #include <machine/atomic.h>
88 #include <machine/in_cksum.h>
89 
90 #include <net/bpf.h>
91 #include <net/ethernet.h>
92 #include <net/if.h>
93 #include <net/if_dl.h>
94 #include <net/if_media.h>
95 #include <net/if_types.h>
96 #include <net/if_var.h>
97 #include <net/rndis.h>
98 #ifdef RSS
99 #include <net/rss_config.h>
100 #endif
101 
102 #include <netinet/in_systm.h>
103 #include <netinet/in.h>
104 #include <netinet/ip.h>
105 #include <netinet/ip6.h>
106 #include <netinet/tcp.h>
107 #include <netinet/tcp_lro.h>
108 #include <netinet/udp.h>
109 
110 #include <dev/hyperv/include/hyperv.h>
111 #include <dev/hyperv/include/hyperv_busdma.h>
112 #include <dev/hyperv/include/vmbus.h>
113 #include <dev/hyperv/include/vmbus_xact.h>
114 
115 #include <dev/hyperv/netvsc/ndis.h>
116 #include <dev/hyperv/netvsc/if_hnreg.h>
117 #include <dev/hyperv/netvsc/if_hnvar.h>
118 #include <dev/hyperv/netvsc/hn_nvs.h>
119 #include <dev/hyperv/netvsc/hn_rndis.h>
120 
121 #include "vmbus_if.h"
122 
123 #define HN_IFSTART_SUPPORT
124 
125 #define HN_RING_CNT_DEF_MAX		8
126 
127 #define HN_VFMAP_SIZE_DEF		8
128 
129 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
130 
131 /* YYY should get it from the underlying channel */
132 #define HN_TX_DESC_CNT			512
133 
134 #define HN_RNDIS_PKT_LEN					\
135 	(sizeof(struct rndis_packet_msg) +			\
136 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
140 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
141 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
142 
143 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
144 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
145 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
146 /* -1 for RNDIS packet message */
147 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
148 
149 #define HN_DIRECT_TX_SIZE_DEF		128
150 
151 #define HN_EARLY_TXEOF_THRESH		8
152 
153 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
154 
155 #define HN_LROENT_CNT_DEF		128
156 
157 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
158 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
159 /* YYY 2*MTU is a bit rough, but should be good enough. */
160 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
161 
162 #define HN_LRO_ACKCNT_DEF		1
163 
164 #define HN_LOCK_INIT(sc)		\
165 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
166 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
167 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
168 #define HN_LOCK(sc)					\
169 do {							\
170 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
171 		/* Relinquish cpu to avoid deadlock */	\
172 		sched_relinquish(curthread);		\
173 		DELAY(1000);				\
174 	}						\
175 } while (0)
176 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
177 
178 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
179 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
180 #define HN_CSUM_IP_HWASSIST(sc)		\
181 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
182 #define HN_CSUM_IP6_HWASSIST(sc)	\
183 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
184 
185 #define HN_PKTSIZE_MIN(align)		\
186 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
187 	    HN_RNDIS_PKT_LEN, (align))
188 #define HN_PKTSIZE(m, align)		\
189 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
190 
191 #ifdef RSS
192 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
193 #else
194 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
195 #endif
196 
197 struct hn_txdesc {
198 #ifndef HN_USE_TXDESC_BUFRING
199 	SLIST_ENTRY(hn_txdesc)		link;
200 #endif
201 	STAILQ_ENTRY(hn_txdesc)		agg_link;
202 
203 	/* Aggregated txdescs, in sending order. */
204 	STAILQ_HEAD(, hn_txdesc)	agg_list;
205 
206 	/* The oldest packet, if transmission aggregation happens. */
207 	struct mbuf			*m;
208 	struct hn_tx_ring		*txr;
209 	int				refs;
210 	uint32_t			flags;	/* HN_TXD_FLAG_ */
211 	struct hn_nvs_sendctx		send_ctx;
212 	uint32_t			chim_index;
213 	int				chim_size;
214 
215 	bus_dmamap_t			data_dmap;
216 
217 	bus_addr_t			rndis_pkt_paddr;
218 	struct rndis_packet_msg		*rndis_pkt;
219 	bus_dmamap_t			rndis_pkt_dmap;
220 };
221 
222 #define HN_TXD_FLAG_ONLIST		0x0001
223 #define HN_TXD_FLAG_DMAMAP		0x0002
224 #define HN_TXD_FLAG_ONAGG		0x0004
225 
226 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
227 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
228 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
229 
230 struct packet_info_id {
231 	uint8_t				ver;
232 	uint8_t				flag;
233 	uint16_t			pkt_id;
234 };
235 
236 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
237 
238 
239 struct hn_rxinfo {
240 	const uint32_t			*vlan_info;
241 	const uint32_t			*csum_info;
242 	const uint32_t			*hash_info;
243 	const uint32_t			*hash_value;
244 	const struct packet_info_id	*pktinfo_id;
245 };
246 
247 struct hn_rxvf_setarg {
248 	struct hn_rx_ring	*rxr;
249 	struct ifnet		*vf_ifp;
250 };
251 
252 #define HN_RXINFO_VLAN			0x0001
253 #define HN_RXINFO_CSUM			0x0002
254 #define HN_RXINFO_HASHINF		0x0004
255 #define HN_RXINFO_HASHVAL		0x0008
256 #define HN_RXINFO_PKTINFO_ID		0x0010
257 #define HN_RXINFO_ALL			\
258 	(HN_RXINFO_VLAN |		\
259 	 HN_RXINFO_CSUM |		\
260 	 HN_RXINFO_HASHINF |		\
261 	 HN_RXINFO_HASHVAL |		\
262 	 HN_RXINFO_PKTINFO_ID)
263 
264 static int			hn_probe(device_t);
265 static int			hn_attach(device_t);
266 static int			hn_detach(device_t);
267 static int			hn_shutdown(device_t);
268 static void			hn_chan_callback(struct vmbus_channel *,
269 				    void *);
270 
271 static void			hn_init(void *);
272 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
273 #ifdef HN_IFSTART_SUPPORT
274 static void			hn_start(struct ifnet *);
275 #endif
276 static int			hn_transmit(struct ifnet *, struct mbuf *);
277 static void			hn_xmit_qflush(struct ifnet *);
278 static int			hn_ifmedia_upd(struct ifnet *);
279 static void			hn_ifmedia_sts(struct ifnet *,
280 				    struct ifmediareq *);
281 
282 static void			hn_ifnet_event(void *, struct ifnet *, int);
283 static void			hn_ifaddr_event(void *, struct ifnet *);
284 static void			hn_ifnet_attevent(void *, struct ifnet *);
285 static void			hn_ifnet_detevent(void *, struct ifnet *);
286 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
287 
288 static bool			hn_ismyvf(const struct hn_softc *,
289 				    const struct ifnet *);
290 static void			hn_rxvf_change(struct hn_softc *,
291 				    struct ifnet *, bool);
292 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
293 static void			hn_rxvf_set_task(void *, int);
294 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
295 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
296 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
297 				    struct ifreq *);
298 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
299 static bool			hn_xpnt_vf_isready(struct hn_softc *);
300 static void			hn_xpnt_vf_setready(struct hn_softc *);
301 static void			hn_xpnt_vf_init_taskfunc(void *, int);
302 static void			hn_xpnt_vf_init(struct hn_softc *);
303 static void			hn_xpnt_vf_setenable(struct hn_softc *);
304 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
305 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
306 static void			hn_vf_rss_restore(struct hn_softc *);
307 
308 static int			hn_rndis_rxinfo(const void *, int,
309 				    struct hn_rxinfo *);
310 static void			hn_rndis_rx_data(struct hn_rx_ring *,
311 				    const void *, int);
312 static void			hn_rndis_rx_status(struct hn_softc *,
313 				    const void *, int);
314 static void			hn_rndis_init_fixat(struct hn_softc *, int);
315 
316 static void			hn_nvs_handle_notify(struct hn_softc *,
317 				    const struct vmbus_chanpkt_hdr *);
318 static void			hn_nvs_handle_comp(struct hn_softc *,
319 				    struct vmbus_channel *,
320 				    const struct vmbus_chanpkt_hdr *);
321 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
322 				    struct vmbus_channel *,
323 				    const struct vmbus_chanpkt_hdr *);
324 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
325 				    struct vmbus_channel *, uint64_t);
326 
327 #if __FreeBSD_version >= 1100099
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 #endif
331 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
333 #if __FreeBSD_version < 1100095
334 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
335 #else
336 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
337 #endif
338 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
345 #ifndef RSS
346 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
348 #endif
349 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
360 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
361 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
362 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
363 
364 static void			hn_stop(struct hn_softc *, bool);
365 static void			hn_init_locked(struct hn_softc *);
366 static int			hn_chan_attach(struct hn_softc *,
367 				    struct vmbus_channel *);
368 static void			hn_chan_detach(struct hn_softc *,
369 				    struct vmbus_channel *);
370 static int			hn_attach_subchans(struct hn_softc *);
371 static void			hn_detach_allchans(struct hn_softc *);
372 static void			hn_chan_rollup(struct hn_rx_ring *,
373 				    struct hn_tx_ring *);
374 static void			hn_set_ring_inuse(struct hn_softc *, int);
375 static int			hn_synth_attach(struct hn_softc *, int);
376 static void			hn_synth_detach(struct hn_softc *);
377 static int			hn_synth_alloc_subchans(struct hn_softc *,
378 				    int *);
379 static bool			hn_synth_attachable(const struct hn_softc *);
380 static void			hn_suspend(struct hn_softc *);
381 static void			hn_suspend_data(struct hn_softc *);
382 static void			hn_suspend_mgmt(struct hn_softc *);
383 static void			hn_resume(struct hn_softc *);
384 static void			hn_resume_data(struct hn_softc *);
385 static void			hn_resume_mgmt(struct hn_softc *);
386 static void			hn_suspend_mgmt_taskfunc(void *, int);
387 static void			hn_chan_drain(struct hn_softc *,
388 				    struct vmbus_channel *);
389 static void			hn_disable_rx(struct hn_softc *);
390 static void			hn_drain_rxtx(struct hn_softc *, int);
391 static void			hn_polling(struct hn_softc *, u_int);
392 static void			hn_chan_polling(struct vmbus_channel *, u_int);
393 static void			hn_mtu_change_fixup(struct hn_softc *);
394 
395 static void			hn_update_link_status(struct hn_softc *);
396 static void			hn_change_network(struct hn_softc *);
397 static void			hn_link_taskfunc(void *, int);
398 static void			hn_netchg_init_taskfunc(void *, int);
399 static void			hn_netchg_status_taskfunc(void *, int);
400 static void			hn_link_status(struct hn_softc *);
401 
402 static int			hn_create_rx_data(struct hn_softc *, int);
403 static void			hn_destroy_rx_data(struct hn_softc *);
404 static int			hn_check_iplen(const struct mbuf *, int);
405 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
406 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
407 static int			hn_rxfilter_config(struct hn_softc *);
408 static int			hn_rss_reconfig(struct hn_softc *);
409 static void			hn_rss_ind_fixup(struct hn_softc *);
410 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
411 static int			hn_rxpkt(struct hn_rx_ring *);
412 static uint32_t			hn_rss_type_fromndis(uint32_t);
413 static uint32_t			hn_rss_type_tondis(uint32_t);
414 
415 static int			hn_tx_ring_create(struct hn_softc *, int);
416 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
417 static int			hn_create_tx_data(struct hn_softc *, int);
418 static void			hn_fixup_tx_data(struct hn_softc *);
419 static void			hn_fixup_rx_data(struct hn_softc *);
420 static void			hn_destroy_tx_data(struct hn_softc *);
421 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
422 static void			hn_txdesc_gc(struct hn_tx_ring *,
423 				    struct hn_txdesc *);
424 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
425 				    struct hn_txdesc *, struct mbuf **);
426 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
427 				    struct hn_txdesc *);
428 static void			hn_set_chim_size(struct hn_softc *, int);
429 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
430 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
431 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
432 static void			hn_resume_tx(struct hn_softc *, int);
433 static void			hn_set_txagg(struct hn_softc *);
434 static void			*hn_try_txagg(struct ifnet *,
435 				    struct hn_tx_ring *, struct hn_txdesc *,
436 				    int);
437 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
438 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
439 				    struct hn_softc *, struct vmbus_channel *,
440 				    const void *, int);
441 static int			hn_txpkt_sglist(struct hn_tx_ring *,
442 				    struct hn_txdesc *);
443 static int			hn_txpkt_chim(struct hn_tx_ring *,
444 				    struct hn_txdesc *);
445 static int			hn_xmit(struct hn_tx_ring *, int);
446 static void			hn_xmit_taskfunc(void *, int);
447 static void			hn_xmit_txeof(struct hn_tx_ring *);
448 static void			hn_xmit_txeof_taskfunc(void *, int);
449 #ifdef HN_IFSTART_SUPPORT
450 static int			hn_start_locked(struct hn_tx_ring *, int);
451 static void			hn_start_taskfunc(void *, int);
452 static void			hn_start_txeof(struct hn_tx_ring *);
453 static void			hn_start_txeof_taskfunc(void *, int);
454 #endif
455 
456 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
457     "Hyper-V network interface");
458 
459 /* Trust tcp segements verification on host side. */
460 static int			hn_trust_hosttcp = 1;
461 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
462     &hn_trust_hosttcp, 0,
463     "Trust tcp segement verification on host side, "
464     "when csum info is missing (global setting)");
465 
466 /* Trust udp datagrams verification on host side. */
467 static int			hn_trust_hostudp = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
469     &hn_trust_hostudp, 0,
470     "Trust udp datagram verification on host side, "
471     "when csum info is missing (global setting)");
472 
473 /* Trust ip packets verification on host side. */
474 static int			hn_trust_hostip = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
476     &hn_trust_hostip, 0,
477     "Trust ip packet verification on host side, "
478     "when csum info is missing (global setting)");
479 
480 /*
481  * Offload UDP/IPv4 checksum.
482  */
483 static int			hn_enable_udp4cs = 1;
484 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
485     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
486 
487 /*
488  * Offload UDP/IPv6 checksum.
489  */
490 static int			hn_enable_udp6cs = 1;
491 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
492     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
493 
494 /* Stats. */
495 static counter_u64_t		hn_udpcs_fixup;
496 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
497     &hn_udpcs_fixup, "# of UDP checksum fixup");
498 
499 /*
500  * See hn_set_hlen().
501  *
502  * This value is for Azure.  For Hyper-V, set this above
503  * 65536 to disable UDP datagram checksum fixup.
504  */
505 static int			hn_udpcs_fixup_mtu = 1420;
506 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
507     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
508 
509 /* Limit TSO burst size */
510 static int			hn_tso_maxlen = IP_MAXPACKET;
511 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
512     &hn_tso_maxlen, 0, "TSO burst limit");
513 
514 /* Limit chimney send size */
515 static int			hn_tx_chimney_size = 0;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
517     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
518 
519 /* Limit the size of packet for direct transmission */
520 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
521 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
522     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
523 
524 /* # of LRO entries per RX ring */
525 #if defined(INET) || defined(INET6)
526 #if __FreeBSD_version >= 1100095
527 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
528 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
529     &hn_lro_entry_count, 0, "LRO entry count");
530 #endif
531 #endif
532 
533 static int			hn_tx_taskq_cnt = 1;
534 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
535     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
536 
537 #define HN_TX_TASKQ_M_INDEP	0
538 #define HN_TX_TASKQ_M_GLOBAL	1
539 #define HN_TX_TASKQ_M_EVTTQ	2
540 
541 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
542 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
543     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
544     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
545 
546 #ifndef HN_USE_TXDESC_BUFRING
547 static int			hn_use_txdesc_bufring = 0;
548 #else
549 static int			hn_use_txdesc_bufring = 1;
550 #endif
551 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
552     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
553 
554 #ifdef HN_IFSTART_SUPPORT
555 /* Use ifnet.if_start instead of ifnet.if_transmit */
556 static int			hn_use_if_start = 0;
557 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
558     &hn_use_if_start, 0, "Use if_start TX method");
559 #endif
560 
561 /* # of channels to use */
562 static int			hn_chan_cnt = 0;
563 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
564     &hn_chan_cnt, 0,
565     "# of channels to use; each channel has one RX ring and one TX ring");
566 
567 /* # of transmit rings to use */
568 static int			hn_tx_ring_cnt = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
570     &hn_tx_ring_cnt, 0, "# of TX rings to use");
571 
572 /* Software TX ring deptch */
573 static int			hn_tx_swq_depth = 0;
574 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
575     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
576 
577 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
578 #if __FreeBSD_version >= 1100095
579 static u_int			hn_lro_mbufq_depth = 0;
580 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
581     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
582 #endif
583 
584 /* Packet transmission aggregation size limit */
585 static int			hn_tx_agg_size = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
587     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
588 
589 /* Packet transmission aggregation count limit */
590 static int			hn_tx_agg_pkts = -1;
591 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
592     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
593 
594 /* VF list */
595 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
596     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
597     hn_vflist_sysctl, "A",
598     "VF list");
599 
600 /* VF mapping */
601 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
602     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
603     hn_vfmap_sysctl, "A",
604     "VF mapping");
605 
606 /* Transparent VF */
607 static int			hn_xpnt_vf = 1;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
609     &hn_xpnt_vf, 0, "Transparent VF mod");
610 
611 /* Accurate BPF support for Transparent VF */
612 static int			hn_xpnt_vf_accbpf = 0;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
614     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
615 
616 /* Extra wait for transparent VF attach routing; unit seconds. */
617 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
618 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
619     &hn_xpnt_vf_attwait, 0,
620     "Extra wait for transparent VF attach routing; unit: seconds");
621 
622 static u_int			hn_cpu_index;	/* next CPU for channel */
623 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
624 
625 static struct rmlock		hn_vfmap_lock;
626 static int			hn_vfmap_size;
627 static struct ifnet		**hn_vfmap;
628 
629 #ifndef RSS
630 static const uint8_t
631 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
632 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
633 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
634 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
635 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
636 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
637 };
638 #endif	/* !RSS */
639 
640 static const struct hyperv_guid	hn_guid = {
641 	.hv_guid = {
642 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
643 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
644 };
645 
646 static device_method_t hn_methods[] = {
647 	/* Device interface */
648 	DEVMETHOD(device_probe,		hn_probe),
649 	DEVMETHOD(device_attach,	hn_attach),
650 	DEVMETHOD(device_detach,	hn_detach),
651 	DEVMETHOD(device_shutdown,	hn_shutdown),
652 	DEVMETHOD_END
653 };
654 
655 static driver_t hn_driver = {
656 	"hn",
657 	hn_methods,
658 	sizeof(struct hn_softc)
659 };
660 
661 static devclass_t hn_devclass;
662 
663 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
664 MODULE_VERSION(hn, 1);
665 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
666 
667 #if __FreeBSD_version >= 1100099
668 static void
669 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
670 {
671 	int i;
672 
673 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
674 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
675 }
676 #endif
677 
678 static int
679 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
680 {
681 
682 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
684 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
685 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
686 }
687 
688 static int
689 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
690 {
691 	struct hn_nvs_rndis rndis;
692 
693 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
694 	    txd->chim_size > 0, ("invalid rndis chim txd"));
695 
696 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
697 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
698 	rndis.nvs_chim_idx = txd->chim_index;
699 	rndis.nvs_chim_sz = txd->chim_size;
700 
701 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
702 	    &rndis, sizeof(rndis), &txd->send_ctx));
703 }
704 
705 static __inline uint32_t
706 hn_chim_alloc(struct hn_softc *sc)
707 {
708 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
709 	u_long *bmap = sc->hn_chim_bmap;
710 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
711 
712 	for (i = 0; i < bmap_cnt; ++i) {
713 		int idx;
714 
715 		idx = ffsl(~bmap[i]);
716 		if (idx == 0)
717 			continue;
718 
719 		--idx; /* ffsl is 1-based */
720 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
721 		    ("invalid i %d and idx %d", i, idx));
722 
723 		if (atomic_testandset_long(&bmap[i], idx))
724 			continue;
725 
726 		ret = i * LONG_BIT + idx;
727 		break;
728 	}
729 	return (ret);
730 }
731 
732 static __inline void
733 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
734 {
735 	u_long mask;
736 	uint32_t idx;
737 
738 	idx = chim_idx / LONG_BIT;
739 	KASSERT(idx < sc->hn_chim_bmap_cnt,
740 	    ("invalid chimney index 0x%x", chim_idx));
741 
742 	mask = 1UL << (chim_idx % LONG_BIT);
743 	KASSERT(sc->hn_chim_bmap[idx] & mask,
744 	    ("index bitmap 0x%lx, chimney index %u, "
745 	     "bitmap idx %d, bitmask 0x%lx",
746 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
747 
748 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
749 }
750 
751 #if defined(INET6) || defined(INET)
752 
753 #define PULLUP_HDR(m, len)				\
754 do {							\
755 	if (__predict_false((m)->m_len < (len))) {	\
756 		(m) = m_pullup((m), (len));		\
757 		if ((m) == NULL)			\
758 			return (NULL);			\
759 	}						\
760 } while (0)
761 
762 /*
763  * NOTE: If this function failed, the m_head would be freed.
764  */
765 static __inline struct mbuf *
766 hn_tso_fixup(struct mbuf *m_head)
767 {
768 	struct ether_vlan_header *evl;
769 	struct tcphdr *th;
770 	int ehlen;
771 
772 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
773 
774 	PULLUP_HDR(m_head, sizeof(*evl));
775 	evl = mtod(m_head, struct ether_vlan_header *);
776 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
777 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
778 	else
779 		ehlen = ETHER_HDR_LEN;
780 	m_head->m_pkthdr.l2hlen = ehlen;
781 
782 #ifdef INET
783 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
784 		struct ip *ip;
785 		int iphlen;
786 
787 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
788 		ip = mtodo(m_head, ehlen);
789 		iphlen = ip->ip_hl << 2;
790 		m_head->m_pkthdr.l3hlen = iphlen;
791 
792 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
793 		th = mtodo(m_head, ehlen + iphlen);
794 
795 		ip->ip_len = 0;
796 		ip->ip_sum = 0;
797 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
798 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
799 	}
800 #endif
801 #if defined(INET6) && defined(INET)
802 	else
803 #endif
804 #ifdef INET6
805 	{
806 		struct ip6_hdr *ip6;
807 
808 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
809 		ip6 = mtodo(m_head, ehlen);
810 		if (ip6->ip6_nxt != IPPROTO_TCP) {
811 			m_freem(m_head);
812 			return (NULL);
813 		}
814 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
815 
816 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
817 		th = mtodo(m_head, ehlen + sizeof(*ip6));
818 
819 		ip6->ip6_plen = 0;
820 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
821 	}
822 #endif
823 	return (m_head);
824 }
825 
826 /*
827  * NOTE: If this function failed, the m_head would be freed.
828  */
829 static __inline struct mbuf *
830 hn_set_hlen(struct mbuf *m_head)
831 {
832 	const struct ether_vlan_header *evl;
833 	int ehlen;
834 
835 	PULLUP_HDR(m_head, sizeof(*evl));
836 	evl = mtod(m_head, const struct ether_vlan_header *);
837 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
838 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
839 	else
840 		ehlen = ETHER_HDR_LEN;
841 	m_head->m_pkthdr.l2hlen = ehlen;
842 
843 #ifdef INET
844 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
845 		const struct ip *ip;
846 		int iphlen;
847 
848 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
849 		ip = mtodo(m_head, ehlen);
850 		iphlen = ip->ip_hl << 2;
851 		m_head->m_pkthdr.l3hlen = iphlen;
852 
853 		/*
854 		 * UDP checksum offload does not work in Azure, if the
855 		 * following conditions meet:
856 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
857 		 * - IP_DF is not set in the IP hdr.
858 		 *
859 		 * Fallback to software checksum for these UDP datagrams.
860 		 */
861 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
862 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
863 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
864 			uint16_t off = ehlen + iphlen;
865 
866 			counter_u64_add(hn_udpcs_fixup, 1);
867 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
868 			*(uint16_t *)(m_head->m_data + off +
869                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
870 			    m_head, m_head->m_pkthdr.len, off);
871 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
872 		}
873 	}
874 #endif
875 #if defined(INET6) && defined(INET)
876 	else
877 #endif
878 #ifdef INET6
879 	{
880 		const struct ip6_hdr *ip6;
881 
882 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
883 		ip6 = mtodo(m_head, ehlen);
884 		if (ip6->ip6_nxt != IPPROTO_TCP &&
885 		    ip6->ip6_nxt != IPPROTO_UDP) {
886 			m_freem(m_head);
887 			return (NULL);
888 		}
889 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
890 	}
891 #endif
892 	return (m_head);
893 }
894 
895 /*
896  * NOTE: If this function failed, the m_head would be freed.
897  */
898 static __inline struct mbuf *
899 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
900 {
901 	const struct tcphdr *th;
902 	int ehlen, iphlen;
903 
904 	*tcpsyn = 0;
905 	ehlen = m_head->m_pkthdr.l2hlen;
906 	iphlen = m_head->m_pkthdr.l3hlen;
907 
908 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
909 	th = mtodo(m_head, ehlen + iphlen);
910 	if (th->th_flags & TH_SYN)
911 		*tcpsyn = 1;
912 	return (m_head);
913 }
914 
915 #undef PULLUP_HDR
916 
917 #endif	/* INET6 || INET */
918 
919 static int
920 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
921 {
922 	int error = 0;
923 
924 	HN_LOCK_ASSERT(sc);
925 
926 	if (sc->hn_rx_filter != filter) {
927 		error = hn_rndis_set_rxfilter(sc, filter);
928 		if (!error)
929 			sc->hn_rx_filter = filter;
930 	}
931 	return (error);
932 }
933 
934 static int
935 hn_rxfilter_config(struct hn_softc *sc)
936 {
937 	struct ifnet *ifp = sc->hn_ifp;
938 	uint32_t filter;
939 
940 	HN_LOCK_ASSERT(sc);
941 
942 	/*
943 	 * If the non-transparent mode VF is activated, we don't know how
944 	 * its RX filter is configured, so stick the synthetic device in
945 	 * the promiscous mode.
946 	 */
947 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
948 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
949 	} else {
950 		filter = NDIS_PACKET_TYPE_DIRECTED;
951 		if (ifp->if_flags & IFF_BROADCAST)
952 			filter |= NDIS_PACKET_TYPE_BROADCAST;
953 		/* TODO: support multicast list */
954 		if ((ifp->if_flags & IFF_ALLMULTI) ||
955 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
956 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
957 	}
958 	return (hn_set_rxfilter(sc, filter));
959 }
960 
961 static void
962 hn_set_txagg(struct hn_softc *sc)
963 {
964 	uint32_t size, pkts;
965 	int i;
966 
967 	/*
968 	 * Setup aggregation size.
969 	 */
970 	if (sc->hn_agg_size < 0)
971 		size = UINT32_MAX;
972 	else
973 		size = sc->hn_agg_size;
974 
975 	if (sc->hn_rndis_agg_size < size)
976 		size = sc->hn_rndis_agg_size;
977 
978 	/* NOTE: We only aggregate packets using chimney sending buffers. */
979 	if (size > (uint32_t)sc->hn_chim_szmax)
980 		size = sc->hn_chim_szmax;
981 
982 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
983 		/* Disable */
984 		size = 0;
985 		pkts = 0;
986 		goto done;
987 	}
988 
989 	/* NOTE: Type of the per TX ring setting is 'int'. */
990 	if (size > INT_MAX)
991 		size = INT_MAX;
992 
993 	/*
994 	 * Setup aggregation packet count.
995 	 */
996 	if (sc->hn_agg_pkts < 0)
997 		pkts = UINT32_MAX;
998 	else
999 		pkts = sc->hn_agg_pkts;
1000 
1001 	if (sc->hn_rndis_agg_pkts < pkts)
1002 		pkts = sc->hn_rndis_agg_pkts;
1003 
1004 	if (pkts <= 1) {
1005 		/* Disable */
1006 		size = 0;
1007 		pkts = 0;
1008 		goto done;
1009 	}
1010 
1011 	/* NOTE: Type of the per TX ring setting is 'short'. */
1012 	if (pkts > SHRT_MAX)
1013 		pkts = SHRT_MAX;
1014 
1015 done:
1016 	/* NOTE: Type of the per TX ring setting is 'short'. */
1017 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1018 		/* Disable */
1019 		size = 0;
1020 		pkts = 0;
1021 	}
1022 
1023 	if (bootverbose) {
1024 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1025 		    size, pkts, sc->hn_rndis_agg_align);
1026 	}
1027 
1028 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1029 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1030 
1031 		mtx_lock(&txr->hn_tx_lock);
1032 		txr->hn_agg_szmax = size;
1033 		txr->hn_agg_pktmax = pkts;
1034 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1035 		mtx_unlock(&txr->hn_tx_lock);
1036 	}
1037 }
1038 
1039 static int
1040 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1041 {
1042 
1043 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1044 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1045 		return txr->hn_txdesc_cnt;
1046 	return hn_tx_swq_depth;
1047 }
1048 
1049 static int
1050 hn_rss_reconfig(struct hn_softc *sc)
1051 {
1052 	int error;
1053 
1054 	HN_LOCK_ASSERT(sc);
1055 
1056 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1057 		return (ENXIO);
1058 
1059 	/*
1060 	 * Disable RSS first.
1061 	 *
1062 	 * NOTE:
1063 	 * Direct reconfiguration by setting the UNCHG flags does
1064 	 * _not_ work properly.
1065 	 */
1066 	if (bootverbose)
1067 		if_printf(sc->hn_ifp, "disable RSS\n");
1068 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1069 	if (error) {
1070 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1071 		return (error);
1072 	}
1073 
1074 	/*
1075 	 * Reenable the RSS w/ the updated RSS key or indirect
1076 	 * table.
1077 	 */
1078 	if (bootverbose)
1079 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1080 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1081 	if (error) {
1082 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1083 		return (error);
1084 	}
1085 	return (0);
1086 }
1087 
1088 static void
1089 hn_rss_ind_fixup(struct hn_softc *sc)
1090 {
1091 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1092 	int i, nchan;
1093 
1094 	nchan = sc->hn_rx_ring_inuse;
1095 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1096 
1097 	/*
1098 	 * Check indirect table to make sure that all channels in it
1099 	 * can be used.
1100 	 */
1101 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1102 		if (rss->rss_ind[i] >= nchan) {
1103 			if_printf(sc->hn_ifp,
1104 			    "RSS indirect table %d fixup: %u -> %d\n",
1105 			    i, rss->rss_ind[i], nchan - 1);
1106 			rss->rss_ind[i] = nchan - 1;
1107 		}
1108 	}
1109 }
1110 
1111 static int
1112 hn_ifmedia_upd(struct ifnet *ifp __unused)
1113 {
1114 
1115 	return EOPNOTSUPP;
1116 }
1117 
1118 static void
1119 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1120 {
1121 	struct hn_softc *sc = ifp->if_softc;
1122 
1123 	ifmr->ifm_status = IFM_AVALID;
1124 	ifmr->ifm_active = IFM_ETHER;
1125 
1126 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1127 		ifmr->ifm_active |= IFM_NONE;
1128 		return;
1129 	}
1130 	ifmr->ifm_status |= IFM_ACTIVE;
1131 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1132 }
1133 
1134 static void
1135 hn_rxvf_set_task(void *xarg, int pending __unused)
1136 {
1137 	struct hn_rxvf_setarg *arg = xarg;
1138 
1139 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1140 }
1141 
1142 static void
1143 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1144 {
1145 	struct hn_rx_ring *rxr;
1146 	struct hn_rxvf_setarg arg;
1147 	struct task task;
1148 	int i;
1149 
1150 	HN_LOCK_ASSERT(sc);
1151 
1152 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1153 
1154 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1155 		rxr = &sc->hn_rx_ring[i];
1156 
1157 		if (i < sc->hn_rx_ring_inuse) {
1158 			arg.rxr = rxr;
1159 			arg.vf_ifp = vf_ifp;
1160 			vmbus_chan_run_task(rxr->hn_chan, &task);
1161 		} else {
1162 			rxr->hn_rxvf_ifp = vf_ifp;
1163 		}
1164 	}
1165 }
1166 
1167 static bool
1168 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1169 {
1170 	const struct ifnet *hn_ifp;
1171 
1172 	hn_ifp = sc->hn_ifp;
1173 
1174 	if (ifp == hn_ifp)
1175 		return (false);
1176 
1177 	if (ifp->if_alloctype != IFT_ETHER)
1178 		return (false);
1179 
1180 	/* Ignore lagg/vlan interfaces */
1181 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1182 	    strcmp(ifp->if_dname, "vlan") == 0)
1183 		return (false);
1184 
1185 	/*
1186 	 * During detach events ifp->if_addr might be NULL.
1187 	 * Make sure the bcmp() below doesn't panic on that:
1188 	 */
1189 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1190 		return (false);
1191 
1192 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1193 		return (false);
1194 
1195 	return (true);
1196 }
1197 
1198 static void
1199 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1200 {
1201 	struct ifnet *hn_ifp;
1202 
1203 	HN_LOCK(sc);
1204 
1205 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1206 		goto out;
1207 
1208 	if (!hn_ismyvf(sc, ifp))
1209 		goto out;
1210 	hn_ifp = sc->hn_ifp;
1211 
1212 	if (rxvf) {
1213 		if (sc->hn_flags & HN_FLAG_RXVF)
1214 			goto out;
1215 
1216 		sc->hn_flags |= HN_FLAG_RXVF;
1217 		hn_rxfilter_config(sc);
1218 	} else {
1219 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1220 			goto out;
1221 
1222 		sc->hn_flags &= ~HN_FLAG_RXVF;
1223 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1224 			hn_rxfilter_config(sc);
1225 		else
1226 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1227 	}
1228 
1229 	hn_nvs_set_datapath(sc,
1230 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1231 
1232 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1233 
1234 	if (rxvf) {
1235 		hn_vf_rss_fixup(sc, true);
1236 		hn_suspend_mgmt(sc);
1237 		sc->hn_link_flags &=
1238 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1239 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1240 	} else {
1241 		hn_vf_rss_restore(sc);
1242 		hn_resume_mgmt(sc);
1243 	}
1244 
1245 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1246 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1247 
1248 	if (bootverbose) {
1249 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1250 		    rxvf ? "to" : "from", ifp->if_xname);
1251 	}
1252 out:
1253 	HN_UNLOCK(sc);
1254 }
1255 
1256 static void
1257 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1258 {
1259 
1260 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1261 		return;
1262 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1263 }
1264 
1265 static void
1266 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1267 {
1268 
1269 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1270 }
1271 
1272 static int
1273 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1274 {
1275 	struct ifnet *ifp, *vf_ifp;
1276 	uint64_t tmp;
1277 	int error;
1278 
1279 	HN_LOCK_ASSERT(sc);
1280 	ifp = sc->hn_ifp;
1281 	vf_ifp = sc->hn_vf_ifp;
1282 
1283 	/*
1284 	 * Fix up requested capabilities w/ supported capabilities,
1285 	 * since the supported capabilities could have been changed.
1286 	 */
1287 	ifr->ifr_reqcap &= ifp->if_capabilities;
1288 	/* Pass SIOCSIFCAP to VF. */
1289 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1290 
1291 	/*
1292 	 * NOTE:
1293 	 * The error will be propagated to the callers, however, it
1294 	 * is _not_ useful here.
1295 	 */
1296 
1297 	/*
1298 	 * Merge VF's enabled capabilities.
1299 	 */
1300 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1301 
1302 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1303 	if (ifp->if_capenable & IFCAP_TXCSUM)
1304 		ifp->if_hwassist |= tmp;
1305 	else
1306 		ifp->if_hwassist &= ~tmp;
1307 
1308 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1309 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1310 		ifp->if_hwassist |= tmp;
1311 	else
1312 		ifp->if_hwassist &= ~tmp;
1313 
1314 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1315 	if (ifp->if_capenable & IFCAP_TSO4)
1316 		ifp->if_hwassist |= tmp;
1317 	else
1318 		ifp->if_hwassist &= ~tmp;
1319 
1320 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1321 	if (ifp->if_capenable & IFCAP_TSO6)
1322 		ifp->if_hwassist |= tmp;
1323 	else
1324 		ifp->if_hwassist &= ~tmp;
1325 
1326 	return (error);
1327 }
1328 
1329 static int
1330 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1331 {
1332 	struct ifnet *vf_ifp;
1333 	struct ifreq ifr;
1334 
1335 	HN_LOCK_ASSERT(sc);
1336 	vf_ifp = sc->hn_vf_ifp;
1337 
1338 	memset(&ifr, 0, sizeof(ifr));
1339 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1340 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1341 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1342 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1343 }
1344 
1345 static void
1346 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1347 {
1348 	struct ifnet *ifp = sc->hn_ifp;
1349 	int allmulti = 0;
1350 
1351 	HN_LOCK_ASSERT(sc);
1352 
1353 	/* XXX vlan(4) style mcast addr maintenance */
1354 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1355 		allmulti = IFF_ALLMULTI;
1356 
1357 	/* Always set the VF's if_flags */
1358 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1359 }
1360 
1361 static void
1362 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1363 {
1364 	struct rm_priotracker pt;
1365 	struct ifnet *hn_ifp = NULL;
1366 	struct mbuf *mn;
1367 
1368 	/*
1369 	 * XXX racy, if hn(4) ever detached.
1370 	 */
1371 	rm_rlock(&hn_vfmap_lock, &pt);
1372 	if (vf_ifp->if_index < hn_vfmap_size)
1373 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1374 	rm_runlock(&hn_vfmap_lock, &pt);
1375 
1376 	if (hn_ifp != NULL) {
1377 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1378 			/*
1379 			 * Allow tapping on the VF.
1380 			 */
1381 			ETHER_BPF_MTAP(vf_ifp, mn);
1382 
1383 			/*
1384 			 * Update VF stats.
1385 			 */
1386 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1387 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1388 				    mn->m_pkthdr.len);
1389 			}
1390 			/*
1391 			 * XXX IFCOUNTER_IMCAST
1392 			 * This stat updating is kinda invasive, since it
1393 			 * requires two checks on the mbuf: the length check
1394 			 * and the ethernet header check.  As of this write,
1395 			 * all multicast packets go directly to hn(4), which
1396 			 * makes imcast stat updating in the VF a try in vian.
1397 			 */
1398 
1399 			/*
1400 			 * Fix up rcvif and increase hn(4)'s ipackets.
1401 			 */
1402 			mn->m_pkthdr.rcvif = hn_ifp;
1403 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1404 		}
1405 		/*
1406 		 * Go through hn(4)'s if_input.
1407 		 */
1408 		hn_ifp->if_input(hn_ifp, m);
1409 	} else {
1410 		/*
1411 		 * In the middle of the transition; free this
1412 		 * mbuf chain.
1413 		 */
1414 		while (m != NULL) {
1415 			mn = m->m_nextpkt;
1416 			m->m_nextpkt = NULL;
1417 			m_freem(m);
1418 			m = mn;
1419 		}
1420 	}
1421 }
1422 
1423 static void
1424 hn_mtu_change_fixup(struct hn_softc *sc)
1425 {
1426 	struct ifnet *ifp;
1427 
1428 	HN_LOCK_ASSERT(sc);
1429 	ifp = sc->hn_ifp;
1430 
1431 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1432 #if __FreeBSD_version >= 1100099
1433 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1434 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1435 #endif
1436 }
1437 
1438 static uint32_t
1439 hn_rss_type_fromndis(uint32_t rss_hash)
1440 {
1441 	uint32_t types = 0;
1442 
1443 	if (rss_hash & NDIS_HASH_IPV4)
1444 		types |= RSS_TYPE_IPV4;
1445 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1446 		types |= RSS_TYPE_TCP_IPV4;
1447 	if (rss_hash & NDIS_HASH_IPV6)
1448 		types |= RSS_TYPE_IPV6;
1449 	if (rss_hash & NDIS_HASH_IPV6_EX)
1450 		types |= RSS_TYPE_IPV6_EX;
1451 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1452 		types |= RSS_TYPE_TCP_IPV6;
1453 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1454 		types |= RSS_TYPE_TCP_IPV6_EX;
1455 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1456 		types |= RSS_TYPE_UDP_IPV4;
1457 	return (types);
1458 }
1459 
1460 static uint32_t
1461 hn_rss_type_tondis(uint32_t types)
1462 {
1463 	uint32_t rss_hash = 0;
1464 
1465 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1466 	    ("UDP6 and UDP6EX are not supported"));
1467 
1468 	if (types & RSS_TYPE_IPV4)
1469 		rss_hash |= NDIS_HASH_IPV4;
1470 	if (types & RSS_TYPE_TCP_IPV4)
1471 		rss_hash |= NDIS_HASH_TCP_IPV4;
1472 	if (types & RSS_TYPE_IPV6)
1473 		rss_hash |= NDIS_HASH_IPV6;
1474 	if (types & RSS_TYPE_IPV6_EX)
1475 		rss_hash |= NDIS_HASH_IPV6_EX;
1476 	if (types & RSS_TYPE_TCP_IPV6)
1477 		rss_hash |= NDIS_HASH_TCP_IPV6;
1478 	if (types & RSS_TYPE_TCP_IPV6_EX)
1479 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1480 	if (types & RSS_TYPE_UDP_IPV4)
1481 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1482 	return (rss_hash);
1483 }
1484 
1485 static void
1486 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1487 {
1488 	int i;
1489 
1490 	HN_LOCK_ASSERT(sc);
1491 
1492 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1493 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1494 }
1495 
1496 static void
1497 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1498 {
1499 	struct ifnet *ifp, *vf_ifp;
1500 	struct ifrsshash ifrh;
1501 	struct ifrsskey ifrk;
1502 	int error;
1503 	uint32_t my_types, diff_types, mbuf_types = 0;
1504 
1505 	HN_LOCK_ASSERT(sc);
1506 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1507 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1508 
1509 	if (sc->hn_rx_ring_inuse == 1) {
1510 		/* No RSS on synthetic parts; done. */
1511 		return;
1512 	}
1513 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1514 		/* Synthetic parts do not support Toeplitz; done. */
1515 		return;
1516 	}
1517 
1518 	ifp = sc->hn_ifp;
1519 	vf_ifp = sc->hn_vf_ifp;
1520 
1521 	/*
1522 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1523 	 * supported.
1524 	 */
1525 	memset(&ifrk, 0, sizeof(ifrk));
1526 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1527 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1528 	if (error) {
1529 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1530 		    vf_ifp->if_xname, error);
1531 		goto done;
1532 	}
1533 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1534 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1535 		    vf_ifp->if_xname, ifrk.ifrk_func);
1536 		goto done;
1537 	}
1538 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1539 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1540 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1541 		goto done;
1542 	}
1543 
1544 	/*
1545 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1546 	 */
1547 	memset(&ifrh, 0, sizeof(ifrh));
1548 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1549 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1550 	if (error) {
1551 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1552 		    vf_ifp->if_xname, error);
1553 		goto done;
1554 	}
1555 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1556 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1557 		    vf_ifp->if_xname, ifrh.ifrh_func);
1558 		goto done;
1559 	}
1560 
1561 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1562 	if ((ifrh.ifrh_types & my_types) == 0) {
1563 		/* This disables RSS; ignore it then */
1564 		if_printf(ifp, "%s intersection of RSS types failed.  "
1565 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1566 		    ifrh.ifrh_types, my_types);
1567 		goto done;
1568 	}
1569 
1570 	diff_types = my_types ^ ifrh.ifrh_types;
1571 	my_types &= ifrh.ifrh_types;
1572 	mbuf_types = my_types;
1573 
1574 	/*
1575 	 * Detect RSS hash value/type confliction.
1576 	 *
1577 	 * NOTE:
1578 	 * We don't disable the hash type, but stop delivery the hash
1579 	 * value/type through mbufs on RX path.
1580 	 *
1581 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1582 	 * hash is delivered with type of TCP_IPV4.  This means if
1583 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1584 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1585 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1586 	 * here.
1587 	 */
1588 	if ((my_types & RSS_TYPE_IPV4) &&
1589 	    (diff_types & ifrh.ifrh_types &
1590 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1591 		/* Conflict; disable IPV4 hash type/value delivery. */
1592 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1593 		mbuf_types &= ~RSS_TYPE_IPV4;
1594 	}
1595 	if ((my_types & RSS_TYPE_IPV6) &&
1596 	    (diff_types & ifrh.ifrh_types &
1597 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1598 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1599 	      RSS_TYPE_IPV6_EX))) {
1600 		/* Conflict; disable IPV6 hash type/value delivery. */
1601 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1602 		mbuf_types &= ~RSS_TYPE_IPV6;
1603 	}
1604 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1605 	    (diff_types & ifrh.ifrh_types &
1606 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1607 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1608 	      RSS_TYPE_IPV6))) {
1609 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1610 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1611 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1612 	}
1613 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1614 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1615 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1616 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1617 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1618 	}
1619 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1620 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1621 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1622 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1623 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1624 	}
1625 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1626 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1627 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1628 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1629 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1630 	}
1631 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1632 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1633 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1634 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1635 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1636 	}
1637 
1638 	/*
1639 	 * Indirect table does not matter.
1640 	 */
1641 
1642 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1643 	    hn_rss_type_tondis(my_types);
1644 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1645 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1646 
1647 	if (reconf) {
1648 		error = hn_rss_reconfig(sc);
1649 		if (error) {
1650 			/* XXX roll-back? */
1651 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1652 			/* XXX keep going. */
1653 		}
1654 	}
1655 done:
1656 	/* Hash deliverability for mbufs. */
1657 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1658 }
1659 
1660 static void
1661 hn_vf_rss_restore(struct hn_softc *sc)
1662 {
1663 
1664 	HN_LOCK_ASSERT(sc);
1665 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1666 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1667 
1668 	if (sc->hn_rx_ring_inuse == 1)
1669 		goto done;
1670 
1671 	/*
1672 	 * Restore hash types.  Key does _not_ matter.
1673 	 */
1674 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1675 		int error;
1676 
1677 		sc->hn_rss_hash = sc->hn_rss_hcap;
1678 		error = hn_rss_reconfig(sc);
1679 		if (error) {
1680 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1681 			    error);
1682 			/* XXX keep going. */
1683 		}
1684 	}
1685 done:
1686 	/* Hash deliverability for mbufs. */
1687 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1688 }
1689 
1690 static void
1691 hn_xpnt_vf_setready(struct hn_softc *sc)
1692 {
1693 	struct ifnet *ifp, *vf_ifp;
1694 	struct ifreq ifr;
1695 
1696 	HN_LOCK_ASSERT(sc);
1697 	ifp = sc->hn_ifp;
1698 	vf_ifp = sc->hn_vf_ifp;
1699 
1700 	/*
1701 	 * Mark the VF ready.
1702 	 */
1703 	sc->hn_vf_rdytick = 0;
1704 
1705 	/*
1706 	 * Save information for restoration.
1707 	 */
1708 	sc->hn_saved_caps = ifp->if_capabilities;
1709 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1710 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1711 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1712 
1713 	/*
1714 	 * Intersect supported/enabled capabilities.
1715 	 *
1716 	 * NOTE:
1717 	 * if_hwassist is not changed here.
1718 	 */
1719 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1720 	ifp->if_capenable &= ifp->if_capabilities;
1721 
1722 	/*
1723 	 * Fix TSO settings.
1724 	 */
1725 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1726 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1727 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1728 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1729 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1730 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1731 
1732 	/*
1733 	 * Change VF's enabled capabilities.
1734 	 */
1735 	memset(&ifr, 0, sizeof(ifr));
1736 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1737 	ifr.ifr_reqcap = ifp->if_capenable;
1738 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1739 
1740 	if (ifp->if_mtu != ETHERMTU) {
1741 		int error;
1742 
1743 		/*
1744 		 * Change VF's MTU.
1745 		 */
1746 		memset(&ifr, 0, sizeof(ifr));
1747 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1748 		ifr.ifr_mtu = ifp->if_mtu;
1749 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1750 		if (error) {
1751 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1752 			    vf_ifp->if_xname, ifp->if_mtu);
1753 			if (ifp->if_mtu > ETHERMTU) {
1754 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1755 
1756 				/*
1757 				 * XXX
1758 				 * No need to adjust the synthetic parts' MTU;
1759 				 * failure of the adjustment will cause us
1760 				 * infinite headache.
1761 				 */
1762 				ifp->if_mtu = ETHERMTU;
1763 				hn_mtu_change_fixup(sc);
1764 			}
1765 		}
1766 	}
1767 }
1768 
1769 static bool
1770 hn_xpnt_vf_isready(struct hn_softc *sc)
1771 {
1772 
1773 	HN_LOCK_ASSERT(sc);
1774 
1775 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1776 		return (false);
1777 
1778 	if (sc->hn_vf_rdytick == 0)
1779 		return (true);
1780 
1781 	if (sc->hn_vf_rdytick > ticks)
1782 		return (false);
1783 
1784 	/* Mark VF as ready. */
1785 	hn_xpnt_vf_setready(sc);
1786 	return (true);
1787 }
1788 
1789 static void
1790 hn_xpnt_vf_setenable(struct hn_softc *sc)
1791 {
1792 	int i;
1793 
1794 	HN_LOCK_ASSERT(sc);
1795 
1796 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1797 	rm_wlock(&sc->hn_vf_lock);
1798 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1799 	rm_wunlock(&sc->hn_vf_lock);
1800 
1801 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1802 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1803 }
1804 
1805 static void
1806 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1807 {
1808 	int i;
1809 
1810 	HN_LOCK_ASSERT(sc);
1811 
1812 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1813 	rm_wlock(&sc->hn_vf_lock);
1814 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1815 	if (clear_vf)
1816 		sc->hn_vf_ifp = NULL;
1817 	rm_wunlock(&sc->hn_vf_lock);
1818 
1819 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1820 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1821 }
1822 
1823 static void
1824 hn_xpnt_vf_init(struct hn_softc *sc)
1825 {
1826 	int error;
1827 
1828 	HN_LOCK_ASSERT(sc);
1829 
1830 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1831 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1832 
1833 	if (bootverbose) {
1834 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1835 		    sc->hn_vf_ifp->if_xname);
1836 	}
1837 
1838 	/*
1839 	 * Bring the VF up.
1840 	 */
1841 	hn_xpnt_vf_saveifflags(sc);
1842 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1843 	error = hn_xpnt_vf_iocsetflags(sc);
1844 	if (error) {
1845 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1846 		    sc->hn_vf_ifp->if_xname, error);
1847 		return;
1848 	}
1849 
1850 	/*
1851 	 * NOTE:
1852 	 * Datapath setting must happen _after_ bringing the VF up.
1853 	 */
1854 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1855 
1856 	/*
1857 	 * NOTE:
1858 	 * Fixup RSS related bits _after_ the VF is brought up, since
1859 	 * many VFs generate RSS key during it's initialization.
1860 	 */
1861 	hn_vf_rss_fixup(sc, true);
1862 
1863 	/* Mark transparent mode VF as enabled. */
1864 	hn_xpnt_vf_setenable(sc);
1865 }
1866 
1867 static void
1868 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1869 {
1870 	struct hn_softc *sc = xsc;
1871 
1872 	HN_LOCK(sc);
1873 
1874 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1875 		goto done;
1876 	if (sc->hn_vf_ifp == NULL)
1877 		goto done;
1878 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1879 		goto done;
1880 
1881 	if (sc->hn_vf_rdytick != 0) {
1882 		/* Mark VF as ready. */
1883 		hn_xpnt_vf_setready(sc);
1884 	}
1885 
1886 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1887 		/*
1888 		 * Delayed VF initialization.
1889 		 */
1890 		if (bootverbose) {
1891 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1892 			    sc->hn_vf_ifp->if_xname);
1893 		}
1894 		hn_xpnt_vf_init(sc);
1895 	}
1896 done:
1897 	HN_UNLOCK(sc);
1898 }
1899 
1900 static void
1901 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1902 {
1903 	struct hn_softc *sc = xsc;
1904 
1905 	HN_LOCK(sc);
1906 
1907 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1908 		goto done;
1909 
1910 	if (!hn_ismyvf(sc, ifp))
1911 		goto done;
1912 
1913 	if (sc->hn_vf_ifp != NULL) {
1914 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1915 		    sc->hn_vf_ifp->if_xname);
1916 		goto done;
1917 	}
1918 
1919 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1920 		/*
1921 		 * ifnet.if_start is _not_ supported by transparent
1922 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1923 		 */
1924 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1925 		    "in transparent VF mode.\n", ifp->if_xname);
1926 		goto done;
1927 	}
1928 
1929 	rm_wlock(&hn_vfmap_lock);
1930 
1931 	if (ifp->if_index >= hn_vfmap_size) {
1932 		struct ifnet **newmap;
1933 		int newsize;
1934 
1935 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1936 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1937 		    M_WAITOK | M_ZERO);
1938 
1939 		memcpy(newmap, hn_vfmap,
1940 		    sizeof(struct ifnet *) * hn_vfmap_size);
1941 		free(hn_vfmap, M_DEVBUF);
1942 		hn_vfmap = newmap;
1943 		hn_vfmap_size = newsize;
1944 	}
1945 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1946 	    ("%s: ifindex %d was mapped to %s",
1947 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1948 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1949 
1950 	rm_wunlock(&hn_vfmap_lock);
1951 
1952 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1953 	rm_wlock(&sc->hn_vf_lock);
1954 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1955 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1956 	sc->hn_vf_ifp = ifp;
1957 	rm_wunlock(&sc->hn_vf_lock);
1958 
1959 	if (hn_xpnt_vf) {
1960 		int wait_ticks;
1961 
1962 		/*
1963 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1964 		 * Save vf_ifp's current if_input for later restoration.
1965 		 */
1966 		sc->hn_vf_input = ifp->if_input;
1967 		ifp->if_input = hn_xpnt_vf_input;
1968 
1969 		/*
1970 		 * Stop link status management; use the VF's.
1971 		 */
1972 		hn_suspend_mgmt(sc);
1973 
1974 		/*
1975 		 * Give VF sometime to complete its attach routing.
1976 		 */
1977 		wait_ticks = hn_xpnt_vf_attwait * hz;
1978 		sc->hn_vf_rdytick = ticks + wait_ticks;
1979 
1980 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1981 		    wait_ticks);
1982 	}
1983 done:
1984 	HN_UNLOCK(sc);
1985 }
1986 
1987 static void
1988 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1989 {
1990 	struct hn_softc *sc = xsc;
1991 
1992 	HN_LOCK(sc);
1993 
1994 	if (sc->hn_vf_ifp == NULL)
1995 		goto done;
1996 
1997 	if (!hn_ismyvf(sc, ifp))
1998 		goto done;
1999 
2000 	if (hn_xpnt_vf) {
2001 		/*
2002 		 * Make sure that the delayed initialization is not running.
2003 		 *
2004 		 * NOTE:
2005 		 * - This lock _must_ be released, since the hn_vf_init task
2006 		 *   will try holding this lock.
2007 		 * - It is safe to release this lock here, since the
2008 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2009 		 *
2010 		 * XXX racy, if hn(4) ever detached.
2011 		 */
2012 		HN_UNLOCK(sc);
2013 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2014 		HN_LOCK(sc);
2015 
2016 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2017 		    sc->hn_ifp->if_xname));
2018 		ifp->if_input = sc->hn_vf_input;
2019 		sc->hn_vf_input = NULL;
2020 
2021 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2022 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2023 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2024 
2025 		if (sc->hn_vf_rdytick == 0) {
2026 			/*
2027 			 * The VF was ready; restore some settings.
2028 			 */
2029 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2030 			/*
2031 			 * NOTE:
2032 			 * There is _no_ need to fixup if_capenable and
2033 			 * if_hwassist, since the if_capabilities before
2034 			 * restoration was an intersection of the VF's
2035 			 * if_capabilites and the synthetic device's
2036 			 * if_capabilites.
2037 			 */
2038 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2039 			sc->hn_ifp->if_hw_tsomaxsegcount =
2040 			    sc->hn_saved_tsosegcnt;
2041 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2042 		}
2043 
2044 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2045 			/*
2046 			 * Restore RSS settings.
2047 			 */
2048 			hn_vf_rss_restore(sc);
2049 
2050 			/*
2051 			 * Resume link status management, which was suspended
2052 			 * by hn_ifnet_attevent().
2053 			 */
2054 			hn_resume_mgmt(sc);
2055 		}
2056 	}
2057 
2058 	/* Mark transparent mode VF as disabled. */
2059 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2060 
2061 	rm_wlock(&hn_vfmap_lock);
2062 
2063 	KASSERT(ifp->if_index < hn_vfmap_size,
2064 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2065 	if (hn_vfmap[ifp->if_index] != NULL) {
2066 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2067 		    ("%s: ifindex %d was mapped to %s",
2068 		     ifp->if_xname, ifp->if_index,
2069 		     hn_vfmap[ifp->if_index]->if_xname));
2070 		hn_vfmap[ifp->if_index] = NULL;
2071 	}
2072 
2073 	rm_wunlock(&hn_vfmap_lock);
2074 done:
2075 	HN_UNLOCK(sc);
2076 }
2077 
2078 static void
2079 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2080 {
2081 	struct hn_softc *sc = xsc;
2082 
2083 	if (sc->hn_vf_ifp == ifp)
2084 		if_link_state_change(sc->hn_ifp, link_state);
2085 }
2086 
2087 static int
2088 hn_probe(device_t dev)
2089 {
2090 
2091 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2092 		device_set_desc(dev, "Hyper-V Network Interface");
2093 		return BUS_PROBE_DEFAULT;
2094 	}
2095 	return ENXIO;
2096 }
2097 
2098 static int
2099 hn_attach(device_t dev)
2100 {
2101 	struct hn_softc *sc = device_get_softc(dev);
2102 	struct sysctl_oid_list *child;
2103 	struct sysctl_ctx_list *ctx;
2104 	uint8_t eaddr[ETHER_ADDR_LEN];
2105 	struct ifnet *ifp = NULL;
2106 	int error, ring_cnt, tx_ring_cnt;
2107 	uint32_t mtu;
2108 
2109 	sc->hn_dev = dev;
2110 	sc->hn_prichan = vmbus_get_channel(dev);
2111 	HN_LOCK_INIT(sc);
2112 	rm_init(&sc->hn_vf_lock, "hnvf");
2113 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2114 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2115 
2116 	/*
2117 	 * Initialize these tunables once.
2118 	 */
2119 	sc->hn_agg_size = hn_tx_agg_size;
2120 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2121 
2122 	/*
2123 	 * Setup taskqueue for transmission.
2124 	 */
2125 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2126 		int i;
2127 
2128 		sc->hn_tx_taskqs =
2129 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2130 		    M_DEVBUF, M_WAITOK);
2131 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2132 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2133 			    M_WAITOK, taskqueue_thread_enqueue,
2134 			    &sc->hn_tx_taskqs[i]);
2135 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2136 			    "%s tx%d", device_get_nameunit(dev), i);
2137 		}
2138 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2139 		sc->hn_tx_taskqs = hn_tx_taskque;
2140 	}
2141 
2142 	/*
2143 	 * Setup taskqueue for mangement tasks, e.g. link status.
2144 	 */
2145 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2146 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2147 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2148 	    device_get_nameunit(dev));
2149 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2150 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2151 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2152 	    hn_netchg_status_taskfunc, sc);
2153 
2154 	if (hn_xpnt_vf) {
2155 		/*
2156 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2157 		 */
2158 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2159 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2160 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2161 		    device_get_nameunit(dev));
2162 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2163 		    hn_xpnt_vf_init_taskfunc, sc);
2164 	}
2165 
2166 	/*
2167 	 * Allocate ifnet and setup its name earlier, so that if_printf
2168 	 * can be used by functions, which will be called after
2169 	 * ether_ifattach().
2170 	 */
2171 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2172 	ifp->if_softc = sc;
2173 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2174 
2175 	/*
2176 	 * Initialize ifmedia earlier so that it can be unconditionally
2177 	 * destroyed, if error happened later on.
2178 	 */
2179 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2180 
2181 	/*
2182 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2183 	 * to use (tx_ring_cnt).
2184 	 *
2185 	 * NOTE:
2186 	 * The # of RX rings to use is same as the # of channels to use.
2187 	 */
2188 	ring_cnt = hn_chan_cnt;
2189 	if (ring_cnt <= 0) {
2190 		/* Default */
2191 		ring_cnt = mp_ncpus;
2192 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2193 			ring_cnt = HN_RING_CNT_DEF_MAX;
2194 	} else if (ring_cnt > mp_ncpus) {
2195 		ring_cnt = mp_ncpus;
2196 	}
2197 #ifdef RSS
2198 	if (ring_cnt > rss_getnumbuckets())
2199 		ring_cnt = rss_getnumbuckets();
2200 #endif
2201 
2202 	tx_ring_cnt = hn_tx_ring_cnt;
2203 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2204 		tx_ring_cnt = ring_cnt;
2205 #ifdef HN_IFSTART_SUPPORT
2206 	if (hn_use_if_start) {
2207 		/* ifnet.if_start only needs one TX ring. */
2208 		tx_ring_cnt = 1;
2209 	}
2210 #endif
2211 
2212 	/*
2213 	 * Set the leader CPU for channels.
2214 	 */
2215 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2216 
2217 	/*
2218 	 * Create enough TX/RX rings, even if only limited number of
2219 	 * channels can be allocated.
2220 	 */
2221 	error = hn_create_tx_data(sc, tx_ring_cnt);
2222 	if (error)
2223 		goto failed;
2224 	error = hn_create_rx_data(sc, ring_cnt);
2225 	if (error)
2226 		goto failed;
2227 
2228 	/*
2229 	 * Create transaction context for NVS and RNDIS transactions.
2230 	 */
2231 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2232 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2233 	if (sc->hn_xact == NULL) {
2234 		error = ENXIO;
2235 		goto failed;
2236 	}
2237 
2238 	/*
2239 	 * Install orphan handler for the revocation of this device's
2240 	 * primary channel.
2241 	 *
2242 	 * NOTE:
2243 	 * The processing order is critical here:
2244 	 * Install the orphan handler, _before_ testing whether this
2245 	 * device's primary channel has been revoked or not.
2246 	 */
2247 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2248 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2249 		error = ENXIO;
2250 		goto failed;
2251 	}
2252 
2253 	/*
2254 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2255 	 */
2256 	error = hn_synth_attach(sc, ETHERMTU);
2257 	if (error)
2258 		goto failed;
2259 
2260 	error = hn_rndis_get_eaddr(sc, eaddr);
2261 	if (error)
2262 		goto failed;
2263 
2264 	error = hn_rndis_get_mtu(sc, &mtu);
2265 	if (error)
2266 		mtu = ETHERMTU;
2267 	else if (bootverbose)
2268 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2269 
2270 #if __FreeBSD_version >= 1100099
2271 	if (sc->hn_rx_ring_inuse > 1) {
2272 		/*
2273 		 * Reduce TCP segment aggregation limit for multiple
2274 		 * RX rings to increase ACK timeliness.
2275 		 */
2276 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2277 	}
2278 #endif
2279 
2280 	/*
2281 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2282 	 */
2283 	hn_fixup_tx_data(sc);
2284 	hn_fixup_rx_data(sc);
2285 
2286 	ctx = device_get_sysctl_ctx(dev);
2287 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2288 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2289 	    &sc->hn_nvs_ver, 0, "NVS version");
2290 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2291 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2292 	    hn_ndis_version_sysctl, "A", "NDIS version");
2293 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2294 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2295 	    hn_caps_sysctl, "A", "capabilities");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_hwassist_sysctl, "A", "hwassist");
2299 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2300 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2301 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2302 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2303 	    "max # of TSO segments");
2304 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2305 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2306 	    "max size of TSO segment");
2307 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2308 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2309 	    hn_rxfilter_sysctl, "A", "rxfilter");
2310 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2311 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2312 	    hn_rss_hash_sysctl, "A", "RSS hash");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2314 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2316 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2317 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2318 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2319 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2320 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2321 #ifndef RSS
2322 	/*
2323 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2324 	 */
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2326 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_rss_key_sysctl, "IU", "RSS key");
2328 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2329 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2330 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2331 #endif
2332 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2333 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2334 	    "RNDIS offered packet transmission aggregation size limit");
2335 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2336 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2337 	    "RNDIS offered packet transmission aggregation count limit");
2338 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2339 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2340 	    "RNDIS packet transmission aggregation alignment");
2341 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2342 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2343 	    hn_txagg_size_sysctl, "I",
2344 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2345 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2346 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2347 	    hn_txagg_pkts_sysctl, "I",
2348 	    "Packet transmission aggregation packets, "
2349 	    "0 -- disable, -1 -- auto");
2350 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2351 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2352 	    hn_polling_sysctl, "I",
2353 	    "Polling frequency: [100,1000000], 0 disable polling");
2354 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2355 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2356 	    hn_vf_sysctl, "A", "Virtual Function's name");
2357 	if (!hn_xpnt_vf) {
2358 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2359 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2360 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2361 	} else {
2362 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2363 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2364 		    hn_xpnt_vf_enabled_sysctl, "I",
2365 		    "Transparent VF enabled");
2366 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2367 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2368 		    hn_xpnt_vf_accbpf_sysctl, "I",
2369 		    "Accurate BPF for transparent VF");
2370 	}
2371 
2372 	/*
2373 	 * Setup the ifmedia, which has been initialized earlier.
2374 	 */
2375 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2376 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2377 	/* XXX ifmedia_set really should do this for us */
2378 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2379 
2380 	/*
2381 	 * Setup the ifnet for this interface.
2382 	 */
2383 
2384 	ifp->if_baudrate = IF_Gbps(10);
2385 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2386 	ifp->if_ioctl = hn_ioctl;
2387 	ifp->if_init = hn_init;
2388 #ifdef HN_IFSTART_SUPPORT
2389 	if (hn_use_if_start) {
2390 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2391 
2392 		ifp->if_start = hn_start;
2393 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2394 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2395 		IFQ_SET_READY(&ifp->if_snd);
2396 	} else
2397 #endif
2398 	{
2399 		ifp->if_transmit = hn_transmit;
2400 		ifp->if_qflush = hn_xmit_qflush;
2401 	}
2402 
2403 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2404 #ifdef foo
2405 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2406 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2407 #endif
2408 	if (sc->hn_caps & HN_CAP_VLAN) {
2409 		/* XXX not sure about VLAN_MTU. */
2410 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2411 	}
2412 
2413 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2414 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2415 		ifp->if_capabilities |= IFCAP_TXCSUM;
2416 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2417 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2418 	if (sc->hn_caps & HN_CAP_TSO4) {
2419 		ifp->if_capabilities |= IFCAP_TSO4;
2420 		ifp->if_hwassist |= CSUM_IP_TSO;
2421 	}
2422 	if (sc->hn_caps & HN_CAP_TSO6) {
2423 		ifp->if_capabilities |= IFCAP_TSO6;
2424 		ifp->if_hwassist |= CSUM_IP6_TSO;
2425 	}
2426 
2427 	/* Enable all available capabilities by default. */
2428 	ifp->if_capenable = ifp->if_capabilities;
2429 
2430 	/*
2431 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2432 	 * be enabled through SIOCSIFCAP.
2433 	 */
2434 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2435 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2436 
2437 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2438 		/*
2439 		 * Lock hn_set_tso_maxsize() to simplify its
2440 		 * internal logic.
2441 		 */
2442 		HN_LOCK(sc);
2443 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2444 		HN_UNLOCK(sc);
2445 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2446 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2447 	}
2448 
2449 	ether_ifattach(ifp, eaddr);
2450 
2451 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2452 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2453 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2454 	}
2455 	if (mtu < ETHERMTU) {
2456 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2457 		ifp->if_mtu = mtu;
2458 	}
2459 
2460 	/* Inform the upper layer about the long frame support. */
2461 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2462 
2463 	/*
2464 	 * Kick off link status check.
2465 	 */
2466 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2467 	hn_update_link_status(sc);
2468 
2469 	if (!hn_xpnt_vf) {
2470 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2471 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2472 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2473 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2474 	} else {
2475 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2476 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2477 	}
2478 
2479 	/*
2480 	 * NOTE:
2481 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2482 	 * since interface's LLADDR is needed; interface LLADDR is not
2483 	 * available when ifnet_arrival event is triggered.
2484 	 */
2485 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2486 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2487 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2488 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2489 
2490 	return (0);
2491 failed:
2492 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2493 		hn_synth_detach(sc);
2494 	hn_detach(dev);
2495 	return (error);
2496 }
2497 
2498 static int
2499 hn_detach(device_t dev)
2500 {
2501 	struct hn_softc *sc = device_get_softc(dev);
2502 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2503 
2504 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2505 		/*
2506 		 * In case that the vmbus missed the orphan handler
2507 		 * installation.
2508 		 */
2509 		vmbus_xact_ctx_orphan(sc->hn_xact);
2510 	}
2511 
2512 	if (sc->hn_ifaddr_evthand != NULL)
2513 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2514 	if (sc->hn_ifnet_evthand != NULL)
2515 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2516 	if (sc->hn_ifnet_atthand != NULL) {
2517 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2518 		    sc->hn_ifnet_atthand);
2519 	}
2520 	if (sc->hn_ifnet_dethand != NULL) {
2521 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2522 		    sc->hn_ifnet_dethand);
2523 	}
2524 	if (sc->hn_ifnet_lnkhand != NULL)
2525 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2526 
2527 	vf_ifp = sc->hn_vf_ifp;
2528 	__compiler_membar();
2529 	if (vf_ifp != NULL)
2530 		hn_ifnet_detevent(sc, vf_ifp);
2531 
2532 	if (device_is_attached(dev)) {
2533 		HN_LOCK(sc);
2534 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2535 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2536 				hn_stop(sc, true);
2537 			/*
2538 			 * NOTE:
2539 			 * hn_stop() only suspends data, so managment
2540 			 * stuffs have to be suspended manually here.
2541 			 */
2542 			hn_suspend_mgmt(sc);
2543 			hn_synth_detach(sc);
2544 		}
2545 		HN_UNLOCK(sc);
2546 		ether_ifdetach(ifp);
2547 	}
2548 
2549 	ifmedia_removeall(&sc->hn_media);
2550 	hn_destroy_rx_data(sc);
2551 	hn_destroy_tx_data(sc);
2552 
2553 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2554 		int i;
2555 
2556 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2557 			taskqueue_free(sc->hn_tx_taskqs[i]);
2558 		free(sc->hn_tx_taskqs, M_DEVBUF);
2559 	}
2560 	taskqueue_free(sc->hn_mgmt_taskq0);
2561 	if (sc->hn_vf_taskq != NULL)
2562 		taskqueue_free(sc->hn_vf_taskq);
2563 
2564 	if (sc->hn_xact != NULL) {
2565 		/*
2566 		 * Uninstall the orphan handler _before_ the xact is
2567 		 * destructed.
2568 		 */
2569 		vmbus_chan_unset_orphan(sc->hn_prichan);
2570 		vmbus_xact_ctx_destroy(sc->hn_xact);
2571 	}
2572 
2573 	if_free(ifp);
2574 
2575 	HN_LOCK_DESTROY(sc);
2576 	rm_destroy(&sc->hn_vf_lock);
2577 	return (0);
2578 }
2579 
2580 static int
2581 hn_shutdown(device_t dev)
2582 {
2583 
2584 	return (0);
2585 }
2586 
2587 static void
2588 hn_link_status(struct hn_softc *sc)
2589 {
2590 	uint32_t link_status;
2591 	int error;
2592 
2593 	error = hn_rndis_get_linkstatus(sc, &link_status);
2594 	if (error) {
2595 		/* XXX what to do? */
2596 		return;
2597 	}
2598 
2599 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2600 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2601 	else
2602 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2603 	if_link_state_change(sc->hn_ifp,
2604 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2605 	    LINK_STATE_UP : LINK_STATE_DOWN);
2606 }
2607 
2608 static void
2609 hn_link_taskfunc(void *xsc, int pending __unused)
2610 {
2611 	struct hn_softc *sc = xsc;
2612 
2613 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2614 		return;
2615 	hn_link_status(sc);
2616 }
2617 
2618 static void
2619 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2620 {
2621 	struct hn_softc *sc = xsc;
2622 
2623 	/* Prevent any link status checks from running. */
2624 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2625 
2626 	/*
2627 	 * Fake up a [link down --> link up] state change; 5 seconds
2628 	 * delay is used, which closely simulates miibus reaction
2629 	 * upon link down event.
2630 	 */
2631 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2632 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2633 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2634 	    &sc->hn_netchg_status, 5 * hz);
2635 }
2636 
2637 static void
2638 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2639 {
2640 	struct hn_softc *sc = xsc;
2641 
2642 	/* Re-allow link status checks. */
2643 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2644 	hn_link_status(sc);
2645 }
2646 
2647 static void
2648 hn_update_link_status(struct hn_softc *sc)
2649 {
2650 
2651 	if (sc->hn_mgmt_taskq != NULL)
2652 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2653 }
2654 
2655 static void
2656 hn_change_network(struct hn_softc *sc)
2657 {
2658 
2659 	if (sc->hn_mgmt_taskq != NULL)
2660 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2661 }
2662 
2663 static __inline int
2664 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2665     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2666 {
2667 	struct mbuf *m = *m_head;
2668 	int error;
2669 
2670 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2671 
2672 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2673 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2674 	if (error == EFBIG) {
2675 		struct mbuf *m_new;
2676 
2677 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2678 		if (m_new == NULL)
2679 			return ENOBUFS;
2680 		else
2681 			*m_head = m = m_new;
2682 		txr->hn_tx_collapsed++;
2683 
2684 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2685 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2686 	}
2687 	if (!error) {
2688 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2689 		    BUS_DMASYNC_PREWRITE);
2690 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2691 	}
2692 	return error;
2693 }
2694 
2695 static __inline int
2696 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2697 {
2698 
2699 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2700 	    ("put an onlist txd %#x", txd->flags));
2701 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2702 	    ("put an onagg txd %#x", txd->flags));
2703 
2704 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2705 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2706 		return 0;
2707 
2708 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2709 		struct hn_txdesc *tmp_txd;
2710 
2711 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2712 			int freed;
2713 
2714 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2715 			    ("resursive aggregation on aggregated txdesc"));
2716 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2717 			    ("not aggregated txdesc"));
2718 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2719 			    ("aggregated txdesc uses dmamap"));
2720 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2721 			    ("aggregated txdesc consumes "
2722 			     "chimney sending buffer"));
2723 			KASSERT(tmp_txd->chim_size == 0,
2724 			    ("aggregated txdesc has non-zero "
2725 			     "chimney sending size"));
2726 
2727 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2728 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2729 			freed = hn_txdesc_put(txr, tmp_txd);
2730 			KASSERT(freed, ("failed to free aggregated txdesc"));
2731 		}
2732 	}
2733 
2734 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2735 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2736 		    ("chim txd uses dmamap"));
2737 		hn_chim_free(txr->hn_sc, txd->chim_index);
2738 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2739 		txd->chim_size = 0;
2740 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2741 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2742 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2743 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2744 		    txd->data_dmap);
2745 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2746 	}
2747 
2748 	if (txd->m != NULL) {
2749 		m_freem(txd->m);
2750 		txd->m = NULL;
2751 	}
2752 
2753 	txd->flags |= HN_TXD_FLAG_ONLIST;
2754 #ifndef HN_USE_TXDESC_BUFRING
2755 	mtx_lock_spin(&txr->hn_txlist_spin);
2756 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2757 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2758 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2759 	txr->hn_txdesc_avail++;
2760 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2761 	mtx_unlock_spin(&txr->hn_txlist_spin);
2762 #else	/* HN_USE_TXDESC_BUFRING */
2763 #ifdef HN_DEBUG
2764 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2765 #endif
2766 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2767 #endif	/* !HN_USE_TXDESC_BUFRING */
2768 
2769 	return 1;
2770 }
2771 
2772 static __inline struct hn_txdesc *
2773 hn_txdesc_get(struct hn_tx_ring *txr)
2774 {
2775 	struct hn_txdesc *txd;
2776 
2777 #ifndef HN_USE_TXDESC_BUFRING
2778 	mtx_lock_spin(&txr->hn_txlist_spin);
2779 	txd = SLIST_FIRST(&txr->hn_txlist);
2780 	if (txd != NULL) {
2781 		KASSERT(txr->hn_txdesc_avail > 0,
2782 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2783 		txr->hn_txdesc_avail--;
2784 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2785 	}
2786 	mtx_unlock_spin(&txr->hn_txlist_spin);
2787 #else
2788 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2789 #endif
2790 
2791 	if (txd != NULL) {
2792 #ifdef HN_USE_TXDESC_BUFRING
2793 #ifdef HN_DEBUG
2794 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2795 #endif
2796 #endif	/* HN_USE_TXDESC_BUFRING */
2797 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2798 		    STAILQ_EMPTY(&txd->agg_list) &&
2799 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2800 		    txd->chim_size == 0 &&
2801 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2802 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2803 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2804 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2805 		txd->refs = 1;
2806 	}
2807 	return txd;
2808 }
2809 
2810 static __inline void
2811 hn_txdesc_hold(struct hn_txdesc *txd)
2812 {
2813 
2814 	/* 0->1 transition will never work */
2815 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2816 	atomic_add_int(&txd->refs, 1);
2817 }
2818 
2819 static __inline void
2820 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2821 {
2822 
2823 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2824 	    ("recursive aggregation on aggregating txdesc"));
2825 
2826 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2827 	    ("already aggregated"));
2828 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2829 	    ("recursive aggregation on to-be-aggregated txdesc"));
2830 
2831 	txd->flags |= HN_TXD_FLAG_ONAGG;
2832 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2833 }
2834 
2835 static bool
2836 hn_tx_ring_pending(struct hn_tx_ring *txr)
2837 {
2838 	bool pending = false;
2839 
2840 #ifndef HN_USE_TXDESC_BUFRING
2841 	mtx_lock_spin(&txr->hn_txlist_spin);
2842 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2843 		pending = true;
2844 	mtx_unlock_spin(&txr->hn_txlist_spin);
2845 #else
2846 	if (!buf_ring_full(txr->hn_txdesc_br))
2847 		pending = true;
2848 #endif
2849 	return (pending);
2850 }
2851 
2852 static __inline void
2853 hn_txeof(struct hn_tx_ring *txr)
2854 {
2855 	txr->hn_has_txeof = 0;
2856 	txr->hn_txeof(txr);
2857 }
2858 
2859 static void
2860 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2861     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2862 {
2863 	struct hn_txdesc *txd = sndc->hn_cbarg;
2864 	struct hn_tx_ring *txr;
2865 
2866 	txr = txd->txr;
2867 	KASSERT(txr->hn_chan == chan,
2868 	    ("channel mismatch, on chan%u, should be chan%u",
2869 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2870 
2871 	txr->hn_has_txeof = 1;
2872 	hn_txdesc_put(txr, txd);
2873 
2874 	++txr->hn_txdone_cnt;
2875 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2876 		txr->hn_txdone_cnt = 0;
2877 		if (txr->hn_oactive)
2878 			hn_txeof(txr);
2879 	}
2880 }
2881 
2882 static void
2883 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2884 {
2885 #if defined(INET) || defined(INET6)
2886 	tcp_lro_flush_all(&rxr->hn_lro);
2887 #endif
2888 
2889 	/*
2890 	 * NOTE:
2891 	 * 'txr' could be NULL, if multiple channels and
2892 	 * ifnet.if_start method are enabled.
2893 	 */
2894 	if (txr == NULL || !txr->hn_has_txeof)
2895 		return;
2896 
2897 	txr->hn_txdone_cnt = 0;
2898 	hn_txeof(txr);
2899 }
2900 
2901 static __inline uint32_t
2902 hn_rndis_pktmsg_offset(uint32_t ofs)
2903 {
2904 
2905 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2906 	    ("invalid RNDIS packet msg offset %u", ofs));
2907 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2908 }
2909 
2910 static __inline void *
2911 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2912     size_t pi_dlen, uint32_t pi_type)
2913 {
2914 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2915 	struct rndis_pktinfo *pi;
2916 
2917 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2918 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2919 
2920 	/*
2921 	 * Per-packet-info does not move; it only grows.
2922 	 *
2923 	 * NOTE:
2924 	 * rm_pktinfooffset in this phase counts from the beginning
2925 	 * of rndis_packet_msg.
2926 	 */
2927 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2928 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2929 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2930 	    pkt->rm_pktinfolen);
2931 	pkt->rm_pktinfolen += pi_size;
2932 
2933 	pi->rm_size = pi_size;
2934 	pi->rm_type = pi_type;
2935 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2936 
2937 	return (pi->rm_data);
2938 }
2939 
2940 static __inline int
2941 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2942 {
2943 	struct hn_txdesc *txd;
2944 	struct mbuf *m;
2945 	int error, pkts;
2946 
2947 	txd = txr->hn_agg_txd;
2948 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2949 
2950 	/*
2951 	 * Since hn_txpkt() will reset this temporary stat, save
2952 	 * it now, so that oerrors can be updated properly, if
2953 	 * hn_txpkt() ever fails.
2954 	 */
2955 	pkts = txr->hn_stat_pkts;
2956 
2957 	/*
2958 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2959 	 * failure, save it for later freeing, if hn_txpkt() ever
2960 	 * fails.
2961 	 */
2962 	m = txd->m;
2963 	error = hn_txpkt(ifp, txr, txd);
2964 	if (__predict_false(error)) {
2965 		/* txd is freed, but m is not. */
2966 		m_freem(m);
2967 
2968 		txr->hn_flush_failed++;
2969 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2970 	}
2971 
2972 	/* Reset all aggregation states. */
2973 	txr->hn_agg_txd = NULL;
2974 	txr->hn_agg_szleft = 0;
2975 	txr->hn_agg_pktleft = 0;
2976 	txr->hn_agg_prevpkt = NULL;
2977 
2978 	return (error);
2979 }
2980 
2981 static void *
2982 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2983     int pktsize)
2984 {
2985 	void *chim;
2986 
2987 	if (txr->hn_agg_txd != NULL) {
2988 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2989 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2990 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2991 			int olen;
2992 
2993 			/*
2994 			 * Update the previous RNDIS packet's total length,
2995 			 * it can be increased due to the mandatory alignment
2996 			 * padding for this RNDIS packet.  And update the
2997 			 * aggregating txdesc's chimney sending buffer size
2998 			 * accordingly.
2999 			 *
3000 			 * XXX
3001 			 * Zero-out the padding, as required by the RNDIS spec.
3002 			 */
3003 			olen = pkt->rm_len;
3004 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3005 			agg_txd->chim_size += pkt->rm_len - olen;
3006 
3007 			/* Link this txdesc to the parent. */
3008 			hn_txdesc_agg(agg_txd, txd);
3009 
3010 			chim = (uint8_t *)pkt + pkt->rm_len;
3011 			/* Save the current packet for later fixup. */
3012 			txr->hn_agg_prevpkt = chim;
3013 
3014 			txr->hn_agg_pktleft--;
3015 			txr->hn_agg_szleft -= pktsize;
3016 			if (txr->hn_agg_szleft <=
3017 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3018 				/*
3019 				 * Probably can't aggregate more packets,
3020 				 * flush this aggregating txdesc proactively.
3021 				 */
3022 				txr->hn_agg_pktleft = 0;
3023 			}
3024 			/* Done! */
3025 			return (chim);
3026 		}
3027 		hn_flush_txagg(ifp, txr);
3028 	}
3029 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3030 
3031 	txr->hn_tx_chimney_tried++;
3032 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3033 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3034 		return (NULL);
3035 	txr->hn_tx_chimney++;
3036 
3037 	chim = txr->hn_sc->hn_chim +
3038 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3039 
3040 	if (txr->hn_agg_pktmax > 1 &&
3041 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3042 		txr->hn_agg_txd = txd;
3043 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3044 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3045 		txr->hn_agg_prevpkt = chim;
3046 	}
3047 	return (chim);
3048 }
3049 
3050 /*
3051  * NOTE:
3052  * If this function fails, then both txd and m_head0 will be freed.
3053  */
3054 static int
3055 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3056     struct mbuf **m_head0)
3057 {
3058 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3059 	int error, nsegs, i;
3060 	struct mbuf *m_head = *m_head0;
3061 	struct rndis_packet_msg *pkt;
3062 	uint32_t *pi_data;
3063 	void *chim = NULL;
3064 	int pkt_hlen, pkt_size;
3065 
3066 	pkt = txd->rndis_pkt;
3067 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3068 	if (pkt_size < txr->hn_chim_size) {
3069 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3070 		if (chim != NULL)
3071 			pkt = chim;
3072 	} else {
3073 		if (txr->hn_agg_txd != NULL)
3074 			hn_flush_txagg(ifp, txr);
3075 	}
3076 
3077 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3078 	pkt->rm_len = m_head->m_pkthdr.len;
3079 	pkt->rm_dataoffset = 0;
3080 	pkt->rm_datalen = m_head->m_pkthdr.len;
3081 	pkt->rm_oobdataoffset = 0;
3082 	pkt->rm_oobdatalen = 0;
3083 	pkt->rm_oobdataelements = 0;
3084 	pkt->rm_pktinfooffset = sizeof(*pkt);
3085 	pkt->rm_pktinfolen = 0;
3086 	pkt->rm_vchandle = 0;
3087 	pkt->rm_reserved = 0;
3088 
3089 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3090 		/*
3091 		 * Set the hash value for this packet.
3092 		 */
3093 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3094 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3095 
3096 		if (M_HASHTYPE_ISHASH(m_head))
3097 			/*
3098 			 * The flowid field contains the hash value host
3099 			 * set in the rx queue if it is a ip forwarding pkt.
3100 			 * Set the same hash value so host can send on the
3101 			 * cpu it was received.
3102 			 */
3103 			*pi_data = m_head->m_pkthdr.flowid;
3104 		else
3105 			/*
3106 			 * Otherwise just put the tx queue index.
3107 			 */
3108 			*pi_data = txr->hn_tx_idx;
3109 	}
3110 
3111 	if (m_head->m_flags & M_VLANTAG) {
3112 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3113 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3114 		*pi_data = NDIS_VLAN_INFO_MAKE(
3115 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3116 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3117 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3118 	}
3119 
3120 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3121 #if defined(INET6) || defined(INET)
3122 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3123 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3124 #ifdef INET
3125 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3126 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3127 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3128 			    m_head->m_pkthdr.tso_segsz);
3129 		}
3130 #endif
3131 #if defined(INET6) && defined(INET)
3132 		else
3133 #endif
3134 #ifdef INET6
3135 		{
3136 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3137 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3138 			    m_head->m_pkthdr.tso_segsz);
3139 		}
3140 #endif
3141 #endif	/* INET6 || INET */
3142 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3143 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3144 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3145 		if (m_head->m_pkthdr.csum_flags &
3146 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3147 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3148 		} else {
3149 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3150 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3151 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3152 		}
3153 
3154 		if (m_head->m_pkthdr.csum_flags &
3155 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3156 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3157 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3158 		} else if (m_head->m_pkthdr.csum_flags &
3159 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3160 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3161 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3162 		}
3163 	}
3164 
3165 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3166 	/* Fixup RNDIS packet message total length */
3167 	pkt->rm_len += pkt_hlen;
3168 	/* Convert RNDIS packet message offsets */
3169 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3170 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3171 
3172 	/*
3173 	 * Fast path: Chimney sending.
3174 	 */
3175 	if (chim != NULL) {
3176 		struct hn_txdesc *tgt_txd = txd;
3177 
3178 		if (txr->hn_agg_txd != NULL) {
3179 			tgt_txd = txr->hn_agg_txd;
3180 #ifdef INVARIANTS
3181 			*m_head0 = NULL;
3182 #endif
3183 		}
3184 
3185 		KASSERT(pkt == chim,
3186 		    ("RNDIS pkt not in chimney sending buffer"));
3187 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3188 		    ("chimney sending buffer is not used"));
3189 		tgt_txd->chim_size += pkt->rm_len;
3190 
3191 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3192 		    ((uint8_t *)chim) + pkt_hlen);
3193 
3194 		txr->hn_gpa_cnt = 0;
3195 		txr->hn_sendpkt = hn_txpkt_chim;
3196 		goto done;
3197 	}
3198 
3199 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3200 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3201 	    ("chimney buffer is used"));
3202 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3203 
3204 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3205 	if (__predict_false(error)) {
3206 		int freed;
3207 
3208 		/*
3209 		 * This mbuf is not linked w/ the txd yet, so free it now.
3210 		 */
3211 		m_freem(m_head);
3212 		*m_head0 = NULL;
3213 
3214 		freed = hn_txdesc_put(txr, txd);
3215 		KASSERT(freed != 0,
3216 		    ("fail to free txd upon txdma error"));
3217 
3218 		txr->hn_txdma_failed++;
3219 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3220 		return error;
3221 	}
3222 	*m_head0 = m_head;
3223 
3224 	/* +1 RNDIS packet message */
3225 	txr->hn_gpa_cnt = nsegs + 1;
3226 
3227 	/* send packet with page buffer */
3228 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3229 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3230 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3231 
3232 	/*
3233 	 * Fill the page buffers with mbuf info after the page
3234 	 * buffer for RNDIS packet message.
3235 	 */
3236 	for (i = 0; i < nsegs; ++i) {
3237 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3238 
3239 		gpa->gpa_page = atop(segs[i].ds_addr);
3240 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3241 		gpa->gpa_len = segs[i].ds_len;
3242 	}
3243 
3244 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3245 	txd->chim_size = 0;
3246 	txr->hn_sendpkt = hn_txpkt_sglist;
3247 done:
3248 	txd->m = m_head;
3249 
3250 	/* Set the completion routine */
3251 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3252 
3253 	/* Update temporary stats for later use. */
3254 	txr->hn_stat_pkts++;
3255 	txr->hn_stat_size += m_head->m_pkthdr.len;
3256 	if (m_head->m_flags & M_MCAST)
3257 		txr->hn_stat_mcasts++;
3258 
3259 	return 0;
3260 }
3261 
3262 /*
3263  * NOTE:
3264  * If this function fails, then txd will be freed, but the mbuf
3265  * associated w/ the txd will _not_ be freed.
3266  */
3267 static int
3268 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3269 {
3270 	int error, send_failed = 0, has_bpf;
3271 
3272 again:
3273 	has_bpf = bpf_peers_present(ifp->if_bpf);
3274 	if (has_bpf) {
3275 		/*
3276 		 * Make sure that this txd and any aggregated txds are not
3277 		 * freed before ETHER_BPF_MTAP.
3278 		 */
3279 		hn_txdesc_hold(txd);
3280 	}
3281 	error = txr->hn_sendpkt(txr, txd);
3282 	if (!error) {
3283 		if (has_bpf) {
3284 			const struct hn_txdesc *tmp_txd;
3285 
3286 			ETHER_BPF_MTAP(ifp, txd->m);
3287 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3288 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3289 		}
3290 
3291 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3292 #ifdef HN_IFSTART_SUPPORT
3293 		if (!hn_use_if_start)
3294 #endif
3295 		{
3296 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3297 			    txr->hn_stat_size);
3298 			if (txr->hn_stat_mcasts != 0) {
3299 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3300 				    txr->hn_stat_mcasts);
3301 			}
3302 		}
3303 		txr->hn_pkts += txr->hn_stat_pkts;
3304 		txr->hn_sends++;
3305 	}
3306 	if (has_bpf)
3307 		hn_txdesc_put(txr, txd);
3308 
3309 	if (__predict_false(error)) {
3310 		int freed;
3311 
3312 		/*
3313 		 * This should "really rarely" happen.
3314 		 *
3315 		 * XXX Too many RX to be acked or too many sideband
3316 		 * commands to run?  Ask netvsc_channel_rollup()
3317 		 * to kick start later.
3318 		 */
3319 		txr->hn_has_txeof = 1;
3320 		if (!send_failed) {
3321 			txr->hn_send_failed++;
3322 			send_failed = 1;
3323 			/*
3324 			 * Try sending again after set hn_has_txeof;
3325 			 * in case that we missed the last
3326 			 * netvsc_channel_rollup().
3327 			 */
3328 			goto again;
3329 		}
3330 		if_printf(ifp, "send failed\n");
3331 
3332 		/*
3333 		 * Caller will perform further processing on the
3334 		 * associated mbuf, so don't free it in hn_txdesc_put();
3335 		 * only unload it from the DMA map in hn_txdesc_put(),
3336 		 * if it was loaded.
3337 		 */
3338 		txd->m = NULL;
3339 		freed = hn_txdesc_put(txr, txd);
3340 		KASSERT(freed != 0,
3341 		    ("fail to free txd upon send error"));
3342 
3343 		txr->hn_send_failed++;
3344 	}
3345 
3346 	/* Reset temporary stats, after this sending is done. */
3347 	txr->hn_stat_size = 0;
3348 	txr->hn_stat_pkts = 0;
3349 	txr->hn_stat_mcasts = 0;
3350 
3351 	return (error);
3352 }
3353 
3354 /*
3355  * Append the specified data to the indicated mbuf chain,
3356  * Extend the mbuf chain if the new data does not fit in
3357  * existing space.
3358  *
3359  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3360  * There should be an equivalent in the kernel mbuf code,
3361  * but there does not appear to be one yet.
3362  *
3363  * Differs from m_append() in that additional mbufs are
3364  * allocated with cluster size MJUMPAGESIZE, and filled
3365  * accordingly.
3366  *
3367  * Return the last mbuf in the chain or NULL if failed to
3368  * allocate new mbuf.
3369  */
3370 static struct mbuf *
3371 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3372 {
3373 	struct mbuf *m, *n;
3374 	int remainder, space;
3375 
3376 	for (m = m0; m->m_next != NULL; m = m->m_next)
3377 		;
3378 	remainder = len;
3379 	space = M_TRAILINGSPACE(m);
3380 	if (space > 0) {
3381 		/*
3382 		 * Copy into available space.
3383 		 */
3384 		if (space > remainder)
3385 			space = remainder;
3386 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3387 		m->m_len += space;
3388 		cp += space;
3389 		remainder -= space;
3390 	}
3391 	while (remainder > 0) {
3392 		/*
3393 		 * Allocate a new mbuf; could check space
3394 		 * and allocate a cluster instead.
3395 		 */
3396 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3397 		if (n == NULL)
3398 			return NULL;
3399 		n->m_len = min(MJUMPAGESIZE, remainder);
3400 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3401 		cp += n->m_len;
3402 		remainder -= n->m_len;
3403 		m->m_next = n;
3404 		m = n;
3405 	}
3406 
3407 	return m;
3408 }
3409 
3410 #if defined(INET) || defined(INET6)
3411 static __inline int
3412 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3413 {
3414 #if __FreeBSD_version >= 1100095
3415 	if (hn_lro_mbufq_depth) {
3416 		tcp_lro_queue_mbuf(lc, m);
3417 		return 0;
3418 	}
3419 #endif
3420 	return tcp_lro_rx(lc, m, 0);
3421 }
3422 #endif
3423 
3424 static int
3425 hn_rxpkt(struct hn_rx_ring *rxr)
3426 {
3427 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3428 	struct mbuf *m_new, *n;
3429 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3430 	int hash_type = M_HASHTYPE_NONE;
3431 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3432 	int i;
3433 
3434 	ifp = hn_ifp;
3435 	if (rxr->hn_rxvf_ifp != NULL) {
3436 		/*
3437 		 * Non-transparent mode VF; pretend this packet is from
3438 		 * the VF.
3439 		 */
3440 		ifp = rxr->hn_rxvf_ifp;
3441 		is_vf = 1;
3442 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3443 		/* Transparent mode VF. */
3444 		is_vf = 1;
3445 	}
3446 
3447 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3448 		/*
3449 		 * NOTE:
3450 		 * See the NOTE of hn_rndis_init_fixat().  This
3451 		 * function can be reached, immediately after the
3452 		 * RNDIS is initialized but before the ifnet is
3453 		 * setup on the hn_attach() path; drop the unexpected
3454 		 * packets.
3455 		 */
3456 		return (0);
3457 	}
3458 
3459 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3460 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3461 		return (0);
3462 	}
3463 
3464 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3465 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3466 		if (m_new == NULL) {
3467 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3468 			return (0);
3469 		}
3470 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3471 		    rxr->rsc.frag_len[0]);
3472 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3473 	} else {
3474 		/*
3475 		 * Get an mbuf with a cluster.  For packets 2K or less,
3476 		 * get a standard 2K cluster.  For anything larger, get a
3477 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3478 		 * if looped around to the Hyper-V TX channel, so avoid them.
3479 		 */
3480 		size = MCLBYTES;
3481 		if (rxr->rsc.pktlen > MCLBYTES) {
3482 			/* 4096 */
3483 			size = MJUMPAGESIZE;
3484 		}
3485 
3486 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3487 		if (m_new == NULL) {
3488 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3489 			return (0);
3490 		}
3491 
3492 		n = m_new;
3493 		for (i = 0; i < rxr->rsc.cnt; i++) {
3494 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3495 			    rxr->rsc.frag_data[i]);
3496 			if (n == NULL) {
3497 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3498 				return (0);
3499 			} else {
3500 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3501 			}
3502 		}
3503 	}
3504 	if (rxr->rsc.pktlen <= MHLEN)
3505 		rxr->hn_small_pkts++;
3506 
3507 	m_new->m_pkthdr.rcvif = ifp;
3508 
3509 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3510 		do_csum = 0;
3511 
3512 	/* receive side checksum offload */
3513 	if (rxr->rsc.csum_info != NULL) {
3514 		/* IP csum offload */
3515 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3516 			m_new->m_pkthdr.csum_flags |=
3517 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3518 			rxr->hn_csum_ip++;
3519 		}
3520 
3521 		/* TCP/UDP csum offload */
3522 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3523 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3524 			m_new->m_pkthdr.csum_flags |=
3525 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3526 			m_new->m_pkthdr.csum_data = 0xffff;
3527 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3528 				rxr->hn_csum_tcp++;
3529 			else
3530 				rxr->hn_csum_udp++;
3531 		}
3532 
3533 		/*
3534 		 * XXX
3535 		 * As of this write (Oct 28th, 2016), host side will turn
3536 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3537 		 * the do_lro setting here is actually _not_ accurate.  We
3538 		 * depend on the RSS hash type check to reset do_lro.
3539 		 */
3540 		if ((*(rxr->rsc.csum_info) &
3541 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3542 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3543 			do_lro = 1;
3544 	} else {
3545 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3546 		if (l3proto == ETHERTYPE_IP) {
3547 			if (l4proto == IPPROTO_TCP) {
3548 				if (do_csum &&
3549 				    (rxr->hn_trust_hcsum &
3550 				     HN_TRUST_HCSUM_TCP)) {
3551 					rxr->hn_csum_trusted++;
3552 					m_new->m_pkthdr.csum_flags |=
3553 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3554 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3555 					m_new->m_pkthdr.csum_data = 0xffff;
3556 				}
3557 				do_lro = 1;
3558 			} else if (l4proto == IPPROTO_UDP) {
3559 				if (do_csum &&
3560 				    (rxr->hn_trust_hcsum &
3561 				     HN_TRUST_HCSUM_UDP)) {
3562 					rxr->hn_csum_trusted++;
3563 					m_new->m_pkthdr.csum_flags |=
3564 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3565 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3566 					m_new->m_pkthdr.csum_data = 0xffff;
3567 				}
3568 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3569 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3570 				rxr->hn_csum_trusted++;
3571 				m_new->m_pkthdr.csum_flags |=
3572 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3573 			}
3574 		}
3575 	}
3576 
3577 	if (rxr->rsc.vlan_info != NULL) {
3578 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3579 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3580 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3581 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3582 		m_new->m_flags |= M_VLANTAG;
3583 	}
3584 
3585 	/*
3586 	 * If VF is activated (tranparent/non-transparent mode does not
3587 	 * matter here).
3588 	 *
3589 	 * - Disable LRO
3590 	 *
3591 	 *   hn(4) will only receive broadcast packets, multicast packets,
3592 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3593 	 *   packet types.
3594 	 *
3595 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3596 	 *   all, since the LRO flush will use hn(4) as the receiving
3597 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3598 	 */
3599 	if (is_vf)
3600 		do_lro = 0;
3601 
3602 	/*
3603 	 * If VF is activated (tranparent/non-transparent mode does not
3604 	 * matter here), do _not_ mess with unsupported hash types or
3605 	 * functions.
3606 	 */
3607 	if (rxr->rsc.hash_info != NULL) {
3608 		rxr->hn_rss_pkts++;
3609 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3610 		if (!is_vf)
3611 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3612 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3613 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3614 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3615 			    rxr->hn_mbuf_hash);
3616 
3617 			/*
3618 			 * NOTE:
3619 			 * do_lro is resetted, if the hash types are not TCP
3620 			 * related.  See the comment in the above csum_flags
3621 			 * setup section.
3622 			 */
3623 			switch (type) {
3624 			case NDIS_HASH_IPV4:
3625 				hash_type = M_HASHTYPE_RSS_IPV4;
3626 				do_lro = 0;
3627 				break;
3628 
3629 			case NDIS_HASH_TCP_IPV4:
3630 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3631 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3632 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3633 
3634 					if (is_vf)
3635 						def_htype = M_HASHTYPE_NONE;
3636 
3637 					/*
3638 					 * UDP 4-tuple hash is delivered as
3639 					 * TCP 4-tuple hash.
3640 					 */
3641 					if (l3proto == ETHERTYPE_MAX) {
3642 						hn_rxpkt_proto(m_new,
3643 						    &l3proto, &l4proto);
3644 					}
3645 					if (l3proto == ETHERTYPE_IP) {
3646 						if (l4proto == IPPROTO_UDP &&
3647 						    (rxr->hn_mbuf_hash &
3648 						     NDIS_HASH_UDP_IPV4_X)) {
3649 							hash_type =
3650 							M_HASHTYPE_RSS_UDP_IPV4;
3651 							do_lro = 0;
3652 						} else if (l4proto !=
3653 						    IPPROTO_TCP) {
3654 							hash_type = def_htype;
3655 							do_lro = 0;
3656 						}
3657 					} else {
3658 						hash_type = def_htype;
3659 						do_lro = 0;
3660 					}
3661 				}
3662 				break;
3663 
3664 			case NDIS_HASH_IPV6:
3665 				hash_type = M_HASHTYPE_RSS_IPV6;
3666 				do_lro = 0;
3667 				break;
3668 
3669 			case NDIS_HASH_IPV6_EX:
3670 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3671 				do_lro = 0;
3672 				break;
3673 
3674 			case NDIS_HASH_TCP_IPV6:
3675 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3676 				break;
3677 
3678 			case NDIS_HASH_TCP_IPV6_EX:
3679 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3680 				break;
3681 			}
3682 		}
3683 	} else if (!is_vf) {
3684 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3685 		hash_type = M_HASHTYPE_OPAQUE;
3686 	}
3687 	M_HASHTYPE_SET(m_new, hash_type);
3688 
3689 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3690 	if (hn_ifp != ifp) {
3691 		const struct ether_header *eh;
3692 
3693 		/*
3694 		 * Non-transparent mode VF is activated.
3695 		 */
3696 
3697 		/*
3698 		 * Allow tapping on hn(4).
3699 		 */
3700 		ETHER_BPF_MTAP(hn_ifp, m_new);
3701 
3702 		/*
3703 		 * Update hn(4)'s stats.
3704 		 */
3705 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3706 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3707 		/* Checked at the beginning of this function. */
3708 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3709 		eh = mtod(m_new, struct ether_header *);
3710 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3711 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3712 	}
3713 	rxr->hn_pkts++;
3714 
3715 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3716 #if defined(INET) || defined(INET6)
3717 		struct lro_ctrl *lro = &rxr->hn_lro;
3718 
3719 		if (lro->lro_cnt) {
3720 			rxr->hn_lro_tried++;
3721 			if (hn_lro_rx(lro, m_new) == 0) {
3722 				/* DONE! */
3723 				return 0;
3724 			}
3725 		}
3726 #endif
3727 	}
3728 	ifp->if_input(ifp, m_new);
3729 
3730 	return (0);
3731 }
3732 
3733 static int
3734 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3735 {
3736 	struct hn_softc *sc = ifp->if_softc;
3737 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3738 	struct ifnet *vf_ifp;
3739 	int mask, error = 0;
3740 	struct ifrsskey *ifrk;
3741 	struct ifrsshash *ifrh;
3742 	uint32_t mtu;
3743 
3744 	switch (cmd) {
3745 	case SIOCSIFMTU:
3746 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3747 			error = EINVAL;
3748 			break;
3749 		}
3750 
3751 		HN_LOCK(sc);
3752 
3753 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3754 			HN_UNLOCK(sc);
3755 			break;
3756 		}
3757 
3758 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3759 			/* Can't change MTU */
3760 			HN_UNLOCK(sc);
3761 			error = EOPNOTSUPP;
3762 			break;
3763 		}
3764 
3765 		if (ifp->if_mtu == ifr->ifr_mtu) {
3766 			HN_UNLOCK(sc);
3767 			break;
3768 		}
3769 
3770 		if (hn_xpnt_vf_isready(sc)) {
3771 			vf_ifp = sc->hn_vf_ifp;
3772 			ifr_vf = *ifr;
3773 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3774 			    sizeof(ifr_vf.ifr_name));
3775 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3776 			    (caddr_t)&ifr_vf);
3777 			if (error) {
3778 				HN_UNLOCK(sc);
3779 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3780 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3781 				break;
3782 			}
3783 		}
3784 
3785 		/*
3786 		 * Suspend this interface before the synthetic parts
3787 		 * are ripped.
3788 		 */
3789 		hn_suspend(sc);
3790 
3791 		/*
3792 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3793 		 */
3794 		hn_synth_detach(sc);
3795 
3796 		/*
3797 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3798 		 * with the new MTU setting.
3799 		 */
3800 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3801 		if (error) {
3802 			HN_UNLOCK(sc);
3803 			break;
3804 		}
3805 
3806 		error = hn_rndis_get_mtu(sc, &mtu);
3807 		if (error)
3808 			mtu = ifr->ifr_mtu;
3809 		else if (bootverbose)
3810 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3811 
3812 		/*
3813 		 * Commit the requested MTU, after the synthetic parts
3814 		 * have been successfully attached.
3815 		 */
3816 		if (mtu >= ifr->ifr_mtu) {
3817 			mtu = ifr->ifr_mtu;
3818 		} else {
3819 			if_printf(ifp, "fixup mtu %d -> %u\n",
3820 			    ifr->ifr_mtu, mtu);
3821 		}
3822 		ifp->if_mtu = mtu;
3823 
3824 		/*
3825 		 * Synthetic parts' reattach may change the chimney
3826 		 * sending size; update it.
3827 		 */
3828 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3829 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3830 
3831 		/*
3832 		 * Make sure that various parameters based on MTU are
3833 		 * still valid, after the MTU change.
3834 		 */
3835 		hn_mtu_change_fixup(sc);
3836 
3837 		/*
3838 		 * All done!  Resume the interface now.
3839 		 */
3840 		hn_resume(sc);
3841 
3842 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3843 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3844 			/*
3845 			 * Since we have reattached the NVS part,
3846 			 * change the datapath to VF again; in case
3847 			 * that it is lost, after the NVS was detached.
3848 			 */
3849 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3850 		}
3851 
3852 		HN_UNLOCK(sc);
3853 		break;
3854 
3855 	case SIOCSIFFLAGS:
3856 		HN_LOCK(sc);
3857 
3858 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3859 			HN_UNLOCK(sc);
3860 			break;
3861 		}
3862 
3863 		if (hn_xpnt_vf_isready(sc))
3864 			hn_xpnt_vf_saveifflags(sc);
3865 
3866 		if (ifp->if_flags & IFF_UP) {
3867 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3868 				/*
3869 				 * Caller meight hold mutex, e.g.
3870 				 * bpf; use busy-wait for the RNDIS
3871 				 * reply.
3872 				 */
3873 				HN_NO_SLEEPING(sc);
3874 				hn_rxfilter_config(sc);
3875 				HN_SLEEPING_OK(sc);
3876 
3877 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3878 					error = hn_xpnt_vf_iocsetflags(sc);
3879 			} else {
3880 				hn_init_locked(sc);
3881 			}
3882 		} else {
3883 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3884 				hn_stop(sc, false);
3885 		}
3886 		sc->hn_if_flags = ifp->if_flags;
3887 
3888 		HN_UNLOCK(sc);
3889 		break;
3890 
3891 	case SIOCSIFCAP:
3892 		HN_LOCK(sc);
3893 
3894 		if (hn_xpnt_vf_isready(sc)) {
3895 			ifr_vf = *ifr;
3896 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3897 			    sizeof(ifr_vf.ifr_name));
3898 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3899 			HN_UNLOCK(sc);
3900 			break;
3901 		}
3902 
3903 		/*
3904 		 * Fix up requested capabilities w/ supported capabilities,
3905 		 * since the supported capabilities could have been changed.
3906 		 */
3907 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3908 		    ifp->if_capenable;
3909 
3910 		if (mask & IFCAP_TXCSUM) {
3911 			ifp->if_capenable ^= IFCAP_TXCSUM;
3912 			if (ifp->if_capenable & IFCAP_TXCSUM)
3913 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3914 			else
3915 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3916 		}
3917 		if (mask & IFCAP_TXCSUM_IPV6) {
3918 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3919 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3920 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3921 			else
3922 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3923 		}
3924 
3925 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3926 		if (mask & IFCAP_RXCSUM)
3927 			ifp->if_capenable ^= IFCAP_RXCSUM;
3928 #ifdef foo
3929 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3930 		if (mask & IFCAP_RXCSUM_IPV6)
3931 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3932 #endif
3933 
3934 		if (mask & IFCAP_LRO)
3935 			ifp->if_capenable ^= IFCAP_LRO;
3936 
3937 		if (mask & IFCAP_TSO4) {
3938 			ifp->if_capenable ^= IFCAP_TSO4;
3939 			if (ifp->if_capenable & IFCAP_TSO4)
3940 				ifp->if_hwassist |= CSUM_IP_TSO;
3941 			else
3942 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3943 		}
3944 		if (mask & IFCAP_TSO6) {
3945 			ifp->if_capenable ^= IFCAP_TSO6;
3946 			if (ifp->if_capenable & IFCAP_TSO6)
3947 				ifp->if_hwassist |= CSUM_IP6_TSO;
3948 			else
3949 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3950 		}
3951 
3952 		HN_UNLOCK(sc);
3953 		break;
3954 
3955 	case SIOCADDMULTI:
3956 	case SIOCDELMULTI:
3957 		HN_LOCK(sc);
3958 
3959 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3960 			HN_UNLOCK(sc);
3961 			break;
3962 		}
3963 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3964 			/*
3965 			 * Multicast uses mutex; use busy-wait for
3966 			 * the RNDIS reply.
3967 			 */
3968 			HN_NO_SLEEPING(sc);
3969 			hn_rxfilter_config(sc);
3970 			HN_SLEEPING_OK(sc);
3971 		}
3972 
3973 		/* XXX vlan(4) style mcast addr maintenance */
3974 		if (hn_xpnt_vf_isready(sc)) {
3975 			int old_if_flags;
3976 
3977 			old_if_flags = sc->hn_vf_ifp->if_flags;
3978 			hn_xpnt_vf_saveifflags(sc);
3979 
3980 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3981 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3982 			     IFF_ALLMULTI))
3983 				error = hn_xpnt_vf_iocsetflags(sc);
3984 		}
3985 
3986 		HN_UNLOCK(sc);
3987 		break;
3988 
3989 	case SIOCSIFMEDIA:
3990 	case SIOCGIFMEDIA:
3991 		HN_LOCK(sc);
3992 		if (hn_xpnt_vf_isready(sc)) {
3993 			/*
3994 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3995 			 * create and pass ifr_vf to the VF here; just
3996 			 * replace the ifr_name.
3997 			 */
3998 			vf_ifp = sc->hn_vf_ifp;
3999 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
4000 			    sizeof(ifr->ifr_name));
4001 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
4002 			/* Restore the ifr_name. */
4003 			strlcpy(ifr->ifr_name, ifp->if_xname,
4004 			    sizeof(ifr->ifr_name));
4005 			HN_UNLOCK(sc);
4006 			break;
4007 		}
4008 		HN_UNLOCK(sc);
4009 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4010 		break;
4011 
4012 	case SIOCGIFRSSHASH:
4013 		ifrh = (struct ifrsshash *)data;
4014 		HN_LOCK(sc);
4015 		if (sc->hn_rx_ring_inuse == 1) {
4016 			HN_UNLOCK(sc);
4017 			ifrh->ifrh_func = RSS_FUNC_NONE;
4018 			ifrh->ifrh_types = 0;
4019 			break;
4020 		}
4021 
4022 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4023 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4024 		else
4025 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4026 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4027 		HN_UNLOCK(sc);
4028 		break;
4029 
4030 	case SIOCGIFRSSKEY:
4031 		ifrk = (struct ifrsskey *)data;
4032 		HN_LOCK(sc);
4033 		if (sc->hn_rx_ring_inuse == 1) {
4034 			HN_UNLOCK(sc);
4035 			ifrk->ifrk_func = RSS_FUNC_NONE;
4036 			ifrk->ifrk_keylen = 0;
4037 			break;
4038 		}
4039 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4040 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4041 		else
4042 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4043 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4044 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4045 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4046 		HN_UNLOCK(sc);
4047 		break;
4048 
4049 	default:
4050 		error = ether_ioctl(ifp, cmd, data);
4051 		break;
4052 	}
4053 	return (error);
4054 }
4055 
4056 static void
4057 hn_stop(struct hn_softc *sc, bool detaching)
4058 {
4059 	struct ifnet *ifp = sc->hn_ifp;
4060 	int i;
4061 
4062 	HN_LOCK_ASSERT(sc);
4063 
4064 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4065 	    ("synthetic parts were not attached"));
4066 
4067 	/* Clear RUNNING bit ASAP. */
4068 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4069 
4070 	/* Disable polling. */
4071 	hn_polling(sc, 0);
4072 
4073 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4074 		KASSERT(sc->hn_vf_ifp != NULL,
4075 		    ("%s: VF is not attached", ifp->if_xname));
4076 
4077 		/* Mark transparent mode VF as disabled. */
4078 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4079 
4080 		/*
4081 		 * NOTE:
4082 		 * Datapath setting must happen _before_ bringing
4083 		 * the VF down.
4084 		 */
4085 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4086 
4087 		/*
4088 		 * Bring the VF down.
4089 		 */
4090 		hn_xpnt_vf_saveifflags(sc);
4091 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4092 		hn_xpnt_vf_iocsetflags(sc);
4093 	}
4094 
4095 	/* Suspend data transfers. */
4096 	hn_suspend_data(sc);
4097 
4098 	/* Clear OACTIVE bit. */
4099 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4100 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4101 		sc->hn_tx_ring[i].hn_oactive = 0;
4102 
4103 	/*
4104 	 * If the non-transparent mode VF is active, make sure
4105 	 * that the RX filter still allows packet reception.
4106 	 */
4107 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4108 		hn_rxfilter_config(sc);
4109 }
4110 
4111 static void
4112 hn_init_locked(struct hn_softc *sc)
4113 {
4114 	struct ifnet *ifp = sc->hn_ifp;
4115 	int i;
4116 
4117 	HN_LOCK_ASSERT(sc);
4118 
4119 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4120 		return;
4121 
4122 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4123 		return;
4124 
4125 	/* Configure RX filter */
4126 	hn_rxfilter_config(sc);
4127 
4128 	/* Clear OACTIVE bit. */
4129 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4130 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4131 		sc->hn_tx_ring[i].hn_oactive = 0;
4132 
4133 	/* Clear TX 'suspended' bit. */
4134 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4135 
4136 	if (hn_xpnt_vf_isready(sc)) {
4137 		/* Initialize transparent VF. */
4138 		hn_xpnt_vf_init(sc);
4139 	}
4140 
4141 	/* Everything is ready; unleash! */
4142 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4143 
4144 	/* Re-enable polling if requested. */
4145 	if (sc->hn_pollhz > 0)
4146 		hn_polling(sc, sc->hn_pollhz);
4147 }
4148 
4149 static void
4150 hn_init(void *xsc)
4151 {
4152 	struct hn_softc *sc = xsc;
4153 
4154 	HN_LOCK(sc);
4155 	hn_init_locked(sc);
4156 	HN_UNLOCK(sc);
4157 }
4158 
4159 #if __FreeBSD_version >= 1100099
4160 
4161 static int
4162 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4163 {
4164 	struct hn_softc *sc = arg1;
4165 	unsigned int lenlim;
4166 	int error;
4167 
4168 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4169 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4170 	if (error || req->newptr == NULL)
4171 		return error;
4172 
4173 	HN_LOCK(sc);
4174 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4175 	    lenlim > TCP_LRO_LENGTH_MAX) {
4176 		HN_UNLOCK(sc);
4177 		return EINVAL;
4178 	}
4179 	hn_set_lro_lenlim(sc, lenlim);
4180 	HN_UNLOCK(sc);
4181 
4182 	return 0;
4183 }
4184 
4185 static int
4186 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4187 {
4188 	struct hn_softc *sc = arg1;
4189 	int ackcnt, error, i;
4190 
4191 	/*
4192 	 * lro_ackcnt_lim is append count limit,
4193 	 * +1 to turn it into aggregation limit.
4194 	 */
4195 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4196 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4197 	if (error || req->newptr == NULL)
4198 		return error;
4199 
4200 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4201 		return EINVAL;
4202 
4203 	/*
4204 	 * Convert aggregation limit back to append
4205 	 * count limit.
4206 	 */
4207 	--ackcnt;
4208 	HN_LOCK(sc);
4209 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4210 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4211 	HN_UNLOCK(sc);
4212 	return 0;
4213 }
4214 
4215 #endif
4216 
4217 static int
4218 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4219 {
4220 	struct hn_softc *sc = arg1;
4221 	int hcsum = arg2;
4222 	int on, error, i;
4223 
4224 	on = 0;
4225 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4226 		on = 1;
4227 
4228 	error = sysctl_handle_int(oidp, &on, 0, req);
4229 	if (error || req->newptr == NULL)
4230 		return error;
4231 
4232 	HN_LOCK(sc);
4233 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4234 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4235 
4236 		if (on)
4237 			rxr->hn_trust_hcsum |= hcsum;
4238 		else
4239 			rxr->hn_trust_hcsum &= ~hcsum;
4240 	}
4241 	HN_UNLOCK(sc);
4242 	return 0;
4243 }
4244 
4245 static int
4246 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4247 {
4248 	struct hn_softc *sc = arg1;
4249 	int chim_size, error;
4250 
4251 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4252 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4253 	if (error || req->newptr == NULL)
4254 		return error;
4255 
4256 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4257 		return EINVAL;
4258 
4259 	HN_LOCK(sc);
4260 	hn_set_chim_size(sc, chim_size);
4261 	HN_UNLOCK(sc);
4262 	return 0;
4263 }
4264 
4265 #if __FreeBSD_version < 1100095
4266 static int
4267 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4268 {
4269 	struct hn_softc *sc = arg1;
4270 	int ofs = arg2, i, error;
4271 	struct hn_rx_ring *rxr;
4272 	uint64_t stat;
4273 
4274 	stat = 0;
4275 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4276 		rxr = &sc->hn_rx_ring[i];
4277 		stat += *((int *)((uint8_t *)rxr + ofs));
4278 	}
4279 
4280 	error = sysctl_handle_64(oidp, &stat, 0, req);
4281 	if (error || req->newptr == NULL)
4282 		return error;
4283 
4284 	/* Zero out this stat. */
4285 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4286 		rxr = &sc->hn_rx_ring[i];
4287 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4288 	}
4289 	return 0;
4290 }
4291 #else
4292 static int
4293 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4294 {
4295 	struct hn_softc *sc = arg1;
4296 	int ofs = arg2, i, error;
4297 	struct hn_rx_ring *rxr;
4298 	uint64_t stat;
4299 
4300 	stat = 0;
4301 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4302 		rxr = &sc->hn_rx_ring[i];
4303 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4304 	}
4305 
4306 	error = sysctl_handle_64(oidp, &stat, 0, req);
4307 	if (error || req->newptr == NULL)
4308 		return error;
4309 
4310 	/* Zero out this stat. */
4311 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4312 		rxr = &sc->hn_rx_ring[i];
4313 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4314 	}
4315 	return 0;
4316 }
4317 
4318 #endif
4319 
4320 static int
4321 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4322 {
4323 	struct hn_softc *sc = arg1;
4324 	int ofs = arg2, i, error;
4325 	struct hn_rx_ring *rxr;
4326 	u_long stat;
4327 
4328 	stat = 0;
4329 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4330 		rxr = &sc->hn_rx_ring[i];
4331 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4332 	}
4333 
4334 	error = sysctl_handle_long(oidp, &stat, 0, req);
4335 	if (error || req->newptr == NULL)
4336 		return error;
4337 
4338 	/* Zero out this stat. */
4339 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4340 		rxr = &sc->hn_rx_ring[i];
4341 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4342 	}
4343 	return 0;
4344 }
4345 
4346 static int
4347 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4348 {
4349 	struct hn_softc *sc = arg1;
4350 	int ofs = arg2, i, error;
4351 	struct hn_tx_ring *txr;
4352 	u_long stat;
4353 
4354 	stat = 0;
4355 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4356 		txr = &sc->hn_tx_ring[i];
4357 		stat += *((u_long *)((uint8_t *)txr + ofs));
4358 	}
4359 
4360 	error = sysctl_handle_long(oidp, &stat, 0, req);
4361 	if (error || req->newptr == NULL)
4362 		return error;
4363 
4364 	/* Zero out this stat. */
4365 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4366 		txr = &sc->hn_tx_ring[i];
4367 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4368 	}
4369 	return 0;
4370 }
4371 
4372 static int
4373 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4374 {
4375 	struct hn_softc *sc = arg1;
4376 	int ofs = arg2, i, error, conf;
4377 	struct hn_tx_ring *txr;
4378 
4379 	txr = &sc->hn_tx_ring[0];
4380 	conf = *((int *)((uint8_t *)txr + ofs));
4381 
4382 	error = sysctl_handle_int(oidp, &conf, 0, req);
4383 	if (error || req->newptr == NULL)
4384 		return error;
4385 
4386 	HN_LOCK(sc);
4387 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4388 		txr = &sc->hn_tx_ring[i];
4389 		*((int *)((uint8_t *)txr + ofs)) = conf;
4390 	}
4391 	HN_UNLOCK(sc);
4392 
4393 	return 0;
4394 }
4395 
4396 static int
4397 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4398 {
4399 	struct hn_softc *sc = arg1;
4400 	int error, size;
4401 
4402 	size = sc->hn_agg_size;
4403 	error = sysctl_handle_int(oidp, &size, 0, req);
4404 	if (error || req->newptr == NULL)
4405 		return (error);
4406 
4407 	HN_LOCK(sc);
4408 	sc->hn_agg_size = size;
4409 	hn_set_txagg(sc);
4410 	HN_UNLOCK(sc);
4411 
4412 	return (0);
4413 }
4414 
4415 static int
4416 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4417 {
4418 	struct hn_softc *sc = arg1;
4419 	int error, pkts;
4420 
4421 	pkts = sc->hn_agg_pkts;
4422 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4423 	if (error || req->newptr == NULL)
4424 		return (error);
4425 
4426 	HN_LOCK(sc);
4427 	sc->hn_agg_pkts = pkts;
4428 	hn_set_txagg(sc);
4429 	HN_UNLOCK(sc);
4430 
4431 	return (0);
4432 }
4433 
4434 static int
4435 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4436 {
4437 	struct hn_softc *sc = arg1;
4438 	int pkts;
4439 
4440 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4441 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4442 }
4443 
4444 static int
4445 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4446 {
4447 	struct hn_softc *sc = arg1;
4448 	int align;
4449 
4450 	align = sc->hn_tx_ring[0].hn_agg_align;
4451 	return (sysctl_handle_int(oidp, &align, 0, req));
4452 }
4453 
4454 static void
4455 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4456 {
4457 	if (pollhz == 0)
4458 		vmbus_chan_poll_disable(chan);
4459 	else
4460 		vmbus_chan_poll_enable(chan, pollhz);
4461 }
4462 
4463 static void
4464 hn_polling(struct hn_softc *sc, u_int pollhz)
4465 {
4466 	int nsubch = sc->hn_rx_ring_inuse - 1;
4467 
4468 	HN_LOCK_ASSERT(sc);
4469 
4470 	if (nsubch > 0) {
4471 		struct vmbus_channel **subch;
4472 		int i;
4473 
4474 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4475 		for (i = 0; i < nsubch; ++i)
4476 			hn_chan_polling(subch[i], pollhz);
4477 		vmbus_subchan_rel(subch, nsubch);
4478 	}
4479 	hn_chan_polling(sc->hn_prichan, pollhz);
4480 }
4481 
4482 static int
4483 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4484 {
4485 	struct hn_softc *sc = arg1;
4486 	int pollhz, error;
4487 
4488 	pollhz = sc->hn_pollhz;
4489 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4490 	if (error || req->newptr == NULL)
4491 		return (error);
4492 
4493 	if (pollhz != 0 &&
4494 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4495 		return (EINVAL);
4496 
4497 	HN_LOCK(sc);
4498 	if (sc->hn_pollhz != pollhz) {
4499 		sc->hn_pollhz = pollhz;
4500 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4501 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4502 			hn_polling(sc, sc->hn_pollhz);
4503 	}
4504 	HN_UNLOCK(sc);
4505 
4506 	return (0);
4507 }
4508 
4509 static int
4510 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4511 {
4512 	struct hn_softc *sc = arg1;
4513 	char verstr[16];
4514 
4515 	snprintf(verstr, sizeof(verstr), "%u.%u",
4516 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4517 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4518 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4519 }
4520 
4521 static int
4522 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4523 {
4524 	struct hn_softc *sc = arg1;
4525 	char caps_str[128];
4526 	uint32_t caps;
4527 
4528 	HN_LOCK(sc);
4529 	caps = sc->hn_caps;
4530 	HN_UNLOCK(sc);
4531 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4532 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4533 }
4534 
4535 static int
4536 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4537 {
4538 	struct hn_softc *sc = arg1;
4539 	char assist_str[128];
4540 	uint32_t hwassist;
4541 
4542 	HN_LOCK(sc);
4543 	hwassist = sc->hn_ifp->if_hwassist;
4544 	HN_UNLOCK(sc);
4545 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4546 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4547 }
4548 
4549 static int
4550 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4551 {
4552 	struct hn_softc *sc = arg1;
4553 	char filter_str[128];
4554 	uint32_t filter;
4555 
4556 	HN_LOCK(sc);
4557 	filter = sc->hn_rx_filter;
4558 	HN_UNLOCK(sc);
4559 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4560 	    NDIS_PACKET_TYPES);
4561 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4562 }
4563 
4564 #ifndef RSS
4565 
4566 static int
4567 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4568 {
4569 	struct hn_softc *sc = arg1;
4570 	int error;
4571 
4572 	HN_LOCK(sc);
4573 
4574 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4575 	if (error || req->newptr == NULL)
4576 		goto back;
4577 
4578 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4579 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4580 		/*
4581 		 * RSS key is synchronized w/ VF's, don't allow users
4582 		 * to change it.
4583 		 */
4584 		error = EBUSY;
4585 		goto back;
4586 	}
4587 
4588 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4589 	if (error)
4590 		goto back;
4591 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4592 
4593 	if (sc->hn_rx_ring_inuse > 1) {
4594 		error = hn_rss_reconfig(sc);
4595 	} else {
4596 		/* Not RSS capable, at least for now; just save the RSS key. */
4597 		error = 0;
4598 	}
4599 back:
4600 	HN_UNLOCK(sc);
4601 	return (error);
4602 }
4603 
4604 static int
4605 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4606 {
4607 	struct hn_softc *sc = arg1;
4608 	int error;
4609 
4610 	HN_LOCK(sc);
4611 
4612 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4613 	if (error || req->newptr == NULL)
4614 		goto back;
4615 
4616 	/*
4617 	 * Don't allow RSS indirect table change, if this interface is not
4618 	 * RSS capable currently.
4619 	 */
4620 	if (sc->hn_rx_ring_inuse == 1) {
4621 		error = EOPNOTSUPP;
4622 		goto back;
4623 	}
4624 
4625 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4626 	if (error)
4627 		goto back;
4628 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4629 
4630 	hn_rss_ind_fixup(sc);
4631 	error = hn_rss_reconfig(sc);
4632 back:
4633 	HN_UNLOCK(sc);
4634 	return (error);
4635 }
4636 
4637 #endif	/* !RSS */
4638 
4639 static int
4640 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4641 {
4642 	struct hn_softc *sc = arg1;
4643 	char hash_str[128];
4644 	uint32_t hash;
4645 
4646 	HN_LOCK(sc);
4647 	hash = sc->hn_rss_hash;
4648 	HN_UNLOCK(sc);
4649 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4650 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4651 }
4652 
4653 static int
4654 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4655 {
4656 	struct hn_softc *sc = arg1;
4657 	char hash_str[128];
4658 	uint32_t hash;
4659 
4660 	HN_LOCK(sc);
4661 	hash = sc->hn_rss_hcap;
4662 	HN_UNLOCK(sc);
4663 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4664 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4665 }
4666 
4667 static int
4668 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4669 {
4670 	struct hn_softc *sc = arg1;
4671 	char hash_str[128];
4672 	uint32_t hash;
4673 
4674 	HN_LOCK(sc);
4675 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4676 	HN_UNLOCK(sc);
4677 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4678 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4679 }
4680 
4681 static int
4682 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4683 {
4684 	struct hn_softc *sc = arg1;
4685 	char vf_name[IFNAMSIZ + 1];
4686 	struct ifnet *vf_ifp;
4687 
4688 	HN_LOCK(sc);
4689 	vf_name[0] = '\0';
4690 	vf_ifp = sc->hn_vf_ifp;
4691 	if (vf_ifp != NULL)
4692 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4693 	HN_UNLOCK(sc);
4694 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4695 }
4696 
4697 static int
4698 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4699 {
4700 	struct hn_softc *sc = arg1;
4701 	char vf_name[IFNAMSIZ + 1];
4702 	struct ifnet *vf_ifp;
4703 
4704 	HN_LOCK(sc);
4705 	vf_name[0] = '\0';
4706 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4707 	if (vf_ifp != NULL)
4708 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4709 	HN_UNLOCK(sc);
4710 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4711 }
4712 
4713 static int
4714 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4715 {
4716 	struct rm_priotracker pt;
4717 	struct sbuf *sb;
4718 	int error, i;
4719 	bool first;
4720 
4721 	error = sysctl_wire_old_buffer(req, 0);
4722 	if (error != 0)
4723 		return (error);
4724 
4725 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4726 	if (sb == NULL)
4727 		return (ENOMEM);
4728 
4729 	rm_rlock(&hn_vfmap_lock, &pt);
4730 
4731 	first = true;
4732 	for (i = 0; i < hn_vfmap_size; ++i) {
4733 		struct ifnet *ifp;
4734 
4735 		if (hn_vfmap[i] == NULL)
4736 			continue;
4737 
4738 		ifp = ifnet_byindex(i);
4739 		if (ifp != NULL) {
4740 			if (first)
4741 				sbuf_printf(sb, "%s", ifp->if_xname);
4742 			else
4743 				sbuf_printf(sb, " %s", ifp->if_xname);
4744 			first = false;
4745 		}
4746 	}
4747 
4748 	rm_runlock(&hn_vfmap_lock, &pt);
4749 
4750 	error = sbuf_finish(sb);
4751 	sbuf_delete(sb);
4752 	return (error);
4753 }
4754 
4755 static int
4756 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4757 {
4758 	struct rm_priotracker pt;
4759 	struct sbuf *sb;
4760 	int error, i;
4761 	bool first;
4762 
4763 	error = sysctl_wire_old_buffer(req, 0);
4764 	if (error != 0)
4765 		return (error);
4766 
4767 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4768 	if (sb == NULL)
4769 		return (ENOMEM);
4770 
4771 	rm_rlock(&hn_vfmap_lock, &pt);
4772 
4773 	first = true;
4774 	for (i = 0; i < hn_vfmap_size; ++i) {
4775 		struct ifnet *ifp, *hn_ifp;
4776 
4777 		hn_ifp = hn_vfmap[i];
4778 		if (hn_ifp == NULL)
4779 			continue;
4780 
4781 		ifp = ifnet_byindex(i);
4782 		if (ifp != NULL) {
4783 			if (first) {
4784 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4785 				    hn_ifp->if_xname);
4786 			} else {
4787 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4788 				    hn_ifp->if_xname);
4789 			}
4790 			first = false;
4791 		}
4792 	}
4793 
4794 	rm_runlock(&hn_vfmap_lock, &pt);
4795 
4796 	error = sbuf_finish(sb);
4797 	sbuf_delete(sb);
4798 	return (error);
4799 }
4800 
4801 static int
4802 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4803 {
4804 	struct hn_softc *sc = arg1;
4805 	int error, onoff = 0;
4806 
4807 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4808 		onoff = 1;
4809 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4810 	if (error || req->newptr == NULL)
4811 		return (error);
4812 
4813 	HN_LOCK(sc);
4814 	/* NOTE: hn_vf_lock for hn_transmit() */
4815 	rm_wlock(&sc->hn_vf_lock);
4816 	if (onoff)
4817 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4818 	else
4819 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4820 	rm_wunlock(&sc->hn_vf_lock);
4821 	HN_UNLOCK(sc);
4822 
4823 	return (0);
4824 }
4825 
4826 static int
4827 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4828 {
4829 	struct hn_softc *sc = arg1;
4830 	int enabled = 0;
4831 
4832 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4833 		enabled = 1;
4834 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4835 }
4836 
4837 static int
4838 hn_check_iplen(const struct mbuf *m, int hoff)
4839 {
4840 	const struct ip *ip;
4841 	int len, iphlen, iplen;
4842 	const struct tcphdr *th;
4843 	int thoff;				/* TCP data offset */
4844 
4845 	len = hoff + sizeof(struct ip);
4846 
4847 	/* The packet must be at least the size of an IP header. */
4848 	if (m->m_pkthdr.len < len)
4849 		return IPPROTO_DONE;
4850 
4851 	/* The fixed IP header must reside completely in the first mbuf. */
4852 	if (m->m_len < len)
4853 		return IPPROTO_DONE;
4854 
4855 	ip = mtodo(m, hoff);
4856 
4857 	/* Bound check the packet's stated IP header length. */
4858 	iphlen = ip->ip_hl << 2;
4859 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4860 		return IPPROTO_DONE;
4861 
4862 	/* The full IP header must reside completely in the one mbuf. */
4863 	if (m->m_len < hoff + iphlen)
4864 		return IPPROTO_DONE;
4865 
4866 	iplen = ntohs(ip->ip_len);
4867 
4868 	/*
4869 	 * Check that the amount of data in the buffers is as
4870 	 * at least much as the IP header would have us expect.
4871 	 */
4872 	if (m->m_pkthdr.len < hoff + iplen)
4873 		return IPPROTO_DONE;
4874 
4875 	/*
4876 	 * Ignore IP fragments.
4877 	 */
4878 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4879 		return IPPROTO_DONE;
4880 
4881 	/*
4882 	 * The TCP/IP or UDP/IP header must be entirely contained within
4883 	 * the first fragment of a packet.
4884 	 */
4885 	switch (ip->ip_p) {
4886 	case IPPROTO_TCP:
4887 		if (iplen < iphlen + sizeof(struct tcphdr))
4888 			return IPPROTO_DONE;
4889 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4890 			return IPPROTO_DONE;
4891 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4892 		thoff = th->th_off << 2;
4893 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4894 			return IPPROTO_DONE;
4895 		if (m->m_len < hoff + iphlen + thoff)
4896 			return IPPROTO_DONE;
4897 		break;
4898 	case IPPROTO_UDP:
4899 		if (iplen < iphlen + sizeof(struct udphdr))
4900 			return IPPROTO_DONE;
4901 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4902 			return IPPROTO_DONE;
4903 		break;
4904 	default:
4905 		if (iplen < iphlen)
4906 			return IPPROTO_DONE;
4907 		break;
4908 	}
4909 	return ip->ip_p;
4910 }
4911 
4912 static void
4913 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4914 {
4915 	const struct ether_header *eh;
4916 	uint16_t etype;
4917 	int hoff;
4918 
4919 	hoff = sizeof(*eh);
4920 	/* Checked at the beginning of this function. */
4921 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4922 
4923 	eh = mtod(m_new, const struct ether_header *);
4924 	etype = ntohs(eh->ether_type);
4925 	if (etype == ETHERTYPE_VLAN) {
4926 		const struct ether_vlan_header *evl;
4927 
4928 		hoff = sizeof(*evl);
4929 		if (m_new->m_len < hoff)
4930 			return;
4931 		evl = mtod(m_new, const struct ether_vlan_header *);
4932 		etype = ntohs(evl->evl_proto);
4933 	}
4934 	*l3proto = etype;
4935 
4936 	if (etype == ETHERTYPE_IP)
4937 		*l4proto = hn_check_iplen(m_new, hoff);
4938 	else
4939 		*l4proto = IPPROTO_DONE;
4940 }
4941 
4942 static int
4943 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4944 {
4945 	struct sysctl_oid_list *child;
4946 	struct sysctl_ctx_list *ctx;
4947 	device_t dev = sc->hn_dev;
4948 #if defined(INET) || defined(INET6)
4949 #if __FreeBSD_version >= 1100095
4950 	int lroent_cnt;
4951 #endif
4952 #endif
4953 	int i;
4954 
4955 	/*
4956 	 * Create RXBUF for reception.
4957 	 *
4958 	 * NOTE:
4959 	 * - It is shared by all channels.
4960 	 * - A large enough buffer is allocated, certain version of NVSes
4961 	 *   may further limit the usable space.
4962 	 */
4963 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4964 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4965 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4966 	if (sc->hn_rxbuf == NULL) {
4967 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4968 		return (ENOMEM);
4969 	}
4970 
4971 	sc->hn_rx_ring_cnt = ring_cnt;
4972 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4973 
4974 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4975 	    M_DEVBUF, M_WAITOK | M_ZERO);
4976 
4977 #if defined(INET) || defined(INET6)
4978 #if __FreeBSD_version >= 1100095
4979 	lroent_cnt = hn_lro_entry_count;
4980 	if (lroent_cnt < TCP_LRO_ENTRIES)
4981 		lroent_cnt = TCP_LRO_ENTRIES;
4982 	if (bootverbose)
4983 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4984 #endif
4985 #endif	/* INET || INET6 */
4986 
4987 	ctx = device_get_sysctl_ctx(dev);
4988 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4989 
4990 	/* Create dev.hn.UNIT.rx sysctl tree */
4991 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4992 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4993 
4994 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4995 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4996 
4997 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4998 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4999 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5000 		if (rxr->hn_br == NULL) {
5001 			device_printf(dev, "allocate bufring failed\n");
5002 			return (ENOMEM);
5003 		}
5004 
5005 		if (hn_trust_hosttcp)
5006 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5007 		if (hn_trust_hostudp)
5008 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5009 		if (hn_trust_hostip)
5010 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5011 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5012 		rxr->hn_ifp = sc->hn_ifp;
5013 		if (i < sc->hn_tx_ring_cnt)
5014 			rxr->hn_txr = &sc->hn_tx_ring[i];
5015 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5016 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5017 		rxr->hn_rx_idx = i;
5018 		rxr->hn_rxbuf = sc->hn_rxbuf;
5019 
5020 		/*
5021 		 * Initialize LRO.
5022 		 */
5023 #if defined(INET) || defined(INET6)
5024 #if __FreeBSD_version >= 1100095
5025 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5026 		    hn_lro_mbufq_depth);
5027 #else
5028 		tcp_lro_init(&rxr->hn_lro);
5029 		rxr->hn_lro.ifp = sc->hn_ifp;
5030 #endif
5031 #if __FreeBSD_version >= 1100099
5032 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5033 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5034 #endif
5035 #endif	/* INET || INET6 */
5036 
5037 		if (sc->hn_rx_sysctl_tree != NULL) {
5038 			char name[16];
5039 
5040 			/*
5041 			 * Create per RX ring sysctl tree:
5042 			 * dev.hn.UNIT.rx.RINGID
5043 			 */
5044 			snprintf(name, sizeof(name), "%d", i);
5045 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5046 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5047 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5048 
5049 			if (rxr->hn_rx_sysctl_tree != NULL) {
5050 				SYSCTL_ADD_ULONG(ctx,
5051 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5052 				    OID_AUTO, "packets", CTLFLAG_RW,
5053 				    &rxr->hn_pkts, "# of packets received");
5054 				SYSCTL_ADD_ULONG(ctx,
5055 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5056 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5057 				    &rxr->hn_rss_pkts,
5058 				    "# of packets w/ RSS info received");
5059 				SYSCTL_ADD_ULONG(ctx,
5060 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5061 				    OID_AUTO, "rsc_pkts", CTLFLAG_RW,
5062 				    &rxr->hn_rsc_pkts,
5063 				    "# of RSC packets received");
5064 				SYSCTL_ADD_ULONG(ctx,
5065 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5066 				    OID_AUTO, "rsc_drop", CTLFLAG_RW,
5067 				    &rxr->hn_rsc_drop,
5068 				    "# of RSC fragments dropped");
5069 				SYSCTL_ADD_INT(ctx,
5070 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5071 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5072 				    &rxr->hn_pktbuf_len, 0,
5073 				    "Temporary channel packet buffer length");
5074 			}
5075 		}
5076 	}
5077 
5078 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5079 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5080 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5081 #if __FreeBSD_version < 1100095
5082 	    hn_rx_stat_int_sysctl,
5083 #else
5084 	    hn_rx_stat_u64_sysctl,
5085 #endif
5086 	    "LU", "LRO queued");
5087 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5088 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5089 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5090 #if __FreeBSD_version < 1100095
5091 	    hn_rx_stat_int_sysctl,
5092 #else
5093 	    hn_rx_stat_u64_sysctl,
5094 #endif
5095 	    "LU", "LRO flushed");
5096 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5097 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5098 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5099 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5100 #if __FreeBSD_version >= 1100099
5101 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5102 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5103 	    hn_lro_lenlim_sysctl, "IU",
5104 	    "Max # of data bytes to be aggregated by LRO");
5105 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5106 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5107 	    hn_lro_ackcnt_sysctl, "I",
5108 	    "Max # of ACKs to be aggregated by LRO");
5109 #endif
5110 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5111 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5112 	    hn_trust_hcsum_sysctl, "I",
5113 	    "Trust tcp segement verification on host side, "
5114 	    "when csum info is missing");
5115 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5116 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5117 	    hn_trust_hcsum_sysctl, "I",
5118 	    "Trust udp datagram verification on host side, "
5119 	    "when csum info is missing");
5120 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5121 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5122 	    hn_trust_hcsum_sysctl, "I",
5123 	    "Trust ip packet verification on host side, "
5124 	    "when csum info is missing");
5125 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5126 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5127 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5128 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5129 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5130 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5131 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5132 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5133 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5134 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5135 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5136 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5137 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5138 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5139 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5140 	    hn_rx_stat_ulong_sysctl, "LU",
5141 	    "# of packets that we trust host's csum verification");
5142 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5143 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5144 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5145 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5146 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5147 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5148 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5149 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5150 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5151 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5152 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5153 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5154 
5155 	return (0);
5156 }
5157 
5158 static void
5159 hn_destroy_rx_data(struct hn_softc *sc)
5160 {
5161 	int i;
5162 
5163 	if (sc->hn_rxbuf != NULL) {
5164 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5165 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5166 		else
5167 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5168 		sc->hn_rxbuf = NULL;
5169 	}
5170 
5171 	if (sc->hn_rx_ring_cnt == 0)
5172 		return;
5173 
5174 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5175 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5176 
5177 		if (rxr->hn_br == NULL)
5178 			continue;
5179 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5180 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5181 		} else {
5182 			device_printf(sc->hn_dev,
5183 			    "%dth channel bufring is referenced", i);
5184 		}
5185 		rxr->hn_br = NULL;
5186 
5187 #if defined(INET) || defined(INET6)
5188 		tcp_lro_free(&rxr->hn_lro);
5189 #endif
5190 		free(rxr->hn_pktbuf, M_DEVBUF);
5191 	}
5192 	free(sc->hn_rx_ring, M_DEVBUF);
5193 	sc->hn_rx_ring = NULL;
5194 
5195 	sc->hn_rx_ring_cnt = 0;
5196 	sc->hn_rx_ring_inuse = 0;
5197 }
5198 
5199 static int
5200 hn_tx_ring_create(struct hn_softc *sc, int id)
5201 {
5202 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5203 	device_t dev = sc->hn_dev;
5204 	bus_dma_tag_t parent_dtag;
5205 	int error, i;
5206 
5207 	txr->hn_sc = sc;
5208 	txr->hn_tx_idx = id;
5209 
5210 #ifndef HN_USE_TXDESC_BUFRING
5211 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5212 #endif
5213 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5214 
5215 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5216 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5217 	    M_DEVBUF, M_WAITOK | M_ZERO);
5218 #ifndef HN_USE_TXDESC_BUFRING
5219 	SLIST_INIT(&txr->hn_txlist);
5220 #else
5221 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5222 	    M_WAITOK, &txr->hn_tx_lock);
5223 #endif
5224 
5225 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5226 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5227 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5228 	} else {
5229 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5230 	}
5231 
5232 #ifdef HN_IFSTART_SUPPORT
5233 	if (hn_use_if_start) {
5234 		txr->hn_txeof = hn_start_txeof;
5235 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5236 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5237 	} else
5238 #endif
5239 	{
5240 		int br_depth;
5241 
5242 		txr->hn_txeof = hn_xmit_txeof;
5243 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5244 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5245 
5246 		br_depth = hn_get_txswq_depth(txr);
5247 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5248 		    M_WAITOK, &txr->hn_tx_lock);
5249 	}
5250 
5251 	txr->hn_direct_tx_size = hn_direct_tx_size;
5252 
5253 	/*
5254 	 * Always schedule transmission instead of trying to do direct
5255 	 * transmission.  This one gives the best performance so far.
5256 	 */
5257 	txr->hn_sched_tx = 1;
5258 
5259 	parent_dtag = bus_get_dma_tag(dev);
5260 
5261 	/* DMA tag for RNDIS packet messages. */
5262 	error = bus_dma_tag_create(parent_dtag, /* parent */
5263 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5264 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5265 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5266 	    BUS_SPACE_MAXADDR,		/* highaddr */
5267 	    NULL, NULL,			/* filter, filterarg */
5268 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5269 	    1,				/* nsegments */
5270 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5271 	    0,				/* flags */
5272 	    NULL,			/* lockfunc */
5273 	    NULL,			/* lockfuncarg */
5274 	    &txr->hn_tx_rndis_dtag);
5275 	if (error) {
5276 		device_printf(dev, "failed to create rndis dmatag\n");
5277 		return error;
5278 	}
5279 
5280 	/* DMA tag for data. */
5281 	error = bus_dma_tag_create(parent_dtag, /* parent */
5282 	    1,				/* alignment */
5283 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5284 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5285 	    BUS_SPACE_MAXADDR,		/* highaddr */
5286 	    NULL, NULL,			/* filter, filterarg */
5287 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5288 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5289 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5290 	    0,				/* flags */
5291 	    NULL,			/* lockfunc */
5292 	    NULL,			/* lockfuncarg */
5293 	    &txr->hn_tx_data_dtag);
5294 	if (error) {
5295 		device_printf(dev, "failed to create data dmatag\n");
5296 		return error;
5297 	}
5298 
5299 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5300 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5301 
5302 		txd->txr = txr;
5303 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5304 		STAILQ_INIT(&txd->agg_list);
5305 
5306 		/*
5307 		 * Allocate and load RNDIS packet message.
5308 		 */
5309         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5310 		    (void **)&txd->rndis_pkt,
5311 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5312 		    &txd->rndis_pkt_dmap);
5313 		if (error) {
5314 			device_printf(dev,
5315 			    "failed to allocate rndis_packet_msg, %d\n", i);
5316 			return error;
5317 		}
5318 
5319 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5320 		    txd->rndis_pkt_dmap,
5321 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5322 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5323 		    BUS_DMA_NOWAIT);
5324 		if (error) {
5325 			device_printf(dev,
5326 			    "failed to load rndis_packet_msg, %d\n", i);
5327 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5328 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5329 			return error;
5330 		}
5331 
5332 		/* DMA map for TX data. */
5333 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5334 		    &txd->data_dmap);
5335 		if (error) {
5336 			device_printf(dev,
5337 			    "failed to allocate tx data dmamap\n");
5338 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5339 			    txd->rndis_pkt_dmap);
5340 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5341 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5342 			return error;
5343 		}
5344 
5345 		/* All set, put it to list */
5346 		txd->flags |= HN_TXD_FLAG_ONLIST;
5347 #ifndef HN_USE_TXDESC_BUFRING
5348 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5349 #else
5350 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5351 #endif
5352 	}
5353 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5354 
5355 	if (sc->hn_tx_sysctl_tree != NULL) {
5356 		struct sysctl_oid_list *child;
5357 		struct sysctl_ctx_list *ctx;
5358 		char name[16];
5359 
5360 		/*
5361 		 * Create per TX ring sysctl tree:
5362 		 * dev.hn.UNIT.tx.RINGID
5363 		 */
5364 		ctx = device_get_sysctl_ctx(dev);
5365 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5366 
5367 		snprintf(name, sizeof(name), "%d", id);
5368 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5369 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5370 
5371 		if (txr->hn_tx_sysctl_tree != NULL) {
5372 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5373 
5374 #ifdef HN_DEBUG
5375 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5376 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5377 			    "# of available TX descs");
5378 #endif
5379 #ifdef HN_IFSTART_SUPPORT
5380 			if (!hn_use_if_start)
5381 #endif
5382 			{
5383 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5384 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5385 				    "over active");
5386 			}
5387 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5388 			    CTLFLAG_RW, &txr->hn_pkts,
5389 			    "# of packets transmitted");
5390 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5391 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5392 		}
5393 	}
5394 
5395 	return 0;
5396 }
5397 
5398 static void
5399 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5400 {
5401 	struct hn_tx_ring *txr = txd->txr;
5402 
5403 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5404 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5405 
5406 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5407 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5408 	    txd->rndis_pkt_dmap);
5409 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5410 }
5411 
5412 static void
5413 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5414 {
5415 
5416 	KASSERT(txd->refs == 0 || txd->refs == 1,
5417 	    ("invalid txd refs %d", txd->refs));
5418 
5419 	/* Aggregated txds will be freed by their aggregating txd. */
5420 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5421 		int freed;
5422 
5423 		freed = hn_txdesc_put(txr, txd);
5424 		KASSERT(freed, ("can't free txdesc"));
5425 	}
5426 }
5427 
5428 static void
5429 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5430 {
5431 	int i;
5432 
5433 	if (txr->hn_txdesc == NULL)
5434 		return;
5435 
5436 	/*
5437 	 * NOTE:
5438 	 * Because the freeing of aggregated txds will be deferred
5439 	 * to the aggregating txd, two passes are used here:
5440 	 * - The first pass GCes any pending txds.  This GC is necessary,
5441 	 *   since if the channels are revoked, hypervisor will not
5442 	 *   deliver send-done for all pending txds.
5443 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5444 	 *   were freed.
5445 	 */
5446 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5447 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5448 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5449 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5450 
5451 	if (txr->hn_tx_data_dtag != NULL)
5452 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5453 	if (txr->hn_tx_rndis_dtag != NULL)
5454 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5455 
5456 #ifdef HN_USE_TXDESC_BUFRING
5457 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5458 #endif
5459 
5460 	free(txr->hn_txdesc, M_DEVBUF);
5461 	txr->hn_txdesc = NULL;
5462 
5463 	if (txr->hn_mbuf_br != NULL)
5464 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5465 
5466 #ifndef HN_USE_TXDESC_BUFRING
5467 	mtx_destroy(&txr->hn_txlist_spin);
5468 #endif
5469 	mtx_destroy(&txr->hn_tx_lock);
5470 }
5471 
5472 static int
5473 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5474 {
5475 	struct sysctl_oid_list *child;
5476 	struct sysctl_ctx_list *ctx;
5477 	int i;
5478 
5479 	/*
5480 	 * Create TXBUF for chimney sending.
5481 	 *
5482 	 * NOTE: It is shared by all channels.
5483 	 */
5484 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5485 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5486 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5487 	if (sc->hn_chim == NULL) {
5488 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5489 		return (ENOMEM);
5490 	}
5491 
5492 	sc->hn_tx_ring_cnt = ring_cnt;
5493 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5494 
5495 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5496 	    M_DEVBUF, M_WAITOK | M_ZERO);
5497 
5498 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5499 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5500 
5501 	/* Create dev.hn.UNIT.tx sysctl tree */
5502 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5503 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5504 
5505 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5506 		int error;
5507 
5508 		error = hn_tx_ring_create(sc, i);
5509 		if (error)
5510 			return error;
5511 	}
5512 
5513 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5514 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5515 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5516 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5517 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5518 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5519 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5520 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5521 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5522 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5523 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5524 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5525 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5526 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5527 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5528 	    hn_tx_stat_ulong_sysctl, "LU",
5529 	    "# of packet transmission aggregation flush failure");
5530 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5531 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5532 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5533 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5534 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5535 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5536 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5537 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5538 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5539 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5540 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5541 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5542 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5543 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5544 	    "# of total TX descs");
5545 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5546 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5547 	    "Chimney send packet size upper boundary");
5548 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5549 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5550 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5551 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5552 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5553 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5554 	    hn_tx_conf_int_sysctl, "I",
5555 	    "Size of the packet for direct transmission");
5556 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5557 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5558 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5559 	    hn_tx_conf_int_sysctl, "I",
5560 	    "Always schedule transmission "
5561 	    "instead of doing direct transmission");
5562 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5563 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5564 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5565 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5566 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5567 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5568 	    "Applied packet transmission aggregation size");
5569 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5570 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5571 	    hn_txagg_pktmax_sysctl, "I",
5572 	    "Applied packet transmission aggregation packets");
5573 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5574 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5575 	    hn_txagg_align_sysctl, "I",
5576 	    "Applied packet transmission aggregation alignment");
5577 
5578 	return 0;
5579 }
5580 
5581 static void
5582 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5583 {
5584 	int i;
5585 
5586 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5587 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5588 }
5589 
5590 static void
5591 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5592 {
5593 	struct ifnet *ifp = sc->hn_ifp;
5594 	u_int hw_tsomax;
5595 	int tso_minlen;
5596 
5597 	HN_LOCK_ASSERT(sc);
5598 
5599 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5600 		return;
5601 
5602 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5603 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5604 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5605 
5606 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5607 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5608 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5609 
5610 	if (tso_maxlen < tso_minlen)
5611 		tso_maxlen = tso_minlen;
5612 	else if (tso_maxlen > IP_MAXPACKET)
5613 		tso_maxlen = IP_MAXPACKET;
5614 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5615 		tso_maxlen = sc->hn_ndis_tso_szmax;
5616 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5617 
5618 	if (hn_xpnt_vf_isready(sc)) {
5619 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5620 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5621 	}
5622 	ifp->if_hw_tsomax = hw_tsomax;
5623 	if (bootverbose)
5624 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5625 }
5626 
5627 static void
5628 hn_fixup_tx_data(struct hn_softc *sc)
5629 {
5630 	uint64_t csum_assist;
5631 	int i;
5632 
5633 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5634 	if (hn_tx_chimney_size > 0 &&
5635 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5636 		hn_set_chim_size(sc, hn_tx_chimney_size);
5637 
5638 	csum_assist = 0;
5639 	if (sc->hn_caps & HN_CAP_IPCS)
5640 		csum_assist |= CSUM_IP;
5641 	if (sc->hn_caps & HN_CAP_TCP4CS)
5642 		csum_assist |= CSUM_IP_TCP;
5643 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5644 		csum_assist |= CSUM_IP_UDP;
5645 	if (sc->hn_caps & HN_CAP_TCP6CS)
5646 		csum_assist |= CSUM_IP6_TCP;
5647 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5648 		csum_assist |= CSUM_IP6_UDP;
5649 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5650 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5651 
5652 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5653 		/*
5654 		 * Support HASHVAL pktinfo on TX path.
5655 		 */
5656 		if (bootverbose)
5657 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5658 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5659 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5660 	}
5661 }
5662 
5663 static void
5664 hn_fixup_rx_data(struct hn_softc *sc)
5665 {
5666 
5667 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5668 		int i;
5669 
5670 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5671 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5672 	}
5673 }
5674 
5675 static void
5676 hn_destroy_tx_data(struct hn_softc *sc)
5677 {
5678 	int i;
5679 
5680 	if (sc->hn_chim != NULL) {
5681 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5682 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5683 		} else {
5684 			device_printf(sc->hn_dev,
5685 			    "chimney sending buffer is referenced");
5686 		}
5687 		sc->hn_chim = NULL;
5688 	}
5689 
5690 	if (sc->hn_tx_ring_cnt == 0)
5691 		return;
5692 
5693 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5694 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5695 
5696 	free(sc->hn_tx_ring, M_DEVBUF);
5697 	sc->hn_tx_ring = NULL;
5698 
5699 	sc->hn_tx_ring_cnt = 0;
5700 	sc->hn_tx_ring_inuse = 0;
5701 }
5702 
5703 #ifdef HN_IFSTART_SUPPORT
5704 
5705 static void
5706 hn_start_taskfunc(void *xtxr, int pending __unused)
5707 {
5708 	struct hn_tx_ring *txr = xtxr;
5709 
5710 	mtx_lock(&txr->hn_tx_lock);
5711 	hn_start_locked(txr, 0);
5712 	mtx_unlock(&txr->hn_tx_lock);
5713 }
5714 
5715 static int
5716 hn_start_locked(struct hn_tx_ring *txr, int len)
5717 {
5718 	struct hn_softc *sc = txr->hn_sc;
5719 	struct ifnet *ifp = sc->hn_ifp;
5720 	int sched = 0;
5721 
5722 	KASSERT(hn_use_if_start,
5723 	    ("hn_start_locked is called, when if_start is disabled"));
5724 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5725 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5726 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5727 
5728 	if (__predict_false(txr->hn_suspended))
5729 		return (0);
5730 
5731 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5732 	    IFF_DRV_RUNNING)
5733 		return (0);
5734 
5735 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5736 		struct hn_txdesc *txd;
5737 		struct mbuf *m_head;
5738 		int error;
5739 
5740 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5741 		if (m_head == NULL)
5742 			break;
5743 
5744 		if (len > 0 && m_head->m_pkthdr.len > len) {
5745 			/*
5746 			 * This sending could be time consuming; let callers
5747 			 * dispatch this packet sending (and sending of any
5748 			 * following up packets) to tx taskqueue.
5749 			 */
5750 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5751 			sched = 1;
5752 			break;
5753 		}
5754 
5755 #if defined(INET6) || defined(INET)
5756 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5757 			m_head = hn_tso_fixup(m_head);
5758 			if (__predict_false(m_head == NULL)) {
5759 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5760 				continue;
5761 			}
5762 		} else if (m_head->m_pkthdr.csum_flags &
5763 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5764 			m_head = hn_set_hlen(m_head);
5765 			if (__predict_false(m_head == NULL)) {
5766 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5767 				continue;
5768 			}
5769 		}
5770 #endif
5771 
5772 		txd = hn_txdesc_get(txr);
5773 		if (txd == NULL) {
5774 			txr->hn_no_txdescs++;
5775 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5776 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5777 			break;
5778 		}
5779 
5780 		error = hn_encap(ifp, txr, txd, &m_head);
5781 		if (error) {
5782 			/* Both txd and m_head are freed */
5783 			KASSERT(txr->hn_agg_txd == NULL,
5784 			    ("encap failed w/ pending aggregating txdesc"));
5785 			continue;
5786 		}
5787 
5788 		if (txr->hn_agg_pktleft == 0) {
5789 			if (txr->hn_agg_txd != NULL) {
5790 				KASSERT(m_head == NULL,
5791 				    ("pending mbuf for aggregating txdesc"));
5792 				error = hn_flush_txagg(ifp, txr);
5793 				if (__predict_false(error)) {
5794 					atomic_set_int(&ifp->if_drv_flags,
5795 					    IFF_DRV_OACTIVE);
5796 					break;
5797 				}
5798 			} else {
5799 				KASSERT(m_head != NULL, ("mbuf was freed"));
5800 				error = hn_txpkt(ifp, txr, txd);
5801 				if (__predict_false(error)) {
5802 					/* txd is freed, but m_head is not */
5803 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5804 					atomic_set_int(&ifp->if_drv_flags,
5805 					    IFF_DRV_OACTIVE);
5806 					break;
5807 				}
5808 			}
5809 		}
5810 #ifdef INVARIANTS
5811 		else {
5812 			KASSERT(txr->hn_agg_txd != NULL,
5813 			    ("no aggregating txdesc"));
5814 			KASSERT(m_head == NULL,
5815 			    ("pending mbuf for aggregating txdesc"));
5816 		}
5817 #endif
5818 	}
5819 
5820 	/* Flush pending aggerated transmission. */
5821 	if (txr->hn_agg_txd != NULL)
5822 		hn_flush_txagg(ifp, txr);
5823 	return (sched);
5824 }
5825 
5826 static void
5827 hn_start(struct ifnet *ifp)
5828 {
5829 	struct hn_softc *sc = ifp->if_softc;
5830 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5831 
5832 	if (txr->hn_sched_tx)
5833 		goto do_sched;
5834 
5835 	if (mtx_trylock(&txr->hn_tx_lock)) {
5836 		int sched;
5837 
5838 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5839 		mtx_unlock(&txr->hn_tx_lock);
5840 		if (!sched)
5841 			return;
5842 	}
5843 do_sched:
5844 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5845 }
5846 
5847 static void
5848 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5849 {
5850 	struct hn_tx_ring *txr = xtxr;
5851 
5852 	mtx_lock(&txr->hn_tx_lock);
5853 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5854 	hn_start_locked(txr, 0);
5855 	mtx_unlock(&txr->hn_tx_lock);
5856 }
5857 
5858 static void
5859 hn_start_txeof(struct hn_tx_ring *txr)
5860 {
5861 	struct hn_softc *sc = txr->hn_sc;
5862 	struct ifnet *ifp = sc->hn_ifp;
5863 
5864 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5865 
5866 	if (txr->hn_sched_tx)
5867 		goto do_sched;
5868 
5869 	if (mtx_trylock(&txr->hn_tx_lock)) {
5870 		int sched;
5871 
5872 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5873 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5874 		mtx_unlock(&txr->hn_tx_lock);
5875 		if (sched) {
5876 			taskqueue_enqueue(txr->hn_tx_taskq,
5877 			    &txr->hn_tx_task);
5878 		}
5879 	} else {
5880 do_sched:
5881 		/*
5882 		 * Release the OACTIVE earlier, with the hope, that
5883 		 * others could catch up.  The task will clear the
5884 		 * flag again with the hn_tx_lock to avoid possible
5885 		 * races.
5886 		 */
5887 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5888 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5889 	}
5890 }
5891 
5892 #endif	/* HN_IFSTART_SUPPORT */
5893 
5894 static int
5895 hn_xmit(struct hn_tx_ring *txr, int len)
5896 {
5897 	struct hn_softc *sc = txr->hn_sc;
5898 	struct ifnet *ifp = sc->hn_ifp;
5899 	struct mbuf *m_head;
5900 	int sched = 0;
5901 
5902 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5903 #ifdef HN_IFSTART_SUPPORT
5904 	KASSERT(hn_use_if_start == 0,
5905 	    ("hn_xmit is called, when if_start is enabled"));
5906 #endif
5907 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5908 
5909 	if (__predict_false(txr->hn_suspended))
5910 		return (0);
5911 
5912 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5913 		return (0);
5914 
5915 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5916 		struct hn_txdesc *txd;
5917 		int error;
5918 
5919 		if (len > 0 && m_head->m_pkthdr.len > len) {
5920 			/*
5921 			 * This sending could be time consuming; let callers
5922 			 * dispatch this packet sending (and sending of any
5923 			 * following up packets) to tx taskqueue.
5924 			 */
5925 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5926 			sched = 1;
5927 			break;
5928 		}
5929 
5930 		txd = hn_txdesc_get(txr);
5931 		if (txd == NULL) {
5932 			txr->hn_no_txdescs++;
5933 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5934 			txr->hn_oactive = 1;
5935 			break;
5936 		}
5937 
5938 		error = hn_encap(ifp, txr, txd, &m_head);
5939 		if (error) {
5940 			/* Both txd and m_head are freed; discard */
5941 			KASSERT(txr->hn_agg_txd == NULL,
5942 			    ("encap failed w/ pending aggregating txdesc"));
5943 			drbr_advance(ifp, txr->hn_mbuf_br);
5944 			continue;
5945 		}
5946 
5947 		if (txr->hn_agg_pktleft == 0) {
5948 			if (txr->hn_agg_txd != NULL) {
5949 				KASSERT(m_head == NULL,
5950 				    ("pending mbuf for aggregating txdesc"));
5951 				error = hn_flush_txagg(ifp, txr);
5952 				if (__predict_false(error)) {
5953 					txr->hn_oactive = 1;
5954 					break;
5955 				}
5956 			} else {
5957 				KASSERT(m_head != NULL, ("mbuf was freed"));
5958 				error = hn_txpkt(ifp, txr, txd);
5959 				if (__predict_false(error)) {
5960 					/* txd is freed, but m_head is not */
5961 					drbr_putback(ifp, txr->hn_mbuf_br,
5962 					    m_head);
5963 					txr->hn_oactive = 1;
5964 					break;
5965 				}
5966 			}
5967 		}
5968 #ifdef INVARIANTS
5969 		else {
5970 			KASSERT(txr->hn_agg_txd != NULL,
5971 			    ("no aggregating txdesc"));
5972 			KASSERT(m_head == NULL,
5973 			    ("pending mbuf for aggregating txdesc"));
5974 		}
5975 #endif
5976 
5977 		/* Sent */
5978 		drbr_advance(ifp, txr->hn_mbuf_br);
5979 	}
5980 
5981 	/* Flush pending aggerated transmission. */
5982 	if (txr->hn_agg_txd != NULL)
5983 		hn_flush_txagg(ifp, txr);
5984 	return (sched);
5985 }
5986 
5987 static int
5988 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5989 {
5990 	struct hn_softc *sc = ifp->if_softc;
5991 	struct hn_tx_ring *txr;
5992 	int error, idx = 0;
5993 
5994 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5995 		struct rm_priotracker pt;
5996 
5997 		rm_rlock(&sc->hn_vf_lock, &pt);
5998 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5999 			struct mbuf *m_bpf = NULL;
6000 			int obytes, omcast;
6001 
6002 			obytes = m->m_pkthdr.len;
6003 			omcast = (m->m_flags & M_MCAST) != 0;
6004 
6005 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6006 				if (bpf_peers_present(ifp->if_bpf)) {
6007 					m_bpf = m_copypacket(m, M_NOWAIT);
6008 					if (m_bpf == NULL) {
6009 						/*
6010 						 * Failed to grab a shallow
6011 						 * copy; tap now.
6012 						 */
6013 						ETHER_BPF_MTAP(ifp, m);
6014 					}
6015 				}
6016 			} else {
6017 				ETHER_BPF_MTAP(ifp, m);
6018 			}
6019 
6020 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6021 			rm_runlock(&sc->hn_vf_lock, &pt);
6022 
6023 			if (m_bpf != NULL) {
6024 				if (!error)
6025 					ETHER_BPF_MTAP(ifp, m_bpf);
6026 				m_freem(m_bpf);
6027 			}
6028 
6029 			if (error == ENOBUFS) {
6030 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6031 			} else if (error) {
6032 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6033 			} else {
6034 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6035 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6036 				if (omcast) {
6037 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6038 					    omcast);
6039 				}
6040 			}
6041 			return (error);
6042 		}
6043 		rm_runlock(&sc->hn_vf_lock, &pt);
6044 	}
6045 
6046 #if defined(INET6) || defined(INET)
6047 	/*
6048 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6049 	 * since packet headers should be cache-hot.
6050 	 */
6051 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6052 		m = hn_tso_fixup(m);
6053 		if (__predict_false(m == NULL)) {
6054 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6055 			return EIO;
6056 		}
6057 	} else if (m->m_pkthdr.csum_flags &
6058 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6059 		m = hn_set_hlen(m);
6060 		if (__predict_false(m == NULL)) {
6061 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6062 			return EIO;
6063 		}
6064 	}
6065 #endif
6066 
6067 	/*
6068 	 * Select the TX ring based on flowid
6069 	 */
6070 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6071 #ifdef RSS
6072 		uint32_t bid;
6073 
6074 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6075 		    &bid) == 0)
6076 			idx = bid % sc->hn_tx_ring_inuse;
6077 		else
6078 #endif
6079 		{
6080 #if defined(INET6) || defined(INET)
6081 			int tcpsyn = 0;
6082 
6083 			if (m->m_pkthdr.len < 128 &&
6084 			    (m->m_pkthdr.csum_flags &
6085 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6086 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6087 				m = hn_check_tcpsyn(m, &tcpsyn);
6088 				if (__predict_false(m == NULL)) {
6089 					if_inc_counter(ifp,
6090 					    IFCOUNTER_OERRORS, 1);
6091 					return (EIO);
6092 				}
6093 			}
6094 #else
6095 			const int tcpsyn = 0;
6096 #endif
6097 			if (tcpsyn)
6098 				idx = 0;
6099 			else
6100 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6101 		}
6102 	}
6103 	txr = &sc->hn_tx_ring[idx];
6104 
6105 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6106 	if (error) {
6107 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6108 		return error;
6109 	}
6110 
6111 	if (txr->hn_oactive)
6112 		return 0;
6113 
6114 	if (txr->hn_sched_tx)
6115 		goto do_sched;
6116 
6117 	if (mtx_trylock(&txr->hn_tx_lock)) {
6118 		int sched;
6119 
6120 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6121 		mtx_unlock(&txr->hn_tx_lock);
6122 		if (!sched)
6123 			return 0;
6124 	}
6125 do_sched:
6126 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6127 	return 0;
6128 }
6129 
6130 static void
6131 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6132 {
6133 	struct mbuf *m;
6134 
6135 	mtx_lock(&txr->hn_tx_lock);
6136 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6137 		m_freem(m);
6138 	mtx_unlock(&txr->hn_tx_lock);
6139 }
6140 
6141 static void
6142 hn_xmit_qflush(struct ifnet *ifp)
6143 {
6144 	struct hn_softc *sc = ifp->if_softc;
6145 	struct rm_priotracker pt;
6146 	int i;
6147 
6148 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6149 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6150 	if_qflush(ifp);
6151 
6152 	rm_rlock(&sc->hn_vf_lock, &pt);
6153 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6154 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6155 	rm_runlock(&sc->hn_vf_lock, &pt);
6156 }
6157 
6158 static void
6159 hn_xmit_txeof(struct hn_tx_ring *txr)
6160 {
6161 
6162 	if (txr->hn_sched_tx)
6163 		goto do_sched;
6164 
6165 	if (mtx_trylock(&txr->hn_tx_lock)) {
6166 		int sched;
6167 
6168 		txr->hn_oactive = 0;
6169 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6170 		mtx_unlock(&txr->hn_tx_lock);
6171 		if (sched) {
6172 			taskqueue_enqueue(txr->hn_tx_taskq,
6173 			    &txr->hn_tx_task);
6174 		}
6175 	} else {
6176 do_sched:
6177 		/*
6178 		 * Release the oactive earlier, with the hope, that
6179 		 * others could catch up.  The task will clear the
6180 		 * oactive again with the hn_tx_lock to avoid possible
6181 		 * races.
6182 		 */
6183 		txr->hn_oactive = 0;
6184 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6185 	}
6186 }
6187 
6188 static void
6189 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6190 {
6191 	struct hn_tx_ring *txr = xtxr;
6192 
6193 	mtx_lock(&txr->hn_tx_lock);
6194 	hn_xmit(txr, 0);
6195 	mtx_unlock(&txr->hn_tx_lock);
6196 }
6197 
6198 static void
6199 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6200 {
6201 	struct hn_tx_ring *txr = xtxr;
6202 
6203 	mtx_lock(&txr->hn_tx_lock);
6204 	txr->hn_oactive = 0;
6205 	hn_xmit(txr, 0);
6206 	mtx_unlock(&txr->hn_tx_lock);
6207 }
6208 
6209 static int
6210 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6211 {
6212 	struct vmbus_chan_br cbr;
6213 	struct hn_rx_ring *rxr;
6214 	struct hn_tx_ring *txr = NULL;
6215 	int idx, error;
6216 
6217 	idx = vmbus_chan_subidx(chan);
6218 
6219 	/*
6220 	 * Link this channel to RX/TX ring.
6221 	 */
6222 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6223 	    ("invalid channel index %d, should > 0 && < %d",
6224 	     idx, sc->hn_rx_ring_inuse));
6225 	rxr = &sc->hn_rx_ring[idx];
6226 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6227 	    ("RX ring %d already attached", idx));
6228 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6229 	rxr->hn_chan = chan;
6230 
6231 	if (bootverbose) {
6232 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6233 		    idx, vmbus_chan_id(chan));
6234 	}
6235 
6236 	if (idx < sc->hn_tx_ring_inuse) {
6237 		txr = &sc->hn_tx_ring[idx];
6238 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6239 		    ("TX ring %d already attached", idx));
6240 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6241 
6242 		txr->hn_chan = chan;
6243 		if (bootverbose) {
6244 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6245 			    idx, vmbus_chan_id(chan));
6246 		}
6247 	}
6248 
6249 	/* Bind this channel to a proper CPU. */
6250 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6251 
6252 	/*
6253 	 * Open this channel
6254 	 */
6255 	cbr.cbr = rxr->hn_br;
6256 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6257 	cbr.cbr_txsz = HN_TXBR_SIZE;
6258 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6259 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6260 	if (error) {
6261 		if (error == EISCONN) {
6262 			if_printf(sc->hn_ifp, "bufring is connected after "
6263 			    "chan%u open failure\n", vmbus_chan_id(chan));
6264 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6265 		} else {
6266 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6267 			    vmbus_chan_id(chan), error);
6268 		}
6269 	}
6270 	return (error);
6271 }
6272 
6273 static void
6274 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6275 {
6276 	struct hn_rx_ring *rxr;
6277 	int idx, error;
6278 
6279 	idx = vmbus_chan_subidx(chan);
6280 
6281 	/*
6282 	 * Link this channel to RX/TX ring.
6283 	 */
6284 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6285 	    ("invalid channel index %d, should > 0 && < %d",
6286 	     idx, sc->hn_rx_ring_inuse));
6287 	rxr = &sc->hn_rx_ring[idx];
6288 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6289 	    ("RX ring %d is not attached", idx));
6290 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6291 
6292 	if (idx < sc->hn_tx_ring_inuse) {
6293 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6294 
6295 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6296 		    ("TX ring %d is not attached attached", idx));
6297 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6298 	}
6299 
6300 	/*
6301 	 * Close this channel.
6302 	 *
6303 	 * NOTE:
6304 	 * Channel closing does _not_ destroy the target channel.
6305 	 */
6306 	error = vmbus_chan_close_direct(chan);
6307 	if (error == EISCONN) {
6308 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6309 		    "after being closed\n", vmbus_chan_id(chan));
6310 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6311 	} else if (error) {
6312 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6313 		    vmbus_chan_id(chan), error);
6314 	}
6315 }
6316 
6317 static int
6318 hn_attach_subchans(struct hn_softc *sc)
6319 {
6320 	struct vmbus_channel **subchans;
6321 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6322 	int i, error = 0;
6323 
6324 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6325 
6326 	/* Attach the sub-channels. */
6327 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6328 	for (i = 0; i < subchan_cnt; ++i) {
6329 		int error1;
6330 
6331 		error1 = hn_chan_attach(sc, subchans[i]);
6332 		if (error1) {
6333 			error = error1;
6334 			/* Move on; all channels will be detached later. */
6335 		}
6336 	}
6337 	vmbus_subchan_rel(subchans, subchan_cnt);
6338 
6339 	if (error) {
6340 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6341 	} else {
6342 		if (bootverbose) {
6343 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6344 			    subchan_cnt);
6345 		}
6346 	}
6347 	return (error);
6348 }
6349 
6350 static void
6351 hn_detach_allchans(struct hn_softc *sc)
6352 {
6353 	struct vmbus_channel **subchans;
6354 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6355 	int i;
6356 
6357 	if (subchan_cnt == 0)
6358 		goto back;
6359 
6360 	/* Detach the sub-channels. */
6361 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6362 	for (i = 0; i < subchan_cnt; ++i)
6363 		hn_chan_detach(sc, subchans[i]);
6364 	vmbus_subchan_rel(subchans, subchan_cnt);
6365 
6366 back:
6367 	/*
6368 	 * Detach the primary channel, _after_ all sub-channels
6369 	 * are detached.
6370 	 */
6371 	hn_chan_detach(sc, sc->hn_prichan);
6372 
6373 	/* Wait for sub-channels to be destroyed, if any. */
6374 	vmbus_subchan_drain(sc->hn_prichan);
6375 
6376 #ifdef INVARIANTS
6377 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6378 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6379 		    HN_RX_FLAG_ATTACHED) == 0,
6380 		    ("%dth RX ring is still attached", i));
6381 	}
6382 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6383 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6384 		    HN_TX_FLAG_ATTACHED) == 0,
6385 		    ("%dth TX ring is still attached", i));
6386 	}
6387 #endif
6388 }
6389 
6390 static int
6391 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6392 {
6393 	struct vmbus_channel **subchans;
6394 	int nchan, rxr_cnt, error;
6395 
6396 	nchan = *nsubch + 1;
6397 	if (nchan == 1) {
6398 		/*
6399 		 * Multiple RX/TX rings are not requested.
6400 		 */
6401 		*nsubch = 0;
6402 		return (0);
6403 	}
6404 
6405 	/*
6406 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6407 	 * table entries.
6408 	 */
6409 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6410 	if (error) {
6411 		/* No RSS; this is benign. */
6412 		*nsubch = 0;
6413 		return (0);
6414 	}
6415 	if (bootverbose) {
6416 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6417 		    rxr_cnt, nchan);
6418 	}
6419 
6420 	if (nchan > rxr_cnt)
6421 		nchan = rxr_cnt;
6422 	if (nchan == 1) {
6423 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6424 		*nsubch = 0;
6425 		return (0);
6426 	}
6427 
6428 	/*
6429 	 * Allocate sub-channels from NVS.
6430 	 */
6431 	*nsubch = nchan - 1;
6432 	error = hn_nvs_alloc_subchans(sc, nsubch);
6433 	if (error || *nsubch == 0) {
6434 		/* Failed to allocate sub-channels. */
6435 		*nsubch = 0;
6436 		return (0);
6437 	}
6438 
6439 	/*
6440 	 * Wait for all sub-channels to become ready before moving on.
6441 	 */
6442 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6443 	vmbus_subchan_rel(subchans, *nsubch);
6444 	return (0);
6445 }
6446 
6447 static bool
6448 hn_synth_attachable(const struct hn_softc *sc)
6449 {
6450 	int i;
6451 
6452 	if (sc->hn_flags & HN_FLAG_ERRORS)
6453 		return (false);
6454 
6455 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6456 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6457 
6458 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6459 			return (false);
6460 	}
6461 	return (true);
6462 }
6463 
6464 /*
6465  * Make sure that the RX filter is zero after the successful
6466  * RNDIS initialization.
6467  *
6468  * NOTE:
6469  * Under certain conditions on certain versions of Hyper-V,
6470  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6471  * after the successful RNDIS initialization, which breaks
6472  * the assumption of any following code (well, it breaks the
6473  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6474  * explicitly, drain packets sneaking through, and drain the
6475  * interrupt taskqueues scheduled due to the stealth packets.
6476  */
6477 static void
6478 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6479 {
6480 
6481 	hn_disable_rx(sc);
6482 	hn_drain_rxtx(sc, nchan);
6483 }
6484 
6485 static int
6486 hn_synth_attach(struct hn_softc *sc, int mtu)
6487 {
6488 #define ATTACHED_NVS		0x0002
6489 #define ATTACHED_RNDIS		0x0004
6490 
6491 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6492 	int error, nsubch, nchan = 1, i, rndis_inited;
6493 	uint32_t old_caps, attached = 0;
6494 
6495 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6496 	    ("synthetic parts were attached"));
6497 
6498 	if (!hn_synth_attachable(sc))
6499 		return (ENXIO);
6500 
6501 	/* Save capabilities for later verification. */
6502 	old_caps = sc->hn_caps;
6503 	sc->hn_caps = 0;
6504 
6505 	/* Clear RSS stuffs. */
6506 	sc->hn_rss_ind_size = 0;
6507 	sc->hn_rss_hash = 0;
6508 	sc->hn_rss_hcap = 0;
6509 
6510 	/*
6511 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6512 	 */
6513 	error = hn_chan_attach(sc, sc->hn_prichan);
6514 	if (error)
6515 		goto failed;
6516 
6517 	/*
6518 	 * Attach NVS.
6519 	 */
6520 	error = hn_nvs_attach(sc, mtu);
6521 	if (error)
6522 		goto failed;
6523 	attached |= ATTACHED_NVS;
6524 
6525 	/*
6526 	 * Attach RNDIS _after_ NVS is attached.
6527 	 */
6528 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6529 	if (rndis_inited)
6530 		attached |= ATTACHED_RNDIS;
6531 	if (error)
6532 		goto failed;
6533 
6534 	/*
6535 	 * Make sure capabilities are not changed.
6536 	 */
6537 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6538 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6539 		    old_caps, sc->hn_caps);
6540 		error = ENXIO;
6541 		goto failed;
6542 	}
6543 
6544 	/*
6545 	 * Allocate sub-channels for multi-TX/RX rings.
6546 	 *
6547 	 * NOTE:
6548 	 * The # of RX rings that can be used is equivalent to the # of
6549 	 * channels to be requested.
6550 	 */
6551 	nsubch = sc->hn_rx_ring_cnt - 1;
6552 	error = hn_synth_alloc_subchans(sc, &nsubch);
6553 	if (error)
6554 		goto failed;
6555 	/* NOTE: _Full_ synthetic parts detach is required now. */
6556 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6557 
6558 	/*
6559 	 * Set the # of TX/RX rings that could be used according to
6560 	 * the # of channels that NVS offered.
6561 	 */
6562 	nchan = nsubch + 1;
6563 	hn_set_ring_inuse(sc, nchan);
6564 	if (nchan == 1) {
6565 		/* Only the primary channel can be used; done */
6566 		goto back;
6567 	}
6568 
6569 	/*
6570 	 * Attach the sub-channels.
6571 	 *
6572 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6573 	 */
6574 	error = hn_attach_subchans(sc);
6575 	if (error)
6576 		goto failed;
6577 
6578 	/*
6579 	 * Configure RSS key and indirect table _after_ all sub-channels
6580 	 * are attached.
6581 	 */
6582 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6583 		/*
6584 		 * RSS key is not set yet; set it to the default RSS key.
6585 		 */
6586 		if (bootverbose)
6587 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6588 #ifdef RSS
6589 		rss_getkey(rss->rss_key);
6590 #else
6591 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6592 #endif
6593 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6594 	}
6595 
6596 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6597 		/*
6598 		 * RSS indirect table is not set yet; set it up in round-
6599 		 * robin fashion.
6600 		 */
6601 		if (bootverbose) {
6602 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6603 			    "table\n");
6604 		}
6605 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6606 			uint32_t subidx;
6607 
6608 #ifdef RSS
6609 			subidx = rss_get_indirection_to_bucket(i);
6610 #else
6611 			subidx = i;
6612 #endif
6613 			rss->rss_ind[i] = subidx % nchan;
6614 		}
6615 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6616 	} else {
6617 		/*
6618 		 * # of usable channels may be changed, so we have to
6619 		 * make sure that all entries in RSS indirect table
6620 		 * are valid.
6621 		 *
6622 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6623 		 */
6624 		hn_rss_ind_fixup(sc);
6625 	}
6626 
6627 	sc->hn_rss_hash = sc->hn_rss_hcap;
6628 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6629 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6630 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6631 		hn_vf_rss_fixup(sc, false);
6632 	}
6633 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6634 	if (error)
6635 		goto failed;
6636 back:
6637 	/*
6638 	 * Fixup transmission aggregation setup.
6639 	 */
6640 	hn_set_txagg(sc);
6641 	hn_rndis_init_fixat(sc, nchan);
6642 	return (0);
6643 
6644 failed:
6645 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6646 		hn_rndis_init_fixat(sc, nchan);
6647 		hn_synth_detach(sc);
6648 	} else {
6649 		if (attached & ATTACHED_RNDIS) {
6650 			hn_rndis_init_fixat(sc, nchan);
6651 			hn_rndis_detach(sc);
6652 		}
6653 		if (attached & ATTACHED_NVS)
6654 			hn_nvs_detach(sc);
6655 		hn_chan_detach(sc, sc->hn_prichan);
6656 		/* Restore old capabilities. */
6657 		sc->hn_caps = old_caps;
6658 	}
6659 	return (error);
6660 
6661 #undef ATTACHED_RNDIS
6662 #undef ATTACHED_NVS
6663 }
6664 
6665 /*
6666  * NOTE:
6667  * The interface must have been suspended though hn_suspend(), before
6668  * this function get called.
6669  */
6670 static void
6671 hn_synth_detach(struct hn_softc *sc)
6672 {
6673 
6674 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6675 	    ("synthetic parts were not attached"));
6676 
6677 	/* Detach the RNDIS first. */
6678 	hn_rndis_detach(sc);
6679 
6680 	/* Detach NVS. */
6681 	hn_nvs_detach(sc);
6682 
6683 	/* Detach all of the channels. */
6684 	hn_detach_allchans(sc);
6685 
6686 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6687 		/*
6688 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6689 		 */
6690 		int error;
6691 
6692 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6693 		    sc->hn_rxbuf_gpadl);
6694 		if (error) {
6695 			if_printf(sc->hn_ifp,
6696 			    "rxbuf gpadl disconn failed: %d\n", error);
6697 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6698 		}
6699 		sc->hn_rxbuf_gpadl = 0;
6700 	}
6701 
6702 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6703 		/*
6704 		 * Host is post-Win2016, disconnect chimney sending buffer from
6705 		 * primary channel here.
6706 		 */
6707 		int error;
6708 
6709 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6710 		    sc->hn_chim_gpadl);
6711 		if (error) {
6712 			if_printf(sc->hn_ifp,
6713 			    "chim gpadl disconn failed: %d\n", error);
6714 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6715 		}
6716 		sc->hn_chim_gpadl = 0;
6717 	}
6718 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6719 }
6720 
6721 static void
6722 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6723 {
6724 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6725 	    ("invalid ring count %d", ring_cnt));
6726 
6727 	if (sc->hn_tx_ring_cnt > ring_cnt)
6728 		sc->hn_tx_ring_inuse = ring_cnt;
6729 	else
6730 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6731 	sc->hn_rx_ring_inuse = ring_cnt;
6732 
6733 #ifdef RSS
6734 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6735 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6736 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6737 		    rss_getnumbuckets());
6738 	}
6739 #endif
6740 
6741 	if (bootverbose) {
6742 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6743 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6744 	}
6745 }
6746 
6747 static void
6748 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6749 {
6750 
6751 	/*
6752 	 * NOTE:
6753 	 * The TX bufring will not be drained by the hypervisor,
6754 	 * if the primary channel is revoked.
6755 	 */
6756 	while (!vmbus_chan_rx_empty(chan) ||
6757 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6758 	     !vmbus_chan_tx_empty(chan)))
6759 		pause("waitch", 1);
6760 	vmbus_chan_intr_drain(chan);
6761 }
6762 
6763 static void
6764 hn_disable_rx(struct hn_softc *sc)
6765 {
6766 
6767 	/*
6768 	 * Disable RX by clearing RX filter forcefully.
6769 	 */
6770 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6771 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6772 
6773 	/*
6774 	 * Give RNDIS enough time to flush all pending data packets.
6775 	 */
6776 	pause("waitrx", (200 * hz) / 1000);
6777 }
6778 
6779 /*
6780  * NOTE:
6781  * RX/TX _must_ have been suspended/disabled, before this function
6782  * is called.
6783  */
6784 static void
6785 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6786 {
6787 	struct vmbus_channel **subch = NULL;
6788 	int nsubch;
6789 
6790 	/*
6791 	 * Drain RX/TX bufrings and interrupts.
6792 	 */
6793 	nsubch = nchan - 1;
6794 	if (nsubch > 0)
6795 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6796 
6797 	if (subch != NULL) {
6798 		int i;
6799 
6800 		for (i = 0; i < nsubch; ++i)
6801 			hn_chan_drain(sc, subch[i]);
6802 	}
6803 	hn_chan_drain(sc, sc->hn_prichan);
6804 
6805 	if (subch != NULL)
6806 		vmbus_subchan_rel(subch, nsubch);
6807 }
6808 
6809 static void
6810 hn_suspend_data(struct hn_softc *sc)
6811 {
6812 	struct hn_tx_ring *txr;
6813 	int i;
6814 
6815 	HN_LOCK_ASSERT(sc);
6816 
6817 	/*
6818 	 * Suspend TX.
6819 	 */
6820 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6821 		txr = &sc->hn_tx_ring[i];
6822 
6823 		mtx_lock(&txr->hn_tx_lock);
6824 		txr->hn_suspended = 1;
6825 		mtx_unlock(&txr->hn_tx_lock);
6826 		/* No one is able send more packets now. */
6827 
6828 		/*
6829 		 * Wait for all pending sends to finish.
6830 		 *
6831 		 * NOTE:
6832 		 * We will _not_ receive all pending send-done, if the
6833 		 * primary channel is revoked.
6834 		 */
6835 		while (hn_tx_ring_pending(txr) &&
6836 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6837 			pause("hnwtx", 1 /* 1 tick */);
6838 	}
6839 
6840 	/*
6841 	 * Disable RX.
6842 	 */
6843 	hn_disable_rx(sc);
6844 
6845 	/*
6846 	 * Drain RX/TX.
6847 	 */
6848 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6849 
6850 	/*
6851 	 * Drain any pending TX tasks.
6852 	 *
6853 	 * NOTE:
6854 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6855 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6856 	 */
6857 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6858 		txr = &sc->hn_tx_ring[i];
6859 
6860 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6861 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6862 	}
6863 }
6864 
6865 static void
6866 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6867 {
6868 
6869 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6870 }
6871 
6872 static void
6873 hn_suspend_mgmt(struct hn_softc *sc)
6874 {
6875 	struct task task;
6876 
6877 	HN_LOCK_ASSERT(sc);
6878 
6879 	/*
6880 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6881 	 * through hn_mgmt_taskq.
6882 	 */
6883 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6884 	vmbus_chan_run_task(sc->hn_prichan, &task);
6885 
6886 	/*
6887 	 * Make sure that all pending management tasks are completed.
6888 	 */
6889 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6890 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6891 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6892 }
6893 
6894 static void
6895 hn_suspend(struct hn_softc *sc)
6896 {
6897 
6898 	/* Disable polling. */
6899 	hn_polling(sc, 0);
6900 
6901 	/*
6902 	 * If the non-transparent mode VF is activated, the synthetic
6903 	 * device is receiving packets, so the data path of the
6904 	 * synthetic device must be suspended.
6905 	 */
6906 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6907 	    (sc->hn_flags & HN_FLAG_RXVF))
6908 		hn_suspend_data(sc);
6909 	hn_suspend_mgmt(sc);
6910 }
6911 
6912 static void
6913 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6914 {
6915 	int i;
6916 
6917 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6918 	    ("invalid TX ring count %d", tx_ring_cnt));
6919 
6920 	for (i = 0; i < tx_ring_cnt; ++i) {
6921 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6922 
6923 		mtx_lock(&txr->hn_tx_lock);
6924 		txr->hn_suspended = 0;
6925 		mtx_unlock(&txr->hn_tx_lock);
6926 	}
6927 }
6928 
6929 static void
6930 hn_resume_data(struct hn_softc *sc)
6931 {
6932 	int i;
6933 
6934 	HN_LOCK_ASSERT(sc);
6935 
6936 	/*
6937 	 * Re-enable RX.
6938 	 */
6939 	hn_rxfilter_config(sc);
6940 
6941 	/*
6942 	 * Make sure to clear suspend status on "all" TX rings,
6943 	 * since hn_tx_ring_inuse can be changed after
6944 	 * hn_suspend_data().
6945 	 */
6946 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6947 
6948 #ifdef HN_IFSTART_SUPPORT
6949 	if (!hn_use_if_start)
6950 #endif
6951 	{
6952 		/*
6953 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6954 		 * reduced.
6955 		 */
6956 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6957 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6958 	}
6959 
6960 	/*
6961 	 * Kick start TX.
6962 	 */
6963 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6964 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6965 
6966 		/*
6967 		 * Use txeof task, so that any pending oactive can be
6968 		 * cleared properly.
6969 		 */
6970 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6971 	}
6972 }
6973 
6974 static void
6975 hn_resume_mgmt(struct hn_softc *sc)
6976 {
6977 
6978 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6979 
6980 	/*
6981 	 * Kick off network change detection, if it was pending.
6982 	 * If no network change was pending, start link status
6983 	 * checks, which is more lightweight than network change
6984 	 * detection.
6985 	 */
6986 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6987 		hn_change_network(sc);
6988 	else
6989 		hn_update_link_status(sc);
6990 }
6991 
6992 static void
6993 hn_resume(struct hn_softc *sc)
6994 {
6995 
6996 	/*
6997 	 * If the non-transparent mode VF is activated, the synthetic
6998 	 * device have to receive packets, so the data path of the
6999 	 * synthetic device must be resumed.
7000 	 */
7001 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
7002 	    (sc->hn_flags & HN_FLAG_RXVF))
7003 		hn_resume_data(sc);
7004 
7005 	/*
7006 	 * Don't resume link status change if VF is attached/activated.
7007 	 * - In the non-transparent VF mode, the synthetic device marks
7008 	 *   link down until the VF is deactivated; i.e. VF is down.
7009 	 * - In transparent VF mode, VF's media status is used until
7010 	 *   the VF is detached.
7011 	 */
7012 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7013 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7014 		hn_resume_mgmt(sc);
7015 
7016 	/*
7017 	 * Re-enable polling if this interface is running and
7018 	 * the polling is requested.
7019 	 */
7020 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7021 		hn_polling(sc, sc->hn_pollhz);
7022 }
7023 
7024 static void
7025 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7026 {
7027 	const struct rndis_status_msg *msg;
7028 	int ofs;
7029 
7030 	if (dlen < sizeof(*msg)) {
7031 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7032 		return;
7033 	}
7034 	msg = data;
7035 
7036 	switch (msg->rm_status) {
7037 	case RNDIS_STATUS_MEDIA_CONNECT:
7038 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7039 		hn_update_link_status(sc);
7040 		break;
7041 
7042 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7043 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7044 		/* Not really useful; ignore. */
7045 		break;
7046 
7047 	case RNDIS_STATUS_NETWORK_CHANGE:
7048 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7049 		if (dlen < ofs + msg->rm_stbuflen ||
7050 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7051 			if_printf(sc->hn_ifp, "network changed\n");
7052 		} else {
7053 			uint32_t change;
7054 
7055 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7056 			    sizeof(change));
7057 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7058 			    change);
7059 		}
7060 		hn_change_network(sc);
7061 		break;
7062 
7063 	default:
7064 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7065 		    msg->rm_status);
7066 		break;
7067 	}
7068 }
7069 
7070 static int
7071 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7072 {
7073 	const struct rndis_pktinfo *pi = info_data;
7074 	uint32_t mask = 0;
7075 
7076 	while (info_dlen != 0) {
7077 		const void *data;
7078 		uint32_t dlen;
7079 
7080 		if (__predict_false(info_dlen < sizeof(*pi)))
7081 			return (EINVAL);
7082 		if (__predict_false(info_dlen < pi->rm_size))
7083 			return (EINVAL);
7084 		info_dlen -= pi->rm_size;
7085 
7086 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7087 			return (EINVAL);
7088 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7089 			return (EINVAL);
7090 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7091 		data = pi->rm_data;
7092 
7093 		if (pi->rm_internal == 1) {
7094 			switch (pi->rm_type) {
7095 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7096 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7097 					return (EINVAL);
7098 				info->pktinfo_id =
7099 				    (const struct packet_info_id *)data;
7100 				mask |= HN_RXINFO_PKTINFO_ID;
7101 				break;
7102 
7103 			default:
7104 				goto next;
7105 			}
7106 		} else {
7107 			switch (pi->rm_type) {
7108 			case NDIS_PKTINFO_TYPE_VLAN:
7109 				if (__predict_false(dlen
7110 				    < NDIS_VLAN_INFO_SIZE))
7111 					return (EINVAL);
7112 				info->vlan_info = (const uint32_t *)data;
7113 				mask |= HN_RXINFO_VLAN;
7114 				break;
7115 
7116 			case NDIS_PKTINFO_TYPE_CSUM:
7117 				if (__predict_false(dlen
7118 				    < NDIS_RXCSUM_INFO_SIZE))
7119 					return (EINVAL);
7120 				info->csum_info = (const uint32_t *)data;
7121 				mask |= HN_RXINFO_CSUM;
7122 				break;
7123 
7124 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7125 				if (__predict_false(dlen
7126 				    < HN_NDIS_HASH_VALUE_SIZE))
7127 					return (EINVAL);
7128 				info->hash_value = (const uint32_t *)data;
7129 				mask |= HN_RXINFO_HASHVAL;
7130 				break;
7131 
7132 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7133 				if (__predict_false(dlen
7134 				    < HN_NDIS_HASH_INFO_SIZE))
7135 					return (EINVAL);
7136 				info->hash_info = (const uint32_t *)data;
7137 				mask |= HN_RXINFO_HASHINF;
7138 				break;
7139 
7140 			default:
7141 				goto next;
7142 			}
7143 		}
7144 
7145 		if (mask == HN_RXINFO_ALL) {
7146 			/* All found; done */
7147 			break;
7148 		}
7149 next:
7150 		pi = (const struct rndis_pktinfo *)
7151 		    ((const uint8_t *)pi + pi->rm_size);
7152 	}
7153 
7154 	/*
7155 	 * Final fixup.
7156 	 * - If there is no hash value, invalidate the hash info.
7157 	 */
7158 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7159 		info->hash_info = NULL;
7160 	return (0);
7161 }
7162 
7163 static __inline bool
7164 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7165 {
7166 
7167 	if (off < check_off) {
7168 		if (__predict_true(off + len <= check_off))
7169 			return (false);
7170 	} else if (off > check_off) {
7171 		if (__predict_true(check_off + check_len <= off))
7172 			return (false);
7173 	}
7174 	return (true);
7175 }
7176 
7177 static __inline void
7178 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7179 		uint32_t len, struct hn_rxinfo *info)
7180 {
7181 	uint32_t cnt = rxr->rsc.cnt;
7182 
7183 	if (cnt) {
7184 		rxr->rsc.pktlen += len;
7185 	} else {
7186 		rxr->rsc.vlan_info = info->vlan_info;
7187 		rxr->rsc.csum_info = info->csum_info;
7188 		rxr->rsc.hash_info = info->hash_info;
7189 		rxr->rsc.hash_value = info->hash_value;
7190 		rxr->rsc.pktlen = len;
7191 	}
7192 
7193 	rxr->rsc.frag_data[cnt] = data;
7194 	rxr->rsc.frag_len[cnt] = len;
7195 	rxr->rsc.cnt++;
7196 }
7197 
7198 static void
7199 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7200 {
7201 	const struct rndis_packet_msg *pkt;
7202 	struct hn_rxinfo info;
7203 	int data_off, pktinfo_off, data_len, pktinfo_len;
7204 	bool rsc_more= false;
7205 
7206 	/*
7207 	 * Check length.
7208 	 */
7209 	if (__predict_false(dlen < sizeof(*pkt))) {
7210 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7211 		return;
7212 	}
7213 	pkt = data;
7214 
7215 	if (__predict_false(dlen < pkt->rm_len)) {
7216 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7217 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7218 		return;
7219 	}
7220 	if (__predict_false(pkt->rm_len <
7221 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7222 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7223 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7224 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7225 		    pkt->rm_pktinfolen);
7226 		return;
7227 	}
7228 	if (__predict_false(pkt->rm_datalen == 0)) {
7229 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7230 		return;
7231 	}
7232 
7233 	/*
7234 	 * Check offests.
7235 	 */
7236 #define IS_OFFSET_INVALID(ofs)			\
7237 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7238 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7239 
7240 	/* XXX Hyper-V does not meet data offset alignment requirement */
7241 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7242 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7243 		    "data offset %u\n", pkt->rm_dataoffset);
7244 		return;
7245 	}
7246 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7247 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7248 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7249 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7250 		return;
7251 	}
7252 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7253 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7254 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7255 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7256 		return;
7257 	}
7258 
7259 #undef IS_OFFSET_INVALID
7260 
7261 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7262 	data_len = pkt->rm_datalen;
7263 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7264 	pktinfo_len = pkt->rm_pktinfolen;
7265 
7266 	/*
7267 	 * Check OOB coverage.
7268 	 */
7269 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7270 		int oob_off, oob_len;
7271 
7272 		if_printf(rxr->hn_ifp, "got oobdata\n");
7273 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7274 		oob_len = pkt->rm_oobdatalen;
7275 
7276 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7277 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7278 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7279 			    pkt->rm_len, oob_off, oob_len);
7280 			return;
7281 		}
7282 
7283 		/*
7284 		 * Check against data.
7285 		 */
7286 		if (hn_rndis_check_overlap(oob_off, oob_len,
7287 		    data_off, data_len)) {
7288 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7289 			    "oob overlaps data, oob abs %d len %d, "
7290 			    "data abs %d len %d\n",
7291 			    oob_off, oob_len, data_off, data_len);
7292 			return;
7293 		}
7294 
7295 		/*
7296 		 * Check against pktinfo.
7297 		 */
7298 		if (pktinfo_len != 0 &&
7299 		    hn_rndis_check_overlap(oob_off, oob_len,
7300 		    pktinfo_off, pktinfo_len)) {
7301 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7302 			    "oob overlaps pktinfo, oob abs %d len %d, "
7303 			    "pktinfo abs %d len %d\n",
7304 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7305 			return;
7306 		}
7307 	}
7308 
7309 	/*
7310 	 * Check per-packet-info coverage and find useful per-packet-info.
7311 	 */
7312 	info.vlan_info = NULL;
7313 	info.csum_info = NULL;
7314 	info.hash_info = NULL;
7315 	info.pktinfo_id = NULL;
7316 
7317 	if (__predict_true(pktinfo_len != 0)) {
7318 		bool overlap;
7319 		int error;
7320 
7321 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7322 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7323 			    "pktinfo overflow, msglen %u, "
7324 			    "pktinfo abs %d len %d\n",
7325 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7326 			return;
7327 		}
7328 
7329 		/*
7330 		 * Check packet info coverage.
7331 		 */
7332 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7333 		    data_off, data_len);
7334 		if (__predict_false(overlap)) {
7335 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7336 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7337 			    "data abs %d len %d\n",
7338 			    pktinfo_off, pktinfo_len, data_off, data_len);
7339 			return;
7340 		}
7341 
7342 		/*
7343 		 * Find useful per-packet-info.
7344 		 */
7345 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7346 		    pktinfo_len, &info);
7347 		if (__predict_false(error)) {
7348 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7349 			    "pktinfo\n");
7350 			return;
7351 		}
7352 	}
7353 
7354 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7355 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7356 		    "data overflow, msglen %u, data abs %d len %d\n",
7357 		    pkt->rm_len, data_off, data_len);
7358 		return;
7359 	}
7360 
7361 	/* Identify RSC fragments, drop invalid packets */
7362 	if ((info.pktinfo_id != NULL) &&
7363 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7364 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7365 			rxr->rsc.cnt = 0;
7366 			rxr->hn_rsc_pkts++;
7367 		} else if (rxr->rsc.cnt == 0)
7368 			goto drop;
7369 
7370 		rsc_more = true;
7371 
7372 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7373 			rsc_more = false;
7374 
7375 		if (rsc_more && rxr->rsc.is_last)
7376 			goto drop;
7377 	} else {
7378 		rxr->rsc.cnt = 0;
7379 	}
7380 
7381 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7382 		goto drop;
7383 
7384 	/* Store data in per rx ring structure */
7385 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7386 	    data_len, &info);
7387 
7388 	if (rsc_more)
7389 		return;
7390 
7391 	hn_rxpkt(rxr);
7392 	rxr->rsc.cnt = 0;
7393 	return;
7394 drop:
7395 	rxr->hn_rsc_drop++;
7396 	return;
7397 }
7398 
7399 static __inline void
7400 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7401 {
7402 	const struct rndis_msghdr *hdr;
7403 
7404 	if (__predict_false(dlen < sizeof(*hdr))) {
7405 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7406 		return;
7407 	}
7408 	hdr = data;
7409 
7410 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7411 		/* Hot data path. */
7412 		hn_rndis_rx_data(rxr, data, dlen);
7413 		/* Done! */
7414 		return;
7415 	}
7416 
7417 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7418 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7419 	else
7420 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7421 }
7422 
7423 static void
7424 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7425 {
7426 	const struct hn_nvs_hdr *hdr;
7427 
7428 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7429 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7430 		return;
7431 	}
7432 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7433 
7434 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7435 		/* Useless; ignore */
7436 		return;
7437 	}
7438 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7439 }
7440 
7441 static void
7442 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7443     const struct vmbus_chanpkt_hdr *pkt)
7444 {
7445 	struct hn_nvs_sendctx *sndc;
7446 
7447 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7448 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7449 	    VMBUS_CHANPKT_DATALEN(pkt));
7450 	/*
7451 	 * NOTE:
7452 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7453 	 * its callback.
7454 	 */
7455 }
7456 
7457 static void
7458 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7459     const struct vmbus_chanpkt_hdr *pkthdr)
7460 {
7461 	const struct vmbus_chanpkt_rxbuf *pkt;
7462 	const struct hn_nvs_hdr *nvs_hdr;
7463 	int count, i, hlen;
7464 
7465 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7466 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7467 		return;
7468 	}
7469 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7470 
7471 	/* Make sure that this is a RNDIS message. */
7472 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7473 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7474 		    nvs_hdr->nvs_type);
7475 		return;
7476 	}
7477 
7478 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7479 	if (__predict_false(hlen < sizeof(*pkt))) {
7480 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7481 		return;
7482 	}
7483 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7484 
7485 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7486 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7487 		    pkt->cp_rxbuf_id);
7488 		return;
7489 	}
7490 
7491 	count = pkt->cp_rxbuf_cnt;
7492 	if (__predict_false(hlen <
7493 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7494 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7495 		return;
7496 	}
7497 
7498 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7499 	for (i = 0; i < count; ++i) {
7500 		int ofs, len;
7501 
7502 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7503 		len = pkt->cp_rxbuf[i].rb_len;
7504 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7505 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7506 			    "ofs %d, len %d\n", i, ofs, len);
7507 			continue;
7508 		}
7509 
7510 		rxr->rsc.is_last = (i == (count - 1));
7511 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7512 	}
7513 
7514 	/*
7515 	 * Ack the consumed RXBUF associated w/ this channel packet,
7516 	 * so that this RXBUF can be recycled by the hypervisor.
7517 	 */
7518 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7519 }
7520 
7521 static void
7522 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7523     uint64_t tid)
7524 {
7525 	struct hn_nvs_rndis_ack ack;
7526 	int retries, error;
7527 
7528 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7529 	ack.nvs_status = HN_NVS_STATUS_OK;
7530 
7531 	retries = 0;
7532 again:
7533 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7534 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7535 	if (__predict_false(error == EAGAIN)) {
7536 		/*
7537 		 * NOTE:
7538 		 * This should _not_ happen in real world, since the
7539 		 * consumption of the TX bufring from the TX path is
7540 		 * controlled.
7541 		 */
7542 		if (rxr->hn_ack_failed == 0)
7543 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7544 		rxr->hn_ack_failed++;
7545 		retries++;
7546 		if (retries < 10) {
7547 			DELAY(100);
7548 			goto again;
7549 		}
7550 		/* RXBUF leaks! */
7551 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7552 	}
7553 }
7554 
7555 static void
7556 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7557 {
7558 	struct hn_rx_ring *rxr = xrxr;
7559 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7560 
7561 	for (;;) {
7562 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7563 		int error, pktlen;
7564 
7565 		pktlen = rxr->hn_pktbuf_len;
7566 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7567 		if (__predict_false(error == ENOBUFS)) {
7568 			void *nbuf;
7569 			int nlen;
7570 
7571 			/*
7572 			 * Expand channel packet buffer.
7573 			 *
7574 			 * XXX
7575 			 * Use M_WAITOK here, since allocation failure
7576 			 * is fatal.
7577 			 */
7578 			nlen = rxr->hn_pktbuf_len * 2;
7579 			while (nlen < pktlen)
7580 				nlen *= 2;
7581 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7582 
7583 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7584 			    rxr->hn_pktbuf_len, nlen);
7585 
7586 			free(rxr->hn_pktbuf, M_DEVBUF);
7587 			rxr->hn_pktbuf = nbuf;
7588 			rxr->hn_pktbuf_len = nlen;
7589 			/* Retry! */
7590 			continue;
7591 		} else if (__predict_false(error == EAGAIN)) {
7592 			/* No more channel packets; done! */
7593 			break;
7594 		}
7595 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7596 
7597 		switch (pkt->cph_type) {
7598 		case VMBUS_CHANPKT_TYPE_COMP:
7599 			hn_nvs_handle_comp(sc, chan, pkt);
7600 			break;
7601 
7602 		case VMBUS_CHANPKT_TYPE_RXBUF:
7603 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7604 			break;
7605 
7606 		case VMBUS_CHANPKT_TYPE_INBAND:
7607 			hn_nvs_handle_notify(sc, pkt);
7608 			break;
7609 
7610 		default:
7611 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7612 			    pkt->cph_type);
7613 			break;
7614 		}
7615 	}
7616 	hn_chan_rollup(rxr, rxr->hn_txr);
7617 }
7618 
7619 static void
7620 hn_sysinit(void *arg __unused)
7621 {
7622 	int i;
7623 
7624 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7625 
7626 #ifdef HN_IFSTART_SUPPORT
7627 	/*
7628 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7629 	 * mainly due to the IFF_DRV_OACTIVE flag.
7630 	 */
7631 	if (hn_xpnt_vf && hn_use_if_start) {
7632 		hn_use_if_start = 0;
7633 		printf("hn: tranparent VF mode, if_transmit will be used, "
7634 		    "instead of if_start\n");
7635 	}
7636 #endif
7637 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7638 		printf("hn: invalid transparent VF attach routing "
7639 		    "wait timeout %d, reset to %d\n",
7640 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7641 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7642 	}
7643 
7644 	/*
7645 	 * Initialize VF map.
7646 	 */
7647 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7648 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7649 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7650 	    M_WAITOK | M_ZERO);
7651 
7652 	/*
7653 	 * Fix the # of TX taskqueues.
7654 	 */
7655 	if (hn_tx_taskq_cnt <= 0)
7656 		hn_tx_taskq_cnt = 1;
7657 	else if (hn_tx_taskq_cnt > mp_ncpus)
7658 		hn_tx_taskq_cnt = mp_ncpus;
7659 
7660 	/*
7661 	 * Fix the TX taskqueue mode.
7662 	 */
7663 	switch (hn_tx_taskq_mode) {
7664 	case HN_TX_TASKQ_M_INDEP:
7665 	case HN_TX_TASKQ_M_GLOBAL:
7666 	case HN_TX_TASKQ_M_EVTTQ:
7667 		break;
7668 	default:
7669 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7670 		break;
7671 	}
7672 
7673 	if (vm_guest != VM_GUEST_HV)
7674 		return;
7675 
7676 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7677 		return;
7678 
7679 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7680 	    M_DEVBUF, M_WAITOK);
7681 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7682 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7683 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7684 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7685 		    "hn tx%d", i);
7686 	}
7687 }
7688 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7689 
7690 static void
7691 hn_sysuninit(void *arg __unused)
7692 {
7693 
7694 	if (hn_tx_taskque != NULL) {
7695 		int i;
7696 
7697 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7698 			taskqueue_free(hn_tx_taskque[i]);
7699 		free(hn_tx_taskque, M_DEVBUF);
7700 	}
7701 
7702 	if (hn_vfmap != NULL)
7703 		free(hn_vfmap, M_DEVBUF);
7704 	rm_destroy(&hn_vfmap_lock);
7705 
7706 	counter_u64_free(hn_udpcs_fixup);
7707 }
7708 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7709