xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision ee12faa062c04a49bf6fe4e6867bad8606e2413f)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60 
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89 
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92 
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #ifdef RSS
102 #include <net/rss_config.h>
103 #endif
104 
105 #include <netinet/in_systm.h>
106 #include <netinet/in.h>
107 #include <netinet/ip.h>
108 #include <netinet/ip6.h>
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_lro.h>
111 #include <netinet/udp.h>
112 
113 #include <dev/hyperv/include/hyperv.h>
114 #include <dev/hyperv/include/hyperv_busdma.h>
115 #include <dev/hyperv/include/vmbus.h>
116 #include <dev/hyperv/include/vmbus_xact.h>
117 
118 #include <dev/hyperv/netvsc/ndis.h>
119 #include <dev/hyperv/netvsc/if_hnreg.h>
120 #include <dev/hyperv/netvsc/if_hnvar.h>
121 #include <dev/hyperv/netvsc/hn_nvs.h>
122 #include <dev/hyperv/netvsc/hn_rndis.h>
123 
124 #include "vmbus_if.h"
125 
126 #define HN_IFSTART_SUPPORT
127 
128 #define HN_RING_CNT_DEF_MAX		8
129 
130 #define HN_VFMAP_SIZE_DEF		8
131 
132 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
133 
134 /* YYY should get it from the underlying channel */
135 #define HN_TX_DESC_CNT			512
136 
137 #define HN_RNDIS_PKT_LEN					\
138 	(sizeof(struct rndis_packet_msg) +			\
139 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
141 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
142 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
143 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
144 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
145 
146 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
147 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
148 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
149 /* -1 for RNDIS packet message */
150 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
151 
152 #define HN_DIRECT_TX_SIZE_DEF		128
153 
154 #define HN_EARLY_TXEOF_THRESH		8
155 
156 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
157 
158 #define HN_LROENT_CNT_DEF		128
159 
160 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
161 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
162 /* YYY 2*MTU is a bit rough, but should be good enough. */
163 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
164 
165 #define HN_LRO_ACKCNT_DEF		1
166 
167 #define HN_LOCK_INIT(sc)		\
168 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
169 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
170 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
171 #define HN_LOCK(sc)					\
172 do {							\
173 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
174 		/* Relinquish cpu to avoid deadlock */	\
175 		sched_relinquish(curthread);		\
176 		DELAY(1000);				\
177 	}						\
178 } while (0)
179 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
180 
181 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
182 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
183 #define HN_CSUM_IP_HWASSIST(sc)		\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
185 #define HN_CSUM_IP6_HWASSIST(sc)	\
186 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
187 
188 #define HN_PKTSIZE_MIN(align)		\
189 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
190 	    HN_RNDIS_PKT_LEN, (align))
191 #define HN_PKTSIZE(m, align)		\
192 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
193 
194 #ifdef RSS
195 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
196 #else
197 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
198 #endif
199 
200 struct hn_txdesc {
201 #ifndef HN_USE_TXDESC_BUFRING
202 	SLIST_ENTRY(hn_txdesc)		link;
203 #endif
204 	STAILQ_ENTRY(hn_txdesc)		agg_link;
205 
206 	/* Aggregated txdescs, in sending order. */
207 	STAILQ_HEAD(, hn_txdesc)	agg_list;
208 
209 	/* The oldest packet, if transmission aggregation happens. */
210 	struct mbuf			*m;
211 	struct hn_tx_ring		*txr;
212 	int				refs;
213 	uint32_t			flags;	/* HN_TXD_FLAG_ */
214 	struct hn_nvs_sendctx		send_ctx;
215 	uint32_t			chim_index;
216 	int				chim_size;
217 
218 	bus_dmamap_t			data_dmap;
219 
220 	bus_addr_t			rndis_pkt_paddr;
221 	struct rndis_packet_msg		*rndis_pkt;
222 	bus_dmamap_t			rndis_pkt_dmap;
223 };
224 
225 #define HN_TXD_FLAG_ONLIST		0x0001
226 #define HN_TXD_FLAG_DMAMAP		0x0002
227 #define HN_TXD_FLAG_ONAGG		0x0004
228 
229 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
230 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
231 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
232 
233 struct packet_info_id {
234 	uint8_t				ver;
235 	uint8_t				flag;
236 	uint16_t			pkt_id;
237 };
238 
239 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
240 
241 
242 struct hn_rxinfo {
243 	const uint32_t			*vlan_info;
244 	const uint32_t			*csum_info;
245 	const uint32_t			*hash_info;
246 	const uint32_t			*hash_value;
247 	const struct packet_info_id	*pktinfo_id;
248 };
249 
250 struct hn_rxvf_setarg {
251 	struct hn_rx_ring	*rxr;
252 	if_t			vf_ifp;
253 };
254 
255 #define HN_RXINFO_VLAN			0x0001
256 #define HN_RXINFO_CSUM			0x0002
257 #define HN_RXINFO_HASHINF		0x0004
258 #define HN_RXINFO_HASHVAL		0x0008
259 #define HN_RXINFO_PKTINFO_ID		0x0010
260 #define HN_RXINFO_ALL			\
261 	(HN_RXINFO_VLAN |		\
262 	 HN_RXINFO_CSUM |		\
263 	 HN_RXINFO_HASHINF |		\
264 	 HN_RXINFO_HASHVAL |		\
265 	 HN_RXINFO_PKTINFO_ID)
266 
267 static int			hn_probe(device_t);
268 static int			hn_attach(device_t);
269 static int			hn_detach(device_t);
270 static int			hn_shutdown(device_t);
271 static void			hn_chan_callback(struct vmbus_channel *,
272 				    void *);
273 
274 static void			hn_init(void *);
275 static int			hn_ioctl(if_t, u_long, caddr_t);
276 #ifdef HN_IFSTART_SUPPORT
277 static void			hn_start(if_t);
278 #endif
279 static int			hn_transmit(if_t, struct mbuf *);
280 static void			hn_xmit_qflush(if_t);
281 static int			hn_ifmedia_upd(if_t);
282 static void			hn_ifmedia_sts(if_t,
283 				    struct ifmediareq *);
284 
285 static void			hn_ifnet_event(void *, if_t, int);
286 static void			hn_ifaddr_event(void *, if_t);
287 static void			hn_ifnet_attevent(void *, if_t);
288 static void			hn_ifnet_detevent(void *, if_t);
289 static void			hn_ifnet_lnkevent(void *, if_t, int);
290 
291 static bool			hn_ismyvf(const struct hn_softc *,
292 				    const if_t);
293 static void			hn_rxvf_change(struct hn_softc *,
294 				    if_t, bool);
295 static void			hn_rxvf_set(struct hn_softc *, if_t);
296 static void			hn_rxvf_set_task(void *, int);
297 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
298 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
299 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
300 				    struct ifreq *);
301 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
302 static bool			hn_xpnt_vf_isready(struct hn_softc *);
303 static void			hn_xpnt_vf_setready(struct hn_softc *);
304 static void			hn_xpnt_vf_init_taskfunc(void *, int);
305 static void			hn_xpnt_vf_init(struct hn_softc *);
306 static void			hn_xpnt_vf_setenable(struct hn_softc *);
307 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
308 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
309 static void			hn_vf_rss_restore(struct hn_softc *);
310 
311 static int			hn_rndis_rxinfo(const void *, int,
312 				    struct hn_rxinfo *);
313 static void			hn_rndis_rx_data(struct hn_rx_ring *,
314 				    const void *, int);
315 static void			hn_rndis_rx_status(struct hn_softc *,
316 				    const void *, int);
317 static void			hn_rndis_init_fixat(struct hn_softc *, int);
318 
319 static void			hn_nvs_handle_notify(struct hn_softc *,
320 				    const struct vmbus_chanpkt_hdr *);
321 static void			hn_nvs_handle_comp(struct hn_softc *,
322 				    struct vmbus_channel *,
323 				    const struct vmbus_chanpkt_hdr *);
324 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
325 				    struct vmbus_channel *,
326 				    const struct vmbus_chanpkt_hdr *);
327 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
328 				    struct vmbus_channel *, uint64_t);
329 
330 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
342 #ifndef RSS
343 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
345 #endif
346 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
360 
361 static void			hn_stop(struct hn_softc *, bool);
362 static void			hn_init_locked(struct hn_softc *);
363 static int			hn_chan_attach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static void			hn_chan_detach(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static int			hn_attach_subchans(struct hn_softc *);
368 static void			hn_detach_allchans(struct hn_softc *);
369 static void			hn_chan_rollup(struct hn_rx_ring *,
370 				    struct hn_tx_ring *);
371 static void			hn_set_ring_inuse(struct hn_softc *, int);
372 static int			hn_synth_attach(struct hn_softc *, int);
373 static void			hn_synth_detach(struct hn_softc *);
374 static int			hn_synth_alloc_subchans(struct hn_softc *,
375 				    int *);
376 static bool			hn_synth_attachable(const struct hn_softc *);
377 static void			hn_suspend(struct hn_softc *);
378 static void			hn_suspend_data(struct hn_softc *);
379 static void			hn_suspend_mgmt(struct hn_softc *);
380 static void			hn_resume(struct hn_softc *);
381 static void			hn_resume_data(struct hn_softc *);
382 static void			hn_resume_mgmt(struct hn_softc *);
383 static void			hn_suspend_mgmt_taskfunc(void *, int);
384 static void			hn_chan_drain(struct hn_softc *,
385 				    struct vmbus_channel *);
386 static void			hn_disable_rx(struct hn_softc *);
387 static void			hn_drain_rxtx(struct hn_softc *, int);
388 static void			hn_polling(struct hn_softc *, u_int);
389 static void			hn_chan_polling(struct vmbus_channel *, u_int);
390 static void			hn_mtu_change_fixup(struct hn_softc *);
391 
392 static void			hn_update_link_status(struct hn_softc *);
393 static void			hn_change_network(struct hn_softc *);
394 static void			hn_link_taskfunc(void *, int);
395 static void			hn_netchg_init_taskfunc(void *, int);
396 static void			hn_netchg_status_taskfunc(void *, int);
397 static void			hn_link_status(struct hn_softc *);
398 
399 static int			hn_create_rx_data(struct hn_softc *, int);
400 static void			hn_destroy_rx_data(struct hn_softc *);
401 static int			hn_check_iplen(const struct mbuf *, int);
402 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
403 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
404 static int			hn_rxfilter_config(struct hn_softc *);
405 static int			hn_rss_reconfig(struct hn_softc *);
406 static void			hn_rss_ind_fixup(struct hn_softc *);
407 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
408 static int			hn_rxpkt(struct hn_rx_ring *);
409 static uint32_t			hn_rss_type_fromndis(uint32_t);
410 static uint32_t			hn_rss_type_tondis(uint32_t);
411 
412 static int			hn_tx_ring_create(struct hn_softc *, int);
413 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
414 static int			hn_create_tx_data(struct hn_softc *, int);
415 static void			hn_fixup_tx_data(struct hn_softc *);
416 static void			hn_fixup_rx_data(struct hn_softc *);
417 static void			hn_destroy_tx_data(struct hn_softc *);
418 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
419 static void			hn_txdesc_gc(struct hn_tx_ring *,
420 				    struct hn_txdesc *);
421 static int			hn_encap(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *, struct mbuf **);
423 static int			hn_txpkt(if_t, struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static void			hn_set_chim_size(struct hn_softc *, int);
426 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
427 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
428 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
429 static void			hn_resume_tx(struct hn_softc *, int);
430 static void			hn_set_txagg(struct hn_softc *);
431 static void			*hn_try_txagg(if_t,
432 				    struct hn_tx_ring *, struct hn_txdesc *,
433 				    int);
434 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
435 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
436 				    struct hn_softc *, struct vmbus_channel *,
437 				    const void *, int);
438 static int			hn_txpkt_sglist(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_txpkt_chim(struct hn_tx_ring *,
441 				    struct hn_txdesc *);
442 static int			hn_xmit(struct hn_tx_ring *, int);
443 static void			hn_xmit_taskfunc(void *, int);
444 static void			hn_xmit_txeof(struct hn_tx_ring *);
445 static void			hn_xmit_txeof_taskfunc(void *, int);
446 #ifdef HN_IFSTART_SUPPORT
447 static int			hn_start_locked(struct hn_tx_ring *, int);
448 static void			hn_start_taskfunc(void *, int);
449 static void			hn_start_txeof(struct hn_tx_ring *);
450 static void			hn_start_txeof_taskfunc(void *, int);
451 #endif
452 
453 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
454 
455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
456     "Hyper-V network interface");
457 
458 /* Trust tcp segment verification on host side. */
459 static int			hn_trust_hosttcp = 1;
460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
461     &hn_trust_hosttcp, 0,
462     "Trust tcp segment verification on host side, "
463     "when csum info is missing (global setting)");
464 
465 /* Trust udp datagrams verification on host side. */
466 static int			hn_trust_hostudp = 1;
467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
468     &hn_trust_hostudp, 0,
469     "Trust udp datagram verification on host side, "
470     "when csum info is missing (global setting)");
471 
472 /* Trust ip packets verification on host side. */
473 static int			hn_trust_hostip = 1;
474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
475     &hn_trust_hostip, 0,
476     "Trust ip packet verification on host side, "
477     "when csum info is missing (global setting)");
478 
479 /*
480  * Offload UDP/IPv4 checksum.
481  */
482 static int			hn_enable_udp4cs = 1;
483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
484     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
485 
486 /*
487  * Offload UDP/IPv6 checksum.
488  */
489 static int			hn_enable_udp6cs = 1;
490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
491     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
492 
493 /* Stats. */
494 static counter_u64_t		hn_udpcs_fixup;
495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
496     &hn_udpcs_fixup, "# of UDP checksum fixup");
497 
498 /*
499  * See hn_set_hlen().
500  *
501  * This value is for Azure.  For Hyper-V, set this above
502  * 65536 to disable UDP datagram checksum fixup.
503  */
504 static int			hn_udpcs_fixup_mtu = 1420;
505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
506     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
507 
508 /* Limit TSO burst size */
509 static int			hn_tso_maxlen = IP_MAXPACKET;
510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
511     &hn_tso_maxlen, 0, "TSO burst limit");
512 
513 /* Limit chimney send size */
514 static int			hn_tx_chimney_size = 0;
515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
516     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
517 
518 /* Limit the size of packet for direct transmission */
519 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
521     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
522 
523 /* # of LRO entries per RX ring */
524 #if defined(INET) || defined(INET6)
525 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
527     &hn_lro_entry_count, 0, "LRO entry count");
528 #endif
529 
530 static int			hn_tx_taskq_cnt = 1;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
532     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
533 
534 #define HN_TX_TASKQ_M_INDEP	0
535 #define HN_TX_TASKQ_M_GLOBAL	1
536 #define HN_TX_TASKQ_M_EVTTQ	2
537 
538 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
540     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
541     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
542 
543 #ifndef HN_USE_TXDESC_BUFRING
544 static int			hn_use_txdesc_bufring = 0;
545 #else
546 static int			hn_use_txdesc_bufring = 1;
547 #endif
548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
549     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
550 
551 #ifdef HN_IFSTART_SUPPORT
552 /* Use ifnet.if_start instead of ifnet.if_transmit */
553 static int			hn_use_if_start = 0;
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
555     &hn_use_if_start, 0, "Use if_start TX method");
556 #endif
557 
558 /* # of channels to use */
559 static int			hn_chan_cnt = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
561     &hn_chan_cnt, 0,
562     "# of channels to use; each channel has one RX ring and one TX ring");
563 
564 /* # of transmit rings to use */
565 static int			hn_tx_ring_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
567     &hn_tx_ring_cnt, 0, "# of TX rings to use");
568 
569 /* Software TX ring deptch */
570 static int			hn_tx_swq_depth = 0;
571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
572     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
573 
574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
575 static u_int			hn_lro_mbufq_depth = 0;
576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
577     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
578 
579 /* Packet transmission aggregation size limit */
580 static int			hn_tx_agg_size = -1;
581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
582     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
583 
584 /* Packet transmission aggregation count limit */
585 static int			hn_tx_agg_pkts = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
587     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
588 
589 /* VF list */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
591     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592     hn_vflist_sysctl, "A",
593     "VF list");
594 
595 /* VF mapping */
596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
597     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
598     hn_vfmap_sysctl, "A",
599     "VF mapping");
600 
601 /* Transparent VF */
602 static int			hn_xpnt_vf = 1;
603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
604     &hn_xpnt_vf, 0, "Transparent VF mod");
605 
606 /* Accurate BPF support for Transparent VF */
607 static int			hn_xpnt_vf_accbpf = 0;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
609     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
610 
611 /* Extra wait for transparent VF attach routing; unit seconds. */
612 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
614     &hn_xpnt_vf_attwait, 0,
615     "Extra wait for transparent VF attach routing; unit: seconds");
616 
617 static u_int			hn_cpu_index;	/* next CPU for channel */
618 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
619 
620 static struct rmlock		hn_vfmap_lock;
621 static int			hn_vfmap_size;
622 static if_t			*hn_vfmap;
623 
624 #ifndef RSS
625 static const uint8_t
626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
627 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
628 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
629 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
630 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
631 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
632 };
633 #endif	/* !RSS */
634 
635 static const struct hyperv_guid	hn_guid = {
636 	.hv_guid = {
637 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
638 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
639 };
640 
641 static device_method_t hn_methods[] = {
642 	/* Device interface */
643 	DEVMETHOD(device_probe,		hn_probe),
644 	DEVMETHOD(device_attach,	hn_attach),
645 	DEVMETHOD(device_detach,	hn_detach),
646 	DEVMETHOD(device_shutdown,	hn_shutdown),
647 	DEVMETHOD_END
648 };
649 
650 static driver_t hn_driver = {
651 	"hn",
652 	hn_methods,
653 	sizeof(struct hn_softc)
654 };
655 
656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
657 MODULE_VERSION(hn, 1);
658 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
659 
660 static void
661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
662 {
663 	int i;
664 
665 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
666 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
667 }
668 
669 static int
670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
671 {
672 
673 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
675 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
676 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
677 }
678 
679 static int
680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 	struct hn_nvs_rndis rndis;
683 
684 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
685 	    txd->chim_size > 0, ("invalid rndis chim txd"));
686 
687 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
688 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
689 	rndis.nvs_chim_idx = txd->chim_index;
690 	rndis.nvs_chim_sz = txd->chim_size;
691 
692 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
693 	    &rndis, sizeof(rndis), &txd->send_ctx));
694 }
695 
696 static __inline uint32_t
697 hn_chim_alloc(struct hn_softc *sc)
698 {
699 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
700 	u_long *bmap = sc->hn_chim_bmap;
701 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
702 
703 	for (i = 0; i < bmap_cnt; ++i) {
704 		int idx;
705 
706 		idx = ffsl(~bmap[i]);
707 		if (idx == 0)
708 			continue;
709 
710 		--idx; /* ffsl is 1-based */
711 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
712 		    ("invalid i %d and idx %d", i, idx));
713 
714 		if (atomic_testandset_long(&bmap[i], idx))
715 			continue;
716 
717 		ret = i * LONG_BIT + idx;
718 		break;
719 	}
720 	return (ret);
721 }
722 
723 static __inline void
724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
725 {
726 	u_long mask;
727 	uint32_t idx;
728 
729 	idx = chim_idx / LONG_BIT;
730 	KASSERT(idx < sc->hn_chim_bmap_cnt,
731 	    ("invalid chimney index 0x%x", chim_idx));
732 
733 	mask = 1UL << (chim_idx % LONG_BIT);
734 	KASSERT(sc->hn_chim_bmap[idx] & mask,
735 	    ("index bitmap 0x%lx, chimney index %u, "
736 	     "bitmap idx %d, bitmask 0x%lx",
737 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
738 
739 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
740 }
741 
742 #if defined(INET6) || defined(INET)
743 
744 #define PULLUP_HDR(m, len)				\
745 do {							\
746 	if (__predict_false((m)->m_len < (len))) {	\
747 		(m) = m_pullup((m), (len));		\
748 		if ((m) == NULL)			\
749 			return (NULL);			\
750 	}						\
751 } while (0)
752 
753 /*
754  * NOTE: If this function failed, the m_head would be freed.
755  */
756 static __inline struct mbuf *
757 hn_tso_fixup(struct mbuf *m_head)
758 {
759 	struct ether_vlan_header *evl;
760 	struct tcphdr *th;
761 	int ehlen;
762 
763 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
764 
765 	PULLUP_HDR(m_head, sizeof(*evl));
766 	evl = mtod(m_head, struct ether_vlan_header *);
767 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
768 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
769 	else
770 		ehlen = ETHER_HDR_LEN;
771 	m_head->m_pkthdr.l2hlen = ehlen;
772 
773 #ifdef INET
774 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
775 		struct ip *ip;
776 		int iphlen;
777 
778 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
779 		ip = mtodo(m_head, ehlen);
780 		iphlen = ip->ip_hl << 2;
781 		m_head->m_pkthdr.l3hlen = iphlen;
782 
783 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
784 		th = mtodo(m_head, ehlen + iphlen);
785 
786 		ip->ip_len = 0;
787 		ip->ip_sum = 0;
788 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
789 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
790 	}
791 #endif
792 #if defined(INET6) && defined(INET)
793 	else
794 #endif
795 #ifdef INET6
796 	{
797 		struct ip6_hdr *ip6;
798 
799 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
800 		ip6 = mtodo(m_head, ehlen);
801 		if (ip6->ip6_nxt != IPPROTO_TCP) {
802 			m_freem(m_head);
803 			return (NULL);
804 		}
805 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
806 
807 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
808 		th = mtodo(m_head, ehlen + sizeof(*ip6));
809 
810 		ip6->ip6_plen = 0;
811 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
812 	}
813 #endif
814 	return (m_head);
815 }
816 
817 /*
818  * NOTE: If this function failed, the m_head would be freed.
819  */
820 static __inline struct mbuf *
821 hn_set_hlen(struct mbuf *m_head)
822 {
823 	const struct ether_vlan_header *evl;
824 	int ehlen;
825 
826 	PULLUP_HDR(m_head, sizeof(*evl));
827 	evl = mtod(m_head, const struct ether_vlan_header *);
828 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
829 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
830 	else
831 		ehlen = ETHER_HDR_LEN;
832 	m_head->m_pkthdr.l2hlen = ehlen;
833 
834 #ifdef INET
835 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
836 		const struct ip *ip;
837 		int iphlen;
838 
839 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
840 		ip = mtodo(m_head, ehlen);
841 		iphlen = ip->ip_hl << 2;
842 		m_head->m_pkthdr.l3hlen = iphlen;
843 
844 		/*
845 		 * UDP checksum offload does not work in Azure, if the
846 		 * following conditions meet:
847 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
848 		 * - IP_DF is not set in the IP hdr.
849 		 *
850 		 * Fallback to software checksum for these UDP datagrams.
851 		 */
852 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
853 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
854 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
855 			uint16_t off = ehlen + iphlen;
856 
857 			counter_u64_add(hn_udpcs_fixup, 1);
858 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
859 			*(uint16_t *)(m_head->m_data + off +
860                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
861 			    m_head, m_head->m_pkthdr.len, off);
862 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
863 		}
864 	}
865 #endif
866 #if defined(INET6) && defined(INET)
867 	else
868 #endif
869 #ifdef INET6
870 	{
871 		const struct ip6_hdr *ip6;
872 
873 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
874 		ip6 = mtodo(m_head, ehlen);
875 		if (ip6->ip6_nxt != IPPROTO_TCP &&
876 		    ip6->ip6_nxt != IPPROTO_UDP) {
877 			m_freem(m_head);
878 			return (NULL);
879 		}
880 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
881 	}
882 #endif
883 	return (m_head);
884 }
885 
886 /*
887  * NOTE: If this function failed, the m_head would be freed.
888  */
889 static __inline struct mbuf *
890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
891 {
892 	const struct tcphdr *th;
893 	int ehlen, iphlen;
894 
895 	*tcpsyn = 0;
896 	ehlen = m_head->m_pkthdr.l2hlen;
897 	iphlen = m_head->m_pkthdr.l3hlen;
898 
899 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
900 	th = mtodo(m_head, ehlen + iphlen);
901 	if (th->th_flags & TH_SYN)
902 		*tcpsyn = 1;
903 	return (m_head);
904 }
905 
906 #undef PULLUP_HDR
907 
908 #endif	/* INET6 || INET */
909 
910 static int
911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
912 {
913 	int error = 0;
914 
915 	HN_LOCK_ASSERT(sc);
916 
917 	if (sc->hn_rx_filter != filter) {
918 		error = hn_rndis_set_rxfilter(sc, filter);
919 		if (!error)
920 			sc->hn_rx_filter = filter;
921 	}
922 	return (error);
923 }
924 
925 static int
926 hn_rxfilter_config(struct hn_softc *sc)
927 {
928 	if_t ifp = sc->hn_ifp;
929 	uint32_t filter;
930 
931 	HN_LOCK_ASSERT(sc);
932 
933 	/*
934 	 * If the non-transparent mode VF is activated, we don't know how
935 	 * its RX filter is configured, so stick the synthetic device in
936 	 * the promiscous mode.
937 	 */
938 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
939 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
940 	} else {
941 		filter = NDIS_PACKET_TYPE_DIRECTED;
942 		if (if_getflags(ifp) & IFF_BROADCAST)
943 			filter |= NDIS_PACKET_TYPE_BROADCAST;
944 		/* TODO: support multicast list */
945 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
946 		    !if_maddr_empty(ifp))
947 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
948 	}
949 	return (hn_set_rxfilter(sc, filter));
950 }
951 
952 static void
953 hn_set_txagg(struct hn_softc *sc)
954 {
955 	uint32_t size, pkts;
956 	int i;
957 
958 	/*
959 	 * Setup aggregation size.
960 	 */
961 	if (sc->hn_agg_size < 0)
962 		size = UINT32_MAX;
963 	else
964 		size = sc->hn_agg_size;
965 
966 	if (sc->hn_rndis_agg_size < size)
967 		size = sc->hn_rndis_agg_size;
968 
969 	/* NOTE: We only aggregate packets using chimney sending buffers. */
970 	if (size > (uint32_t)sc->hn_chim_szmax)
971 		size = sc->hn_chim_szmax;
972 
973 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
974 		/* Disable */
975 		size = 0;
976 		pkts = 0;
977 		goto done;
978 	}
979 
980 	/* NOTE: Type of the per TX ring setting is 'int'. */
981 	if (size > INT_MAX)
982 		size = INT_MAX;
983 
984 	/*
985 	 * Setup aggregation packet count.
986 	 */
987 	if (sc->hn_agg_pkts < 0)
988 		pkts = UINT32_MAX;
989 	else
990 		pkts = sc->hn_agg_pkts;
991 
992 	if (sc->hn_rndis_agg_pkts < pkts)
993 		pkts = sc->hn_rndis_agg_pkts;
994 
995 	if (pkts <= 1) {
996 		/* Disable */
997 		size = 0;
998 		pkts = 0;
999 		goto done;
1000 	}
1001 
1002 	/* NOTE: Type of the per TX ring setting is 'short'. */
1003 	if (pkts > SHRT_MAX)
1004 		pkts = SHRT_MAX;
1005 
1006 done:
1007 	/* NOTE: Type of the per TX ring setting is 'short'. */
1008 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1009 		/* Disable */
1010 		size = 0;
1011 		pkts = 0;
1012 	}
1013 
1014 	if (bootverbose) {
1015 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1016 		    size, pkts, sc->hn_rndis_agg_align);
1017 	}
1018 
1019 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1020 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1021 
1022 		mtx_lock(&txr->hn_tx_lock);
1023 		txr->hn_agg_szmax = size;
1024 		txr->hn_agg_pktmax = pkts;
1025 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1026 		mtx_unlock(&txr->hn_tx_lock);
1027 	}
1028 }
1029 
1030 static int
1031 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1032 {
1033 
1034 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1035 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1036 		return txr->hn_txdesc_cnt;
1037 	return hn_tx_swq_depth;
1038 }
1039 
1040 static int
1041 hn_rss_reconfig(struct hn_softc *sc)
1042 {
1043 	int error;
1044 
1045 	HN_LOCK_ASSERT(sc);
1046 
1047 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1048 		return (ENXIO);
1049 
1050 	/*
1051 	 * Disable RSS first.
1052 	 *
1053 	 * NOTE:
1054 	 * Direct reconfiguration by setting the UNCHG flags does
1055 	 * _not_ work properly.
1056 	 */
1057 	if (bootverbose)
1058 		if_printf(sc->hn_ifp, "disable RSS\n");
1059 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1060 	if (error) {
1061 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1062 		return (error);
1063 	}
1064 
1065 	/*
1066 	 * Reenable the RSS w/ the updated RSS key or indirect
1067 	 * table.
1068 	 */
1069 	if (bootverbose)
1070 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1071 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1072 	if (error) {
1073 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1074 		return (error);
1075 	}
1076 	return (0);
1077 }
1078 
1079 static void
1080 hn_rss_ind_fixup(struct hn_softc *sc)
1081 {
1082 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1083 	int i, nchan;
1084 
1085 	nchan = sc->hn_rx_ring_inuse;
1086 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1087 
1088 	/*
1089 	 * Check indirect table to make sure that all channels in it
1090 	 * can be used.
1091 	 */
1092 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1093 		if (rss->rss_ind[i] >= nchan) {
1094 			if_printf(sc->hn_ifp,
1095 			    "RSS indirect table %d fixup: %u -> %d\n",
1096 			    i, rss->rss_ind[i], nchan - 1);
1097 			rss->rss_ind[i] = nchan - 1;
1098 		}
1099 	}
1100 }
1101 
1102 static int
1103 hn_ifmedia_upd(if_t ifp __unused)
1104 {
1105 
1106 	return EOPNOTSUPP;
1107 }
1108 
1109 static void
1110 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1111 {
1112 	struct hn_softc *sc = if_getsoftc(ifp);
1113 
1114 	ifmr->ifm_status = IFM_AVALID;
1115 	ifmr->ifm_active = IFM_ETHER;
1116 
1117 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1118 		ifmr->ifm_active |= IFM_NONE;
1119 		return;
1120 	}
1121 	ifmr->ifm_status |= IFM_ACTIVE;
1122 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1123 }
1124 
1125 static void
1126 hn_rxvf_set_task(void *xarg, int pending __unused)
1127 {
1128 	struct hn_rxvf_setarg *arg = xarg;
1129 
1130 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1131 }
1132 
1133 static void
1134 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1135 {
1136 	struct hn_rx_ring *rxr;
1137 	struct hn_rxvf_setarg arg;
1138 	struct task task;
1139 	int i;
1140 
1141 	HN_LOCK_ASSERT(sc);
1142 
1143 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1144 
1145 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1146 		rxr = &sc->hn_rx_ring[i];
1147 
1148 		if (i < sc->hn_rx_ring_inuse) {
1149 			arg.rxr = rxr;
1150 			arg.vf_ifp = vf_ifp;
1151 			vmbus_chan_run_task(rxr->hn_chan, &task);
1152 		} else {
1153 			rxr->hn_rxvf_ifp = vf_ifp;
1154 		}
1155 	}
1156 }
1157 
1158 static bool
1159 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1160 {
1161 	if_t hn_ifp;
1162 
1163 	hn_ifp = sc->hn_ifp;
1164 
1165 	if (ifp == hn_ifp)
1166 		return (false);
1167 
1168 	if (if_getalloctype(ifp) != IFT_ETHER)
1169 		return (false);
1170 
1171 	/* Ignore lagg/vlan interfaces */
1172 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1173 	    strcmp(if_getdname(ifp), "vlan") == 0)
1174 		return (false);
1175 
1176 	/*
1177 	 * During detach events if_getifaddr(ifp) might be NULL.
1178 	 * Make sure the bcmp() below doesn't panic on that:
1179 	 */
1180 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1181 		return (false);
1182 
1183 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1184 		return (false);
1185 
1186 	return (true);
1187 }
1188 
1189 static void
1190 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1191 {
1192 	if_t hn_ifp;
1193 
1194 	HN_LOCK(sc);
1195 
1196 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1197 		goto out;
1198 
1199 	if (!hn_ismyvf(sc, ifp))
1200 		goto out;
1201 	hn_ifp = sc->hn_ifp;
1202 
1203 	if (rxvf) {
1204 		if (sc->hn_flags & HN_FLAG_RXVF)
1205 			goto out;
1206 
1207 		sc->hn_flags |= HN_FLAG_RXVF;
1208 		hn_rxfilter_config(sc);
1209 	} else {
1210 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1211 			goto out;
1212 
1213 		sc->hn_flags &= ~HN_FLAG_RXVF;
1214 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1215 			hn_rxfilter_config(sc);
1216 		else
1217 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1218 	}
1219 
1220 	hn_nvs_set_datapath(sc,
1221 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1222 
1223 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1224 
1225 	if (rxvf) {
1226 		hn_vf_rss_fixup(sc, true);
1227 		hn_suspend_mgmt(sc);
1228 		sc->hn_link_flags &=
1229 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1230 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1231 	} else {
1232 		hn_vf_rss_restore(sc);
1233 		hn_resume_mgmt(sc);
1234 	}
1235 
1236 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1237 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1238 
1239 	if (bootverbose) {
1240 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1241 		    rxvf ? "to" : "from", if_name(ifp));
1242 	}
1243 out:
1244 	HN_UNLOCK(sc);
1245 }
1246 
1247 static void
1248 hn_ifnet_event(void *arg, if_t ifp, int event)
1249 {
1250 
1251 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1252 		return;
1253 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1254 }
1255 
1256 static void
1257 hn_ifaddr_event(void *arg, if_t ifp)
1258 {
1259 
1260 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1261 }
1262 
1263 static int
1264 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
1265 {
1266 	if_t ifp, vf_ifp;
1267 
1268 	HN_LOCK_ASSERT(sc);
1269 	ifp = sc->hn_ifp;
1270 	vf_ifp = sc->hn_vf_ifp;
1271 
1272 	/*
1273 	 * Just sync up with VF's enabled capabilities.
1274 	 */
1275 	if_setcapenable(ifp, if_getcapenable(vf_ifp));
1276 	if_sethwassist(ifp, if_gethwassist(vf_ifp));
1277 
1278 	return (0);
1279 }
1280 
1281 static int
1282 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1283 {
1284 	if_t vf_ifp;
1285 	struct ifreq ifr;
1286 
1287 	HN_LOCK_ASSERT(sc);
1288 	vf_ifp = sc->hn_vf_ifp;
1289 
1290 	memset(&ifr, 0, sizeof(ifr));
1291 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1292 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1293 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1294 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1295 }
1296 
1297 static void
1298 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1299 {
1300 	if_t ifp = sc->hn_ifp;
1301 	int allmulti = 0;
1302 
1303 	HN_LOCK_ASSERT(sc);
1304 
1305 	/* XXX vlan(4) style mcast addr maintenance */
1306 	if (!if_maddr_empty(ifp))
1307 		allmulti = IFF_ALLMULTI;
1308 
1309 	/* Always set the VF's if_flags */
1310 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1311 }
1312 
1313 static void
1314 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1315 {
1316 	struct rm_priotracker pt;
1317 	if_t hn_ifp = NULL;
1318 	struct mbuf *mn;
1319 
1320 	/*
1321 	 * XXX racy, if hn(4) ever detached.
1322 	 */
1323 	rm_rlock(&hn_vfmap_lock, &pt);
1324 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1325 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1326 	rm_runlock(&hn_vfmap_lock, &pt);
1327 
1328 	if (hn_ifp != NULL) {
1329 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1330 			/*
1331 			 * Allow tapping on the VF.
1332 			 */
1333 			ETHER_BPF_MTAP(vf_ifp, mn);
1334 
1335 			/*
1336 			 * Update VF stats.
1337 			 */
1338 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1339 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1340 				    mn->m_pkthdr.len);
1341 			}
1342 			/*
1343 			 * XXX IFCOUNTER_IMCAST
1344 			 * This stat updating is kinda invasive, since it
1345 			 * requires two checks on the mbuf: the length check
1346 			 * and the ethernet header check.  As of this write,
1347 			 * all multicast packets go directly to hn(4), which
1348 			 * makes imcast stat updating in the VF a try in vian.
1349 			 */
1350 
1351 			/*
1352 			 * Fix up rcvif and increase hn(4)'s ipackets.
1353 			 */
1354 			mn->m_pkthdr.rcvif = hn_ifp;
1355 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1356 		}
1357 		/*
1358 		 * Go through hn(4)'s if_input.
1359 		 */
1360 		if_input(hn_ifp, m);
1361 	} else {
1362 		/*
1363 		 * In the middle of the transition; free this
1364 		 * mbuf chain.
1365 		 */
1366 		while (m != NULL) {
1367 			mn = m->m_nextpkt;
1368 			m->m_nextpkt = NULL;
1369 			m_freem(m);
1370 			m = mn;
1371 		}
1372 	}
1373 }
1374 
1375 static void
1376 hn_mtu_change_fixup(struct hn_softc *sc)
1377 {
1378 	if_t ifp;
1379 
1380 	HN_LOCK_ASSERT(sc);
1381 	ifp = sc->hn_ifp;
1382 
1383 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1384 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1385 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1386 }
1387 
1388 static uint32_t
1389 hn_rss_type_fromndis(uint32_t rss_hash)
1390 {
1391 	uint32_t types = 0;
1392 
1393 	if (rss_hash & NDIS_HASH_IPV4)
1394 		types |= RSS_TYPE_IPV4;
1395 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1396 		types |= RSS_TYPE_TCP_IPV4;
1397 	if (rss_hash & NDIS_HASH_IPV6)
1398 		types |= RSS_TYPE_IPV6;
1399 	if (rss_hash & NDIS_HASH_IPV6_EX)
1400 		types |= RSS_TYPE_IPV6_EX;
1401 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1402 		types |= RSS_TYPE_TCP_IPV6;
1403 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1404 		types |= RSS_TYPE_TCP_IPV6_EX;
1405 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1406 		types |= RSS_TYPE_UDP_IPV4;
1407 	return (types);
1408 }
1409 
1410 static uint32_t
1411 hn_rss_type_tondis(uint32_t types)
1412 {
1413 	uint32_t rss_hash = 0;
1414 
1415 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1416 	    ("UDP6 and UDP6EX are not supported"));
1417 
1418 	if (types & RSS_TYPE_IPV4)
1419 		rss_hash |= NDIS_HASH_IPV4;
1420 	if (types & RSS_TYPE_TCP_IPV4)
1421 		rss_hash |= NDIS_HASH_TCP_IPV4;
1422 	if (types & RSS_TYPE_IPV6)
1423 		rss_hash |= NDIS_HASH_IPV6;
1424 	if (types & RSS_TYPE_IPV6_EX)
1425 		rss_hash |= NDIS_HASH_IPV6_EX;
1426 	if (types & RSS_TYPE_TCP_IPV6)
1427 		rss_hash |= NDIS_HASH_TCP_IPV6;
1428 	if (types & RSS_TYPE_TCP_IPV6_EX)
1429 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1430 	if (types & RSS_TYPE_UDP_IPV4)
1431 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1432 	return (rss_hash);
1433 }
1434 
1435 static void
1436 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1437 {
1438 	int i;
1439 
1440 	HN_LOCK_ASSERT(sc);
1441 
1442 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1443 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1444 }
1445 
1446 static void
1447 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1448 {
1449 	if_t ifp, vf_ifp;
1450 	struct ifrsshash ifrh;
1451 	struct ifrsskey ifrk;
1452 	int error;
1453 	uint32_t my_types, diff_types, mbuf_types = 0;
1454 
1455 	HN_LOCK_ASSERT(sc);
1456 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1457 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1458 
1459 	if (sc->hn_rx_ring_inuse == 1) {
1460 		/* No RSS on synthetic parts; done. */
1461 		return;
1462 	}
1463 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1464 		/* Synthetic parts do not support Toeplitz; done. */
1465 		return;
1466 	}
1467 
1468 	ifp = sc->hn_ifp;
1469 	vf_ifp = sc->hn_vf_ifp;
1470 
1471 	/*
1472 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1473 	 * supported.
1474 	 */
1475 	memset(&ifrk, 0, sizeof(ifrk));
1476 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1477 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1478 	if (error) {
1479 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1480 		    if_name(vf_ifp), error);
1481 		goto done;
1482 	}
1483 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1484 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1485 		    if_name(vf_ifp), ifrk.ifrk_func);
1486 		goto done;
1487 	}
1488 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1489 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1490 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1491 		goto done;
1492 	}
1493 
1494 	/*
1495 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1496 	 */
1497 	memset(&ifrh, 0, sizeof(ifrh));
1498 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1499 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1500 	if (error) {
1501 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1502 		    if_name(vf_ifp), error);
1503 		goto done;
1504 	}
1505 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1506 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1507 		    if_name(vf_ifp), ifrh.ifrh_func);
1508 		goto done;
1509 	}
1510 
1511 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1512 	if ((ifrh.ifrh_types & my_types) == 0) {
1513 		/* This disables RSS; ignore it then */
1514 		if_printf(ifp, "%s intersection of RSS types failed.  "
1515 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1516 		    ifrh.ifrh_types, my_types);
1517 		goto done;
1518 	}
1519 
1520 	diff_types = my_types ^ ifrh.ifrh_types;
1521 	my_types &= ifrh.ifrh_types;
1522 	mbuf_types = my_types;
1523 
1524 	/*
1525 	 * Detect RSS hash value/type confliction.
1526 	 *
1527 	 * NOTE:
1528 	 * We don't disable the hash type, but stop delivery the hash
1529 	 * value/type through mbufs on RX path.
1530 	 *
1531 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1532 	 * hash is delivered with type of TCP_IPV4.  This means if
1533 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1534 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1535 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1536 	 * here.
1537 	 */
1538 	if ((my_types & RSS_TYPE_IPV4) &&
1539 	    (diff_types & ifrh.ifrh_types &
1540 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1541 		/* Conflict; disable IPV4 hash type/value delivery. */
1542 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1543 		mbuf_types &= ~RSS_TYPE_IPV4;
1544 	}
1545 	if ((my_types & RSS_TYPE_IPV6) &&
1546 	    (diff_types & ifrh.ifrh_types &
1547 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1548 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1549 	      RSS_TYPE_IPV6_EX))) {
1550 		/* Conflict; disable IPV6 hash type/value delivery. */
1551 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1552 		mbuf_types &= ~RSS_TYPE_IPV6;
1553 	}
1554 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1555 	    (diff_types & ifrh.ifrh_types &
1556 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1557 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1558 	      RSS_TYPE_IPV6))) {
1559 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1560 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1561 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1562 	}
1563 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1564 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1565 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1566 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1567 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1568 	}
1569 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1570 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1571 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1572 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1573 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1574 	}
1575 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1576 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1577 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1578 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1579 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1580 	}
1581 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1582 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1583 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1584 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1585 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1586 	}
1587 
1588 	/*
1589 	 * Indirect table does not matter.
1590 	 */
1591 
1592 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1593 	    hn_rss_type_tondis(my_types);
1594 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1595 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1596 
1597 	if (reconf) {
1598 		error = hn_rss_reconfig(sc);
1599 		if (error) {
1600 			/* XXX roll-back? */
1601 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1602 			/* XXX keep going. */
1603 		}
1604 	}
1605 done:
1606 	/* Hash deliverability for mbufs. */
1607 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1608 }
1609 
1610 static void
1611 hn_vf_rss_restore(struct hn_softc *sc)
1612 {
1613 
1614 	HN_LOCK_ASSERT(sc);
1615 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1616 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1617 
1618 	if (sc->hn_rx_ring_inuse == 1)
1619 		goto done;
1620 
1621 	/*
1622 	 * Restore hash types.  Key does _not_ matter.
1623 	 */
1624 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1625 		int error;
1626 
1627 		sc->hn_rss_hash = sc->hn_rss_hcap;
1628 		error = hn_rss_reconfig(sc);
1629 		if (error) {
1630 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1631 			    error);
1632 			/* XXX keep going. */
1633 		}
1634 	}
1635 done:
1636 	/* Hash deliverability for mbufs. */
1637 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1638 }
1639 
1640 static void
1641 hn_xpnt_vf_setready(struct hn_softc *sc)
1642 {
1643 	if_t ifp, vf_ifp;
1644 	struct ifreq ifr;
1645 
1646 	HN_LOCK_ASSERT(sc);
1647 	ifp = sc->hn_ifp;
1648 	vf_ifp = sc->hn_vf_ifp;
1649 
1650 	/*
1651 	 * Mark the VF ready.
1652 	 */
1653 	sc->hn_vf_rdytick = 0;
1654 
1655 	/*
1656 	 * Save information for restoration.
1657 	 */
1658 	sc->hn_saved_caps = if_getcapabilities(ifp);
1659 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1660 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1661 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1662 	sc->hn_saved_capenable = if_getcapenable(ifp);
1663 	sc->hn_saved_hwassist = if_gethwassist(ifp);
1664 
1665 	/*
1666 	 * Intersect supported/enabled capabilities.
1667 	 *
1668 	 * NOTE:
1669 	 * if_hwassist is not changed here.
1670 	 */
1671 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1672 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1673 
1674 	/*
1675 	 * Fix TSO settings.
1676 	 */
1677 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1678 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1679 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1680 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1681 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1682 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1683 
1684 	/*
1685 	 * Change VF's enabled capabilities.
1686 	 */
1687 	memset(&ifr, 0, sizeof(ifr));
1688 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1689 	ifr.ifr_reqcap = if_getcapenable(ifp);
1690 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1691 
1692 	if (if_getmtu(ifp) != ETHERMTU) {
1693 		int error;
1694 
1695 		/*
1696 		 * Change VF's MTU.
1697 		 */
1698 		memset(&ifr, 0, sizeof(ifr));
1699 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1700 		ifr.ifr_mtu = if_getmtu(ifp);
1701 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1702 		if (error) {
1703 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1704 			    if_name(vf_ifp), if_getmtu(ifp));
1705 			if (if_getmtu(ifp) > ETHERMTU) {
1706 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1707 
1708 				/*
1709 				 * XXX
1710 				 * No need to adjust the synthetic parts' MTU;
1711 				 * failure of the adjustment will cause us
1712 				 * infinite headache.
1713 				 */
1714 				if_setmtu(ifp, ETHERMTU);
1715 				hn_mtu_change_fixup(sc);
1716 			}
1717 		}
1718 	}
1719 }
1720 
1721 static bool
1722 hn_xpnt_vf_isready(struct hn_softc *sc)
1723 {
1724 
1725 	HN_LOCK_ASSERT(sc);
1726 
1727 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1728 		return (false);
1729 
1730 	if (sc->hn_vf_rdytick == 0)
1731 		return (true);
1732 
1733 	if (sc->hn_vf_rdytick > ticks)
1734 		return (false);
1735 
1736 	/* Mark VF as ready. */
1737 	hn_xpnt_vf_setready(sc);
1738 	return (true);
1739 }
1740 
1741 static void
1742 hn_xpnt_vf_setenable(struct hn_softc *sc)
1743 {
1744 	int i;
1745 
1746 	HN_LOCK_ASSERT(sc);
1747 
1748 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1749 	rm_wlock(&sc->hn_vf_lock);
1750 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1751 	rm_wunlock(&sc->hn_vf_lock);
1752 
1753 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1754 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1755 }
1756 
1757 static void
1758 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1759 {
1760 	int i;
1761 
1762 	HN_LOCK_ASSERT(sc);
1763 
1764 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1765 	rm_wlock(&sc->hn_vf_lock);
1766 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1767 	if (clear_vf)
1768 		sc->hn_vf_ifp = NULL;
1769 	rm_wunlock(&sc->hn_vf_lock);
1770 
1771 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1772 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1773 }
1774 
1775 static void
1776 hn_xpnt_vf_init(struct hn_softc *sc)
1777 {
1778 	int error;
1779 
1780 	HN_LOCK_ASSERT(sc);
1781 
1782 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1783 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1784 
1785 	if (bootverbose) {
1786 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1787 		    if_name(sc->hn_vf_ifp));
1788 	}
1789 
1790 	/*
1791 	 * Bring the VF up.
1792 	 */
1793 	hn_xpnt_vf_saveifflags(sc);
1794 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1795 	error = hn_xpnt_vf_iocsetflags(sc);
1796 	if (error) {
1797 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1798 		    if_name(sc->hn_vf_ifp), error);
1799 		return;
1800 	}
1801 
1802 	/*
1803 	 * NOTE:
1804 	 * Datapath setting must happen _after_ bringing the VF up.
1805 	 */
1806 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1807 
1808 	/*
1809 	 * NOTE:
1810 	 * Fixup RSS related bits _after_ the VF is brought up, since
1811 	 * many VFs generate RSS key during it's initialization.
1812 	 */
1813 	hn_vf_rss_fixup(sc, true);
1814 
1815 	/* Mark transparent mode VF as enabled. */
1816 	hn_xpnt_vf_setenable(sc);
1817 }
1818 
1819 static void
1820 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1821 {
1822 	struct hn_softc *sc = xsc;
1823 
1824 	HN_LOCK(sc);
1825 
1826 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1827 		goto done;
1828 	if (sc->hn_vf_ifp == NULL)
1829 		goto done;
1830 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1831 		goto done;
1832 
1833 	if (sc->hn_vf_rdytick != 0) {
1834 		/* Mark VF as ready. */
1835 		hn_xpnt_vf_setready(sc);
1836 	}
1837 
1838 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1839 		/*
1840 		 * Delayed VF initialization.
1841 		 */
1842 		if (bootverbose) {
1843 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1844 			    if_name(sc->hn_vf_ifp));
1845 		}
1846 		hn_xpnt_vf_init(sc);
1847 	}
1848 done:
1849 	HN_UNLOCK(sc);
1850 }
1851 
1852 static void
1853 hn_ifnet_attevent(void *xsc, if_t ifp)
1854 {
1855 	struct hn_softc *sc = xsc;
1856 
1857 	HN_LOCK(sc);
1858 
1859 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1860 		goto done;
1861 
1862 	if (!hn_ismyvf(sc, ifp))
1863 		goto done;
1864 
1865 	if (sc->hn_vf_ifp != NULL) {
1866 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1867 		    if_name(sc->hn_vf_ifp));
1868 		goto done;
1869 	}
1870 
1871 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1872 		/*
1873 		 * ifnet.if_start is _not_ supported by transparent
1874 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1875 		 */
1876 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1877 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1878 
1879 		goto done;
1880 	}
1881 
1882 	rm_wlock(&hn_vfmap_lock);
1883 
1884 	if (if_getindex(ifp) >= hn_vfmap_size) {
1885 		if_t *newmap;
1886 		int newsize;
1887 
1888 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1889 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1890 		    M_WAITOK | M_ZERO);
1891 
1892 		memcpy(newmap, hn_vfmap,
1893 		    sizeof(if_t) * hn_vfmap_size);
1894 		free(hn_vfmap, M_DEVBUF);
1895 		hn_vfmap = newmap;
1896 		hn_vfmap_size = newsize;
1897 	}
1898 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1899 	    ("%s: ifindex %d was mapped to %s",
1900 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1901 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1902 
1903 	rm_wunlock(&hn_vfmap_lock);
1904 
1905 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1906 	rm_wlock(&sc->hn_vf_lock);
1907 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1908 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1909 	sc->hn_vf_ifp = ifp;
1910 	rm_wunlock(&sc->hn_vf_lock);
1911 
1912 	if (hn_xpnt_vf) {
1913 		int wait_ticks;
1914 
1915 		/*
1916 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1917 		 * Save vf_ifp's current if_input for later restoration.
1918 		 */
1919 		sc->hn_vf_input = if_getinputfn(ifp);
1920 		if_setinputfn(ifp, hn_xpnt_vf_input);
1921 
1922 		/*
1923 		 * Stop link status management; use the VF's.
1924 		 */
1925 		hn_suspend_mgmt(sc);
1926 
1927 		/*
1928 		 * Give VF sometime to complete its attach routing.
1929 		 */
1930 		wait_ticks = hn_xpnt_vf_attwait * hz;
1931 		sc->hn_vf_rdytick = ticks + wait_ticks;
1932 
1933 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1934 		    wait_ticks);
1935 	}
1936 done:
1937 	HN_UNLOCK(sc);
1938 }
1939 
1940 static void
1941 hn_ifnet_detevent(void *xsc, if_t ifp)
1942 {
1943 	struct hn_softc *sc = xsc;
1944 
1945 	HN_LOCK(sc);
1946 
1947 	if (sc->hn_vf_ifp == NULL)
1948 		goto done;
1949 
1950 	if (!hn_ismyvf(sc, ifp))
1951 		goto done;
1952 
1953 	if (hn_xpnt_vf) {
1954 		/*
1955 		 * Make sure that the delayed initialization is not running.
1956 		 *
1957 		 * NOTE:
1958 		 * - This lock _must_ be released, since the hn_vf_init task
1959 		 *   will try holding this lock.
1960 		 * - It is safe to release this lock here, since the
1961 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1962 		 *
1963 		 * XXX racy, if hn(4) ever detached.
1964 		 */
1965 		HN_UNLOCK(sc);
1966 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1967 		HN_LOCK(sc);
1968 
1969 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1970 		    if_name(sc->hn_ifp)));
1971 		if_setinputfn(ifp, sc->hn_vf_input);
1972 		sc->hn_vf_input = NULL;
1973 
1974 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1975 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1976 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1977 
1978 		if (sc->hn_vf_rdytick == 0) {
1979 			/*
1980 			 * The VF was ready; restore some settings.
1981 			 */
1982 			if_setcapabilities(ifp, sc->hn_saved_caps);
1983 
1984 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
1985 			if_sethwtsomaxsegcount(sc->hn_ifp,
1986 			    sc->hn_saved_tsosegcnt);
1987 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
1988 
1989 			if_setcapenable(ifp, sc->hn_saved_capenable);
1990 			if_sethwassist(ifp, sc->hn_saved_hwassist);
1991 		}
1992 
1993 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1994 			/*
1995 			 * Restore RSS settings.
1996 			 */
1997 			hn_vf_rss_restore(sc);
1998 
1999 			/*
2000 			 * Resume link status management, which was suspended
2001 			 * by hn_ifnet_attevent().
2002 			 */
2003 			hn_resume_mgmt(sc);
2004 		}
2005 	}
2006 
2007 	/* Mark transparent mode VF as disabled. */
2008 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2009 
2010 	rm_wlock(&hn_vfmap_lock);
2011 
2012 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2013 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2014 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2015 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2016 		    ("%s: ifindex %d was mapped to %s",
2017 		     if_name(ifp), if_getindex(ifp),
2018 		     if_name(hn_vfmap[if_getindex(ifp)])));
2019 		hn_vfmap[if_getindex(ifp)] = NULL;
2020 	}
2021 
2022 	rm_wunlock(&hn_vfmap_lock);
2023 done:
2024 	HN_UNLOCK(sc);
2025 }
2026 
2027 static void
2028 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2029 {
2030 	struct hn_softc *sc = xsc;
2031 
2032 	if (sc->hn_vf_ifp == ifp)
2033 		if_link_state_change(sc->hn_ifp, link_state);
2034 }
2035 
2036 static int
2037 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2038 {
2039 	struct hn_softc *sc = arg1;
2040 	unsigned int tsomax;
2041 	int error;
2042 
2043 	tsomax = if_gethwtsomax(sc->hn_ifp);
2044 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2045 	return error;
2046 }
2047 
2048 static int
2049 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2050 {
2051 	struct hn_softc *sc = arg1;
2052 	unsigned int tsomaxsegcnt;
2053 	int error;
2054 
2055 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2056 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2057 	return error;
2058 }
2059 
2060 static int
2061 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2062 {
2063 	struct hn_softc *sc = arg1;
2064 	unsigned int tsomaxsegsz;
2065 	int error;
2066 
2067 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2068 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2069 	return error;
2070 }
2071 
2072 static int
2073 hn_probe(device_t dev)
2074 {
2075 
2076 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2077 		device_set_desc(dev, "Hyper-V Network Interface");
2078 		return BUS_PROBE_DEFAULT;
2079 	}
2080 	return ENXIO;
2081 }
2082 
2083 static int
2084 hn_attach(device_t dev)
2085 {
2086 	struct hn_softc *sc = device_get_softc(dev);
2087 	struct sysctl_oid_list *child;
2088 	struct sysctl_ctx_list *ctx;
2089 	uint8_t eaddr[ETHER_ADDR_LEN];
2090 	if_t ifp = NULL;
2091 	int error, ring_cnt, tx_ring_cnt;
2092 	uint32_t mtu;
2093 
2094 	sc->hn_dev = dev;
2095 	sc->hn_prichan = vmbus_get_channel(dev);
2096 	HN_LOCK_INIT(sc);
2097 	rm_init(&sc->hn_vf_lock, "hnvf");
2098 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2099 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2100 
2101 	/*
2102 	 * Initialize these tunables once.
2103 	 */
2104 	sc->hn_agg_size = hn_tx_agg_size;
2105 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2106 
2107 	/*
2108 	 * Setup taskqueue for transmission.
2109 	 */
2110 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2111 		int i;
2112 
2113 		sc->hn_tx_taskqs =
2114 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2115 		    M_DEVBUF, M_WAITOK);
2116 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2117 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2118 			    M_WAITOK, taskqueue_thread_enqueue,
2119 			    &sc->hn_tx_taskqs[i]);
2120 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2121 			    "%s tx%d", device_get_nameunit(dev), i);
2122 		}
2123 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2124 		sc->hn_tx_taskqs = hn_tx_taskque;
2125 	}
2126 
2127 	/*
2128 	 * Setup taskqueue for mangement tasks, e.g. link status.
2129 	 */
2130 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2131 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2132 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2133 	    device_get_nameunit(dev));
2134 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2135 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2136 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2137 	    hn_netchg_status_taskfunc, sc);
2138 
2139 	if (hn_xpnt_vf) {
2140 		/*
2141 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2142 		 */
2143 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2144 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2145 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2146 		    device_get_nameunit(dev));
2147 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2148 		    hn_xpnt_vf_init_taskfunc, sc);
2149 	}
2150 
2151 	/*
2152 	 * Allocate ifnet and setup its name earlier, so that if_printf
2153 	 * can be used by functions, which will be called after
2154 	 * ether_ifattach().
2155 	 */
2156 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2157 	if_setsoftc(ifp, sc);
2158 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2159 
2160 	/*
2161 	 * Initialize ifmedia earlier so that it can be unconditionally
2162 	 * destroyed, if error happened later on.
2163 	 */
2164 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2165 
2166 	/*
2167 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2168 	 * to use (tx_ring_cnt).
2169 	 *
2170 	 * NOTE:
2171 	 * The # of RX rings to use is same as the # of channels to use.
2172 	 */
2173 	ring_cnt = hn_chan_cnt;
2174 	if (ring_cnt <= 0) {
2175 		/* Default */
2176 		ring_cnt = mp_ncpus;
2177 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2178 			ring_cnt = HN_RING_CNT_DEF_MAX;
2179 	} else if (ring_cnt > mp_ncpus) {
2180 		ring_cnt = mp_ncpus;
2181 	}
2182 #ifdef RSS
2183 	if (ring_cnt > rss_getnumbuckets())
2184 		ring_cnt = rss_getnumbuckets();
2185 #endif
2186 
2187 	tx_ring_cnt = hn_tx_ring_cnt;
2188 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2189 		tx_ring_cnt = ring_cnt;
2190 #ifdef HN_IFSTART_SUPPORT
2191 	if (hn_use_if_start) {
2192 		/* ifnet.if_start only needs one TX ring. */
2193 		tx_ring_cnt = 1;
2194 	}
2195 #endif
2196 
2197 	/*
2198 	 * Set the leader CPU for channels.
2199 	 */
2200 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2201 
2202 	/*
2203 	 * Create enough TX/RX rings, even if only limited number of
2204 	 * channels can be allocated.
2205 	 */
2206 	error = hn_create_tx_data(sc, tx_ring_cnt);
2207 	if (error)
2208 		goto failed;
2209 	error = hn_create_rx_data(sc, ring_cnt);
2210 	if (error)
2211 		goto failed;
2212 
2213 	/*
2214 	 * Create transaction context for NVS and RNDIS transactions.
2215 	 */
2216 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2217 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2218 	if (sc->hn_xact == NULL) {
2219 		error = ENXIO;
2220 		goto failed;
2221 	}
2222 
2223 	/*
2224 	 * Install orphan handler for the revocation of this device's
2225 	 * primary channel.
2226 	 *
2227 	 * NOTE:
2228 	 * The processing order is critical here:
2229 	 * Install the orphan handler, _before_ testing whether this
2230 	 * device's primary channel has been revoked or not.
2231 	 */
2232 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2233 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2234 		error = ENXIO;
2235 		goto failed;
2236 	}
2237 
2238 	/*
2239 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2240 	 */
2241 	error = hn_synth_attach(sc, ETHERMTU);
2242 	if (error)
2243 		goto failed;
2244 
2245 	error = hn_rndis_get_eaddr(sc, eaddr);
2246 	if (error)
2247 		goto failed;
2248 
2249 	error = hn_rndis_get_mtu(sc, &mtu);
2250 	if (error)
2251 		mtu = ETHERMTU;
2252 	else if (bootverbose)
2253 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2254 
2255 	if (sc->hn_rx_ring_inuse > 1) {
2256 		/*
2257 		 * Reduce TCP segment aggregation limit for multiple
2258 		 * RX rings to increase ACK timeliness.
2259 		 */
2260 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2261 	}
2262 
2263 	/*
2264 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2265 	 */
2266 	hn_fixup_tx_data(sc);
2267 	hn_fixup_rx_data(sc);
2268 
2269 	ctx = device_get_sysctl_ctx(dev);
2270 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2271 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2272 	    &sc->hn_nvs_ver, 0, "NVS version");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2274 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2275 	    hn_ndis_version_sysctl, "A", "NDIS version");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2277 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2278 	    hn_caps_sysctl, "A", "capabilities");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_hwassist_sysctl, "A", "hwassist");
2282 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2283 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2284 	    "IU", "max TSO size");
2285 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2286 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2287 	    "IU", "max # of TSO segments");
2288 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2289 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2290 	    "IU", "max size of TSO segment");
2291 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2292 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2293 	    hn_rxfilter_sysctl, "A", "rxfilter");
2294 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2295 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2296 	    hn_rss_hash_sysctl, "A", "RSS hash");
2297 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2298 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2299 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2301 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2303 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2304 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2305 #ifndef RSS
2306 	/*
2307 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2308 	 */
2309 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2310 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2311 	    hn_rss_key_sysctl, "IU", "RSS key");
2312 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2313 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2314 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2315 #endif
2316 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2317 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2318 	    "RNDIS offered packet transmission aggregation size limit");
2319 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2320 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2321 	    "RNDIS offered packet transmission aggregation count limit");
2322 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2323 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2324 	    "RNDIS packet transmission aggregation alignment");
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2326 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_txagg_size_sysctl, "I",
2328 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2329 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2330 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 	    hn_txagg_pkts_sysctl, "I",
2332 	    "Packet transmission aggregation packets, "
2333 	    "0 -- disable, -1 -- auto");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2335 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_polling_sysctl, "I",
2337 	    "Polling frequency: [100,1000000], 0 disable polling");
2338 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2339 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2340 	    hn_vf_sysctl, "A", "Virtual Function's name");
2341 	if (!hn_xpnt_vf) {
2342 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2343 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2344 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2345 	} else {
2346 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2347 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2348 		    hn_xpnt_vf_enabled_sysctl, "I",
2349 		    "Transparent VF enabled");
2350 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2351 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2352 		    hn_xpnt_vf_accbpf_sysctl, "I",
2353 		    "Accurate BPF for transparent VF");
2354 	}
2355 
2356 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2357 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2358 	    "switch to rsc");
2359 
2360 	/*
2361 	 * Setup the ifmedia, which has been initialized earlier.
2362 	 */
2363 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2364 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2365 	/* XXX ifmedia_set really should do this for us */
2366 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2367 
2368 	/*
2369 	 * Setup the ifnet for this interface.
2370 	 */
2371 
2372 	if_setbaudrate(ifp, IF_Gbps(10));
2373 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2374 	if_setioctlfn(ifp, hn_ioctl);
2375 	if_setinitfn(ifp, hn_init);
2376 #ifdef HN_IFSTART_SUPPORT
2377 	if (hn_use_if_start) {
2378 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2379 
2380 		if_setstartfn(ifp, hn_start);
2381 		if_setsendqlen(ifp, qdepth);
2382 		if_setsendqready(ifp);
2383 	} else
2384 #endif
2385 	{
2386 		if_settransmitfn(ifp, hn_transmit);
2387 		if_setqflushfn(ifp, hn_xmit_qflush);
2388 	}
2389 
2390 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2391 #ifdef foo
2392 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2393 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2394 #endif
2395 	if (sc->hn_caps & HN_CAP_VLAN) {
2396 		/* XXX not sure about VLAN_MTU. */
2397 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2398 	}
2399 
2400 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2401 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2402 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2403 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2404 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2405 	if (sc->hn_caps & HN_CAP_TSO4) {
2406 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2407 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2408 	}
2409 	if (sc->hn_caps & HN_CAP_TSO6) {
2410 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2411 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2412 	}
2413 
2414 	/* Enable all available capabilities by default. */
2415 	if_setcapenable(ifp, if_getcapabilities(ifp));
2416 
2417 	/*
2418 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2419 	 * be enabled through SIOCSIFCAP.
2420 	 */
2421 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2422 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2423 
2424 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2425 		/*
2426 		 * Lock hn_set_tso_maxsize() to simplify its
2427 		 * internal logic.
2428 		 */
2429 		HN_LOCK(sc);
2430 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2431 		HN_UNLOCK(sc);
2432 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2433 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2434 	}
2435 
2436 	ether_ifattach(ifp, eaddr);
2437 
2438 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2439 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2440 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2441 	}
2442 	if (mtu < ETHERMTU) {
2443 
2444 		if_setmtu(ifp, mtu);
2445 	}
2446 
2447 	/* Inform the upper layer about the long frame support. */
2448 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2449 
2450 	/*
2451 	 * Kick off link status check.
2452 	 */
2453 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2454 	hn_update_link_status(sc);
2455 
2456 	if (!hn_xpnt_vf) {
2457 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2458 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2459 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2460 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2461 	} else {
2462 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2463 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2464 	}
2465 
2466 	/*
2467 	 * NOTE:
2468 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2469 	 * since interface's LLADDR is needed; interface LLADDR is not
2470 	 * available when ifnet_arrival event is triggered.
2471 	 */
2472 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2473 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2474 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2475 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2476 
2477 	return (0);
2478 failed:
2479 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2480 		hn_synth_detach(sc);
2481 	hn_detach(dev);
2482 	return (error);
2483 }
2484 
2485 static int
2486 hn_detach(device_t dev)
2487 {
2488 	struct hn_softc *sc = device_get_softc(dev);
2489 	if_t ifp = sc->hn_ifp, vf_ifp;
2490 
2491 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2492 		/*
2493 		 * In case that the vmbus missed the orphan handler
2494 		 * installation.
2495 		 */
2496 		vmbus_xact_ctx_orphan(sc->hn_xact);
2497 	}
2498 
2499 	if (sc->hn_ifaddr_evthand != NULL)
2500 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2501 	if (sc->hn_ifnet_evthand != NULL)
2502 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2503 	if (sc->hn_ifnet_atthand != NULL) {
2504 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2505 		    sc->hn_ifnet_atthand);
2506 	}
2507 	if (sc->hn_ifnet_dethand != NULL) {
2508 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2509 		    sc->hn_ifnet_dethand);
2510 	}
2511 	if (sc->hn_ifnet_lnkhand != NULL)
2512 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2513 
2514 	vf_ifp = sc->hn_vf_ifp;
2515 	__compiler_membar();
2516 	if (vf_ifp != NULL)
2517 		hn_ifnet_detevent(sc, vf_ifp);
2518 
2519 	if (device_is_attached(dev)) {
2520 		HN_LOCK(sc);
2521 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2522 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2523 				hn_stop(sc, true);
2524 			/*
2525 			 * NOTE:
2526 			 * hn_stop() only suspends data, so managment
2527 			 * stuffs have to be suspended manually here.
2528 			 */
2529 			hn_suspend_mgmt(sc);
2530 			hn_synth_detach(sc);
2531 		}
2532 		HN_UNLOCK(sc);
2533 		ether_ifdetach(ifp);
2534 	}
2535 
2536 	ifmedia_removeall(&sc->hn_media);
2537 	hn_destroy_rx_data(sc);
2538 	hn_destroy_tx_data(sc);
2539 
2540 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2541 		int i;
2542 
2543 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2544 			taskqueue_free(sc->hn_tx_taskqs[i]);
2545 		free(sc->hn_tx_taskqs, M_DEVBUF);
2546 	}
2547 	taskqueue_free(sc->hn_mgmt_taskq0);
2548 	if (sc->hn_vf_taskq != NULL)
2549 		taskqueue_free(sc->hn_vf_taskq);
2550 
2551 	if (sc->hn_xact != NULL) {
2552 		/*
2553 		 * Uninstall the orphan handler _before_ the xact is
2554 		 * destructed.
2555 		 */
2556 		vmbus_chan_unset_orphan(sc->hn_prichan);
2557 		vmbus_xact_ctx_destroy(sc->hn_xact);
2558 	}
2559 
2560 	if_free(ifp);
2561 
2562 	HN_LOCK_DESTROY(sc);
2563 	rm_destroy(&sc->hn_vf_lock);
2564 	return (0);
2565 }
2566 
2567 static int
2568 hn_shutdown(device_t dev)
2569 {
2570 
2571 	return (0);
2572 }
2573 
2574 static void
2575 hn_link_status(struct hn_softc *sc)
2576 {
2577 	uint32_t link_status;
2578 	int error;
2579 
2580 	error = hn_rndis_get_linkstatus(sc, &link_status);
2581 	if (error) {
2582 		/* XXX what to do? */
2583 		return;
2584 	}
2585 
2586 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2587 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2588 	else
2589 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2590 	if_link_state_change(sc->hn_ifp,
2591 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2592 	    LINK_STATE_UP : LINK_STATE_DOWN);
2593 }
2594 
2595 static void
2596 hn_link_taskfunc(void *xsc, int pending __unused)
2597 {
2598 	struct hn_softc *sc = xsc;
2599 
2600 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2601 		return;
2602 	hn_link_status(sc);
2603 }
2604 
2605 static void
2606 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2607 {
2608 	struct hn_softc *sc = xsc;
2609 
2610 	/* Prevent any link status checks from running. */
2611 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2612 
2613 	/*
2614 	 * Fake up a [link down --> link up] state change; 5 seconds
2615 	 * delay is used, which closely simulates miibus reaction
2616 	 * upon link down event.
2617 	 */
2618 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2619 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2620 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2621 	    &sc->hn_netchg_status, 5 * hz);
2622 }
2623 
2624 static void
2625 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2626 {
2627 	struct hn_softc *sc = xsc;
2628 
2629 	/* Re-allow link status checks. */
2630 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2631 	hn_link_status(sc);
2632 }
2633 
2634 static void
2635 hn_update_link_status(struct hn_softc *sc)
2636 {
2637 
2638 	if (sc->hn_mgmt_taskq != NULL)
2639 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2640 }
2641 
2642 static void
2643 hn_change_network(struct hn_softc *sc)
2644 {
2645 
2646 	if (sc->hn_mgmt_taskq != NULL)
2647 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2648 }
2649 
2650 static __inline int
2651 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2652     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2653 {
2654 	struct mbuf *m = *m_head;
2655 	int error;
2656 
2657 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2658 
2659 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2660 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2661 	if (error == EFBIG) {
2662 		struct mbuf *m_new;
2663 
2664 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2665 		if (m_new == NULL)
2666 			return ENOBUFS;
2667 		else
2668 			*m_head = m = m_new;
2669 		txr->hn_tx_collapsed++;
2670 
2671 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2672 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2673 	}
2674 	if (!error) {
2675 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2676 		    BUS_DMASYNC_PREWRITE);
2677 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2678 	}
2679 	return error;
2680 }
2681 
2682 static __inline int
2683 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2684 {
2685 
2686 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2687 	    ("put an onlist txd %#x", txd->flags));
2688 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2689 	    ("put an onagg txd %#x", txd->flags));
2690 
2691 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2692 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2693 		return 0;
2694 
2695 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2696 		struct hn_txdesc *tmp_txd;
2697 
2698 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2699 			int freed __diagused;
2700 
2701 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2702 			    ("resursive aggregation on aggregated txdesc"));
2703 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2704 			    ("not aggregated txdesc"));
2705 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2706 			    ("aggregated txdesc uses dmamap"));
2707 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2708 			    ("aggregated txdesc consumes "
2709 			     "chimney sending buffer"));
2710 			KASSERT(tmp_txd->chim_size == 0,
2711 			    ("aggregated txdesc has non-zero "
2712 			     "chimney sending size"));
2713 
2714 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2715 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2716 			freed = hn_txdesc_put(txr, tmp_txd);
2717 			KASSERT(freed, ("failed to free aggregated txdesc"));
2718 		}
2719 	}
2720 
2721 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2722 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2723 		    ("chim txd uses dmamap"));
2724 		hn_chim_free(txr->hn_sc, txd->chim_index);
2725 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2726 		txd->chim_size = 0;
2727 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2728 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2729 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2730 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2731 		    txd->data_dmap);
2732 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2733 	}
2734 
2735 	if (txd->m != NULL) {
2736 		m_freem(txd->m);
2737 		txd->m = NULL;
2738 	}
2739 
2740 	txd->flags |= HN_TXD_FLAG_ONLIST;
2741 #ifndef HN_USE_TXDESC_BUFRING
2742 	mtx_lock_spin(&txr->hn_txlist_spin);
2743 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2744 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2745 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2746 	txr->hn_txdesc_avail++;
2747 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2748 	mtx_unlock_spin(&txr->hn_txlist_spin);
2749 #else	/* HN_USE_TXDESC_BUFRING */
2750 #ifdef HN_DEBUG
2751 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2752 #endif
2753 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2754 #endif	/* !HN_USE_TXDESC_BUFRING */
2755 
2756 	return 1;
2757 }
2758 
2759 static __inline struct hn_txdesc *
2760 hn_txdesc_get(struct hn_tx_ring *txr)
2761 {
2762 	struct hn_txdesc *txd;
2763 
2764 #ifndef HN_USE_TXDESC_BUFRING
2765 	mtx_lock_spin(&txr->hn_txlist_spin);
2766 	txd = SLIST_FIRST(&txr->hn_txlist);
2767 	if (txd != NULL) {
2768 		KASSERT(txr->hn_txdesc_avail > 0,
2769 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2770 		txr->hn_txdesc_avail--;
2771 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2772 	}
2773 	mtx_unlock_spin(&txr->hn_txlist_spin);
2774 #else
2775 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2776 #endif
2777 
2778 	if (txd != NULL) {
2779 #ifdef HN_USE_TXDESC_BUFRING
2780 #ifdef HN_DEBUG
2781 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2782 #endif
2783 #endif	/* HN_USE_TXDESC_BUFRING */
2784 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2785 		    STAILQ_EMPTY(&txd->agg_list) &&
2786 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2787 		    txd->chim_size == 0 &&
2788 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2789 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2790 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2791 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2792 		txd->refs = 1;
2793 	}
2794 	return txd;
2795 }
2796 
2797 static __inline void
2798 hn_txdesc_hold(struct hn_txdesc *txd)
2799 {
2800 
2801 	/* 0->1 transition will never work */
2802 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2803 	atomic_add_int(&txd->refs, 1);
2804 }
2805 
2806 static __inline void
2807 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2808 {
2809 
2810 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2811 	    ("recursive aggregation on aggregating txdesc"));
2812 
2813 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2814 	    ("already aggregated"));
2815 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2816 	    ("recursive aggregation on to-be-aggregated txdesc"));
2817 
2818 	txd->flags |= HN_TXD_FLAG_ONAGG;
2819 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2820 }
2821 
2822 static bool
2823 hn_tx_ring_pending(struct hn_tx_ring *txr)
2824 {
2825 	bool pending = false;
2826 
2827 #ifndef HN_USE_TXDESC_BUFRING
2828 	mtx_lock_spin(&txr->hn_txlist_spin);
2829 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2830 		pending = true;
2831 	mtx_unlock_spin(&txr->hn_txlist_spin);
2832 #else
2833 	if (!buf_ring_full(txr->hn_txdesc_br))
2834 		pending = true;
2835 #endif
2836 	return (pending);
2837 }
2838 
2839 static __inline void
2840 hn_txeof(struct hn_tx_ring *txr)
2841 {
2842 	txr->hn_has_txeof = 0;
2843 	txr->hn_txeof(txr);
2844 }
2845 
2846 static void
2847 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2848     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2849 {
2850 	struct hn_txdesc *txd = sndc->hn_cbarg;
2851 	struct hn_tx_ring *txr;
2852 
2853 	txr = txd->txr;
2854 	KASSERT(txr->hn_chan == chan,
2855 	    ("channel mismatch, on chan%u, should be chan%u",
2856 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2857 
2858 	txr->hn_has_txeof = 1;
2859 	hn_txdesc_put(txr, txd);
2860 
2861 	++txr->hn_txdone_cnt;
2862 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2863 		txr->hn_txdone_cnt = 0;
2864 		if (txr->hn_oactive)
2865 			hn_txeof(txr);
2866 	}
2867 }
2868 
2869 static void
2870 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2871 {
2872 #if defined(INET) || defined(INET6)
2873 	struct epoch_tracker et;
2874 
2875 	NET_EPOCH_ENTER(et);
2876 	tcp_lro_flush_all(&rxr->hn_lro);
2877 	NET_EPOCH_EXIT(et);
2878 #endif
2879 
2880 	/*
2881 	 * NOTE:
2882 	 * 'txr' could be NULL, if multiple channels and
2883 	 * ifnet.if_start method are enabled.
2884 	 */
2885 	if (txr == NULL || !txr->hn_has_txeof)
2886 		return;
2887 
2888 	txr->hn_txdone_cnt = 0;
2889 	hn_txeof(txr);
2890 }
2891 
2892 static __inline uint32_t
2893 hn_rndis_pktmsg_offset(uint32_t ofs)
2894 {
2895 
2896 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2897 	    ("invalid RNDIS packet msg offset %u", ofs));
2898 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2899 }
2900 
2901 static __inline void *
2902 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2903     size_t pi_dlen, uint32_t pi_type)
2904 {
2905 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2906 	struct rndis_pktinfo *pi;
2907 
2908 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2909 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2910 
2911 	/*
2912 	 * Per-packet-info does not move; it only grows.
2913 	 *
2914 	 * NOTE:
2915 	 * rm_pktinfooffset in this phase counts from the beginning
2916 	 * of rndis_packet_msg.
2917 	 */
2918 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2919 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2920 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2921 	    pkt->rm_pktinfolen);
2922 	pkt->rm_pktinfolen += pi_size;
2923 
2924 	pi->rm_size = pi_size;
2925 	pi->rm_type = pi_type;
2926 	pi->rm_internal = 0;
2927 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2928 
2929 	return (pi->rm_data);
2930 }
2931 
2932 static __inline int
2933 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2934 {
2935 	struct hn_txdesc *txd;
2936 	struct mbuf *m;
2937 	int error, pkts;
2938 
2939 	txd = txr->hn_agg_txd;
2940 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2941 
2942 	/*
2943 	 * Since hn_txpkt() will reset this temporary stat, save
2944 	 * it now, so that oerrors can be updated properly, if
2945 	 * hn_txpkt() ever fails.
2946 	 */
2947 	pkts = txr->hn_stat_pkts;
2948 
2949 	/*
2950 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2951 	 * failure, save it for later freeing, if hn_txpkt() ever
2952 	 * fails.
2953 	 */
2954 	m = txd->m;
2955 	error = hn_txpkt(ifp, txr, txd);
2956 	if (__predict_false(error)) {
2957 		/* txd is freed, but m is not. */
2958 		m_freem(m);
2959 
2960 		txr->hn_flush_failed++;
2961 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2962 	}
2963 
2964 	/* Reset all aggregation states. */
2965 	txr->hn_agg_txd = NULL;
2966 	txr->hn_agg_szleft = 0;
2967 	txr->hn_agg_pktleft = 0;
2968 	txr->hn_agg_prevpkt = NULL;
2969 
2970 	return (error);
2971 }
2972 
2973 static void *
2974 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2975     int pktsize)
2976 {
2977 	void *chim;
2978 
2979 	if (txr->hn_agg_txd != NULL) {
2980 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2981 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2982 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2983 			int olen;
2984 
2985 			/*
2986 			 * Update the previous RNDIS packet's total length,
2987 			 * it can be increased due to the mandatory alignment
2988 			 * padding for this RNDIS packet.  And update the
2989 			 * aggregating txdesc's chimney sending buffer size
2990 			 * accordingly.
2991 			 *
2992 			 * XXX
2993 			 * Zero-out the padding, as required by the RNDIS spec.
2994 			 */
2995 			olen = pkt->rm_len;
2996 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2997 			agg_txd->chim_size += pkt->rm_len - olen;
2998 
2999 			/* Link this txdesc to the parent. */
3000 			hn_txdesc_agg(agg_txd, txd);
3001 
3002 			chim = (uint8_t *)pkt + pkt->rm_len;
3003 			/* Save the current packet for later fixup. */
3004 			txr->hn_agg_prevpkt = chim;
3005 
3006 			txr->hn_agg_pktleft--;
3007 			txr->hn_agg_szleft -= pktsize;
3008 			if (txr->hn_agg_szleft <=
3009 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3010 				/*
3011 				 * Probably can't aggregate more packets,
3012 				 * flush this aggregating txdesc proactively.
3013 				 */
3014 				txr->hn_agg_pktleft = 0;
3015 			}
3016 			/* Done! */
3017 			return (chim);
3018 		}
3019 		hn_flush_txagg(ifp, txr);
3020 	}
3021 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3022 
3023 	txr->hn_tx_chimney_tried++;
3024 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3025 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3026 		return (NULL);
3027 	txr->hn_tx_chimney++;
3028 
3029 	chim = txr->hn_sc->hn_chim +
3030 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3031 
3032 	if (txr->hn_agg_pktmax > 1 &&
3033 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3034 		txr->hn_agg_txd = txd;
3035 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3036 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3037 		txr->hn_agg_prevpkt = chim;
3038 	}
3039 	return (chim);
3040 }
3041 
3042 /*
3043  * NOTE:
3044  * If this function fails, then both txd and m_head0 will be freed.
3045  */
3046 static int
3047 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3048     struct mbuf **m_head0)
3049 {
3050 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3051 	int error, nsegs, i;
3052 	struct mbuf *m_head = *m_head0;
3053 	struct rndis_packet_msg *pkt;
3054 	uint32_t *pi_data;
3055 	void *chim = NULL;
3056 	int pkt_hlen, pkt_size;
3057 
3058 	pkt = txd->rndis_pkt;
3059 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3060 	if (pkt_size < txr->hn_chim_size) {
3061 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3062 		if (chim != NULL)
3063 			pkt = chim;
3064 	} else {
3065 		if (txr->hn_agg_txd != NULL)
3066 			hn_flush_txagg(ifp, txr);
3067 	}
3068 
3069 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3070 	pkt->rm_len = m_head->m_pkthdr.len;
3071 	pkt->rm_dataoffset = 0;
3072 	pkt->rm_datalen = m_head->m_pkthdr.len;
3073 	pkt->rm_oobdataoffset = 0;
3074 	pkt->rm_oobdatalen = 0;
3075 	pkt->rm_oobdataelements = 0;
3076 	pkt->rm_pktinfooffset = sizeof(*pkt);
3077 	pkt->rm_pktinfolen = 0;
3078 	pkt->rm_vchandle = 0;
3079 	pkt->rm_reserved = 0;
3080 
3081 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3082 		/*
3083 		 * Set the hash value for this packet.
3084 		 */
3085 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3086 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3087 
3088 		if (M_HASHTYPE_ISHASH(m_head))
3089 			/*
3090 			 * The flowid field contains the hash value host
3091 			 * set in the rx queue if it is a ip forwarding pkt.
3092 			 * Set the same hash value so host can send on the
3093 			 * cpu it was received.
3094 			 */
3095 			*pi_data = m_head->m_pkthdr.flowid;
3096 		else
3097 			/*
3098 			 * Otherwise just put the tx queue index.
3099 			 */
3100 			*pi_data = txr->hn_tx_idx;
3101 	}
3102 
3103 	if (m_head->m_flags & M_VLANTAG) {
3104 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3105 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3106 		*pi_data = NDIS_VLAN_INFO_MAKE(
3107 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3108 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3109 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3110 	}
3111 
3112 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3113 #if defined(INET6) || defined(INET)
3114 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3115 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3116 #ifdef INET
3117 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3118 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3119 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3120 			    m_head->m_pkthdr.tso_segsz);
3121 		}
3122 #endif
3123 #if defined(INET6) && defined(INET)
3124 		else
3125 #endif
3126 #ifdef INET6
3127 		{
3128 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3129 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3130 			    m_head->m_pkthdr.tso_segsz);
3131 		}
3132 #endif
3133 #endif	/* INET6 || INET */
3134 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3135 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3136 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3137 		if (m_head->m_pkthdr.csum_flags &
3138 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3139 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3140 		} else {
3141 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3142 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3143 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3144 		}
3145 
3146 		if (m_head->m_pkthdr.csum_flags &
3147 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3148 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3149 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3150 		} else if (m_head->m_pkthdr.csum_flags &
3151 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3152 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3153 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3154 		}
3155 	}
3156 
3157 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3158 	/* Fixup RNDIS packet message total length */
3159 	pkt->rm_len += pkt_hlen;
3160 	/* Convert RNDIS packet message offsets */
3161 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3162 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3163 
3164 	/*
3165 	 * Fast path: Chimney sending.
3166 	 */
3167 	if (chim != NULL) {
3168 		struct hn_txdesc *tgt_txd = txd;
3169 
3170 		if (txr->hn_agg_txd != NULL) {
3171 			tgt_txd = txr->hn_agg_txd;
3172 #ifdef INVARIANTS
3173 			*m_head0 = NULL;
3174 #endif
3175 		}
3176 
3177 		KASSERT(pkt == chim,
3178 		    ("RNDIS pkt not in chimney sending buffer"));
3179 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3180 		    ("chimney sending buffer is not used"));
3181 		tgt_txd->chim_size += pkt->rm_len;
3182 
3183 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3184 		    ((uint8_t *)chim) + pkt_hlen);
3185 
3186 		txr->hn_gpa_cnt = 0;
3187 		txr->hn_sendpkt = hn_txpkt_chim;
3188 		goto done;
3189 	}
3190 
3191 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3192 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3193 	    ("chimney buffer is used"));
3194 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3195 
3196 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3197 	if (__predict_false(error)) {
3198 		int freed __diagused;
3199 
3200 		/*
3201 		 * This mbuf is not linked w/ the txd yet, so free it now.
3202 		 */
3203 		m_freem(m_head);
3204 		*m_head0 = NULL;
3205 
3206 		freed = hn_txdesc_put(txr, txd);
3207 		KASSERT(freed != 0,
3208 		    ("fail to free txd upon txdma error"));
3209 
3210 		txr->hn_txdma_failed++;
3211 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3212 		return error;
3213 	}
3214 	*m_head0 = m_head;
3215 
3216 	/* +1 RNDIS packet message */
3217 	txr->hn_gpa_cnt = nsegs + 1;
3218 
3219 	/* send packet with page buffer */
3220 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3221 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3222 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3223 
3224 	/*
3225 	 * Fill the page buffers with mbuf info after the page
3226 	 * buffer for RNDIS packet message.
3227 	 */
3228 	for (i = 0; i < nsegs; ++i) {
3229 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3230 
3231 		gpa->gpa_page = atop(segs[i].ds_addr);
3232 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3233 		gpa->gpa_len = segs[i].ds_len;
3234 	}
3235 
3236 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3237 	txd->chim_size = 0;
3238 	txr->hn_sendpkt = hn_txpkt_sglist;
3239 done:
3240 	txd->m = m_head;
3241 
3242 	/* Set the completion routine */
3243 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3244 
3245 	/* Update temporary stats for later use. */
3246 	txr->hn_stat_pkts++;
3247 	txr->hn_stat_size += m_head->m_pkthdr.len;
3248 	if (m_head->m_flags & M_MCAST)
3249 		txr->hn_stat_mcasts++;
3250 
3251 	return 0;
3252 }
3253 
3254 /*
3255  * NOTE:
3256  * If this function fails, then txd will be freed, but the mbuf
3257  * associated w/ the txd will _not_ be freed.
3258  */
3259 static int
3260 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3261 {
3262 	int error, send_failed = 0, has_bpf;
3263 
3264 again:
3265 	has_bpf = bpf_peers_present(if_getbpf(ifp));
3266 	if (has_bpf) {
3267 		/*
3268 		 * Make sure that this txd and any aggregated txds are not
3269 		 * freed before ETHER_BPF_MTAP.
3270 		 */
3271 		hn_txdesc_hold(txd);
3272 	}
3273 	error = txr->hn_sendpkt(txr, txd);
3274 	if (!error) {
3275 		if (has_bpf) {
3276 			const struct hn_txdesc *tmp_txd;
3277 
3278 			ETHER_BPF_MTAP(ifp, txd->m);
3279 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3280 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3281 		}
3282 
3283 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3284 #ifdef HN_IFSTART_SUPPORT
3285 		if (!hn_use_if_start)
3286 #endif
3287 		{
3288 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3289 			    txr->hn_stat_size);
3290 			if (txr->hn_stat_mcasts != 0) {
3291 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3292 				    txr->hn_stat_mcasts);
3293 			}
3294 		}
3295 		txr->hn_pkts += txr->hn_stat_pkts;
3296 		txr->hn_sends++;
3297 	}
3298 	if (has_bpf)
3299 		hn_txdesc_put(txr, txd);
3300 
3301 	if (__predict_false(error)) {
3302 		int freed __diagused;
3303 
3304 		/*
3305 		 * This should "really rarely" happen.
3306 		 *
3307 		 * XXX Too many RX to be acked or too many sideband
3308 		 * commands to run?  Ask netvsc_channel_rollup()
3309 		 * to kick start later.
3310 		 */
3311 		txr->hn_has_txeof = 1;
3312 		if (!send_failed) {
3313 			txr->hn_send_failed++;
3314 			send_failed = 1;
3315 			/*
3316 			 * Try sending again after set hn_has_txeof;
3317 			 * in case that we missed the last
3318 			 * netvsc_channel_rollup().
3319 			 */
3320 			goto again;
3321 		}
3322 		if_printf(ifp, "send failed\n");
3323 
3324 		/*
3325 		 * Caller will perform further processing on the
3326 		 * associated mbuf, so don't free it in hn_txdesc_put();
3327 		 * only unload it from the DMA map in hn_txdesc_put(),
3328 		 * if it was loaded.
3329 		 */
3330 		txd->m = NULL;
3331 		freed = hn_txdesc_put(txr, txd);
3332 		KASSERT(freed != 0,
3333 		    ("fail to free txd upon send error"));
3334 
3335 		txr->hn_send_failed++;
3336 	}
3337 
3338 	/* Reset temporary stats, after this sending is done. */
3339 	txr->hn_stat_size = 0;
3340 	txr->hn_stat_pkts = 0;
3341 	txr->hn_stat_mcasts = 0;
3342 
3343 	return (error);
3344 }
3345 
3346 /*
3347  * Append the specified data to the indicated mbuf chain,
3348  * Extend the mbuf chain if the new data does not fit in
3349  * existing space.
3350  *
3351  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3352  * There should be an equivalent in the kernel mbuf code,
3353  * but there does not appear to be one yet.
3354  *
3355  * Differs from m_append() in that additional mbufs are
3356  * allocated with cluster size MJUMPAGESIZE, and filled
3357  * accordingly.
3358  *
3359  * Return the last mbuf in the chain or NULL if failed to
3360  * allocate new mbuf.
3361  */
3362 static struct mbuf *
3363 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3364 {
3365 	struct mbuf *m, *n;
3366 	int remainder, space;
3367 
3368 	for (m = m0; m->m_next != NULL; m = m->m_next)
3369 		;
3370 	remainder = len;
3371 	space = M_TRAILINGSPACE(m);
3372 	if (space > 0) {
3373 		/*
3374 		 * Copy into available space.
3375 		 */
3376 		if (space > remainder)
3377 			space = remainder;
3378 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3379 		m->m_len += space;
3380 		cp += space;
3381 		remainder -= space;
3382 	}
3383 	while (remainder > 0) {
3384 		/*
3385 		 * Allocate a new mbuf; could check space
3386 		 * and allocate a cluster instead.
3387 		 */
3388 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3389 		if (n == NULL)
3390 			return NULL;
3391 		n->m_len = min(MJUMPAGESIZE, remainder);
3392 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3393 		cp += n->m_len;
3394 		remainder -= n->m_len;
3395 		m->m_next = n;
3396 		m = n;
3397 	}
3398 
3399 	return m;
3400 }
3401 
3402 #if defined(INET) || defined(INET6)
3403 static __inline int
3404 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3405 {
3406 	if (hn_lro_mbufq_depth) {
3407 		tcp_lro_queue_mbuf(lc, m);
3408 		return 0;
3409 	}
3410 	return tcp_lro_rx(lc, m, 0);
3411 }
3412 #endif
3413 
3414 static int
3415 hn_rxpkt(struct hn_rx_ring *rxr)
3416 {
3417 	if_t ifp, hn_ifp = rxr->hn_ifp;
3418 	struct mbuf *m_new, *n;
3419 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3420 	int hash_type = M_HASHTYPE_NONE;
3421 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3422 	int i;
3423 
3424 	ifp = hn_ifp;
3425 	if (rxr->hn_rxvf_ifp != NULL) {
3426 		/*
3427 		 * Non-transparent mode VF; pretend this packet is from
3428 		 * the VF.
3429 		 */
3430 		ifp = rxr->hn_rxvf_ifp;
3431 		is_vf = 1;
3432 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3433 		/* Transparent mode VF. */
3434 		is_vf = 1;
3435 	}
3436 
3437 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3438 		/*
3439 		 * NOTE:
3440 		 * See the NOTE of hn_rndis_init_fixat().  This
3441 		 * function can be reached, immediately after the
3442 		 * RNDIS is initialized but before the ifnet is
3443 		 * setup on the hn_attach() path; drop the unexpected
3444 		 * packets.
3445 		 */
3446 		return (0);
3447 	}
3448 
3449 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3450 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3451 		return (0);
3452 	}
3453 
3454 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3455 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3456 		if (m_new == NULL) {
3457 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3458 			return (0);
3459 		}
3460 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3461 		    rxr->rsc.frag_len[0]);
3462 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3463 	} else {
3464 		/*
3465 		 * Get an mbuf with a cluster.  For packets 2K or less,
3466 		 * get a standard 2K cluster.  For anything larger, get a
3467 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3468 		 * if looped around to the Hyper-V TX channel, so avoid them.
3469 		 */
3470 		size = MCLBYTES;
3471 		if (rxr->rsc.pktlen > MCLBYTES) {
3472 			/* 4096 */
3473 			size = MJUMPAGESIZE;
3474 		}
3475 
3476 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3477 		if (m_new == NULL) {
3478 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3479 			return (0);
3480 		}
3481 
3482 		n = m_new;
3483 		for (i = 0; i < rxr->rsc.cnt; i++) {
3484 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3485 			    rxr->rsc.frag_data[i]);
3486 			if (n == NULL) {
3487 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3488 				return (0);
3489 			} else {
3490 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3491 			}
3492 		}
3493 	}
3494 	if (rxr->rsc.pktlen <= MHLEN)
3495 		rxr->hn_small_pkts++;
3496 
3497 	m_new->m_pkthdr.rcvif = ifp;
3498 
3499 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3500 		do_csum = 0;
3501 
3502 	/* receive side checksum offload */
3503 	if (rxr->rsc.csum_info != NULL) {
3504 		/* IP csum offload */
3505 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3506 			m_new->m_pkthdr.csum_flags |=
3507 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3508 			rxr->hn_csum_ip++;
3509 		}
3510 
3511 		/* TCP/UDP csum offload */
3512 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3513 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3514 			m_new->m_pkthdr.csum_flags |=
3515 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3516 			m_new->m_pkthdr.csum_data = 0xffff;
3517 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3518 				rxr->hn_csum_tcp++;
3519 			else
3520 				rxr->hn_csum_udp++;
3521 		}
3522 
3523 		/*
3524 		 * XXX
3525 		 * As of this write (Oct 28th, 2016), host side will turn
3526 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3527 		 * the do_lro setting here is actually _not_ accurate.  We
3528 		 * depend on the RSS hash type check to reset do_lro.
3529 		 */
3530 		if ((*(rxr->rsc.csum_info) &
3531 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3532 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3533 			do_lro = 1;
3534 	} else {
3535 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3536 		if (l3proto == ETHERTYPE_IP) {
3537 			if (l4proto == IPPROTO_TCP) {
3538 				if (do_csum &&
3539 				    (rxr->hn_trust_hcsum &
3540 				     HN_TRUST_HCSUM_TCP)) {
3541 					rxr->hn_csum_trusted++;
3542 					m_new->m_pkthdr.csum_flags |=
3543 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3544 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3545 					m_new->m_pkthdr.csum_data = 0xffff;
3546 				}
3547 				do_lro = 1;
3548 			} else if (l4proto == IPPROTO_UDP) {
3549 				if (do_csum &&
3550 				    (rxr->hn_trust_hcsum &
3551 				     HN_TRUST_HCSUM_UDP)) {
3552 					rxr->hn_csum_trusted++;
3553 					m_new->m_pkthdr.csum_flags |=
3554 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3555 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3556 					m_new->m_pkthdr.csum_data = 0xffff;
3557 				}
3558 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3559 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3560 				rxr->hn_csum_trusted++;
3561 				m_new->m_pkthdr.csum_flags |=
3562 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3563 			}
3564 		}
3565 	}
3566 
3567 	if (rxr->rsc.vlan_info != NULL) {
3568 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3569 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3570 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3571 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3572 		m_new->m_flags |= M_VLANTAG;
3573 	}
3574 
3575 	/*
3576 	 * If VF is activated (tranparent/non-transparent mode does not
3577 	 * matter here).
3578 	 *
3579 	 * - Disable LRO
3580 	 *
3581 	 *   hn(4) will only receive broadcast packets, multicast packets,
3582 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3583 	 *   packet types.
3584 	 *
3585 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3586 	 *   all, since the LRO flush will use hn(4) as the receiving
3587 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3588 	 */
3589 	if (is_vf)
3590 		do_lro = 0;
3591 
3592 	/*
3593 	 * If VF is activated (tranparent/non-transparent mode does not
3594 	 * matter here), do _not_ mess with unsupported hash types or
3595 	 * functions.
3596 	 */
3597 	if (rxr->rsc.hash_info != NULL) {
3598 		rxr->hn_rss_pkts++;
3599 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3600 		if (!is_vf)
3601 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3602 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3603 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3604 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3605 			    rxr->hn_mbuf_hash);
3606 
3607 			/*
3608 			 * NOTE:
3609 			 * do_lro is resetted, if the hash types are not TCP
3610 			 * related.  See the comment in the above csum_flags
3611 			 * setup section.
3612 			 */
3613 			switch (type) {
3614 			case NDIS_HASH_IPV4:
3615 				hash_type = M_HASHTYPE_RSS_IPV4;
3616 				do_lro = 0;
3617 				break;
3618 
3619 			case NDIS_HASH_TCP_IPV4:
3620 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3621 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3622 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3623 
3624 					if (is_vf)
3625 						def_htype = M_HASHTYPE_NONE;
3626 
3627 					/*
3628 					 * UDP 4-tuple hash is delivered as
3629 					 * TCP 4-tuple hash.
3630 					 */
3631 					if (l3proto == ETHERTYPE_MAX) {
3632 						hn_rxpkt_proto(m_new,
3633 						    &l3proto, &l4proto);
3634 					}
3635 					if (l3proto == ETHERTYPE_IP) {
3636 						if (l4proto == IPPROTO_UDP &&
3637 						    (rxr->hn_mbuf_hash &
3638 						     NDIS_HASH_UDP_IPV4_X)) {
3639 							hash_type =
3640 							M_HASHTYPE_RSS_UDP_IPV4;
3641 							do_lro = 0;
3642 						} else if (l4proto !=
3643 						    IPPROTO_TCP) {
3644 							hash_type = def_htype;
3645 							do_lro = 0;
3646 						}
3647 					} else {
3648 						hash_type = def_htype;
3649 						do_lro = 0;
3650 					}
3651 				}
3652 				break;
3653 
3654 			case NDIS_HASH_IPV6:
3655 				hash_type = M_HASHTYPE_RSS_IPV6;
3656 				do_lro = 0;
3657 				break;
3658 
3659 			case NDIS_HASH_IPV6_EX:
3660 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3661 				do_lro = 0;
3662 				break;
3663 
3664 			case NDIS_HASH_TCP_IPV6:
3665 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3666 				break;
3667 
3668 			case NDIS_HASH_TCP_IPV6_EX:
3669 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3670 				break;
3671 			}
3672 		}
3673 	} else if (!is_vf) {
3674 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3675 		hash_type = M_HASHTYPE_OPAQUE;
3676 	}
3677 	M_HASHTYPE_SET(m_new, hash_type);
3678 
3679 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3680 	if (hn_ifp != ifp) {
3681 		const struct ether_header *eh;
3682 
3683 		/*
3684 		 * Non-transparent mode VF is activated.
3685 		 */
3686 
3687 		/*
3688 		 * Allow tapping on hn(4).
3689 		 */
3690 		ETHER_BPF_MTAP(hn_ifp, m_new);
3691 
3692 		/*
3693 		 * Update hn(4)'s stats.
3694 		 */
3695 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3696 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3697 		/* Checked at the beginning of this function. */
3698 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3699 		eh = mtod(m_new, struct ether_header *);
3700 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3701 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3702 	}
3703 	rxr->hn_pkts++;
3704 
3705 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3706 #if defined(INET) || defined(INET6)
3707 		struct lro_ctrl *lro = &rxr->hn_lro;
3708 
3709 		if (lro->lro_cnt) {
3710 			rxr->hn_lro_tried++;
3711 			if (hn_lro_rx(lro, m_new) == 0) {
3712 				/* DONE! */
3713 				return 0;
3714 			}
3715 		}
3716 #endif
3717 	}
3718 	if_input(ifp, m_new);
3719 
3720 	return (0);
3721 }
3722 
3723 static int
3724 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3725 {
3726 	struct hn_softc *sc = if_getsoftc(ifp);
3727 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3728 	if_t vf_ifp;
3729 	int mask, error = 0;
3730 	struct ifrsskey *ifrk;
3731 	struct ifrsshash *ifrh;
3732 	uint32_t mtu;
3733 
3734 	switch (cmd) {
3735 	case SIOCSIFMTU:
3736 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3737 			error = EINVAL;
3738 			break;
3739 		}
3740 
3741 		HN_LOCK(sc);
3742 
3743 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3744 			HN_UNLOCK(sc);
3745 			break;
3746 		}
3747 
3748 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3749 			/* Can't change MTU */
3750 			HN_UNLOCK(sc);
3751 			error = EOPNOTSUPP;
3752 			break;
3753 		}
3754 
3755 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3756 			HN_UNLOCK(sc);
3757 			break;
3758 		}
3759 
3760 		if (hn_xpnt_vf_isready(sc)) {
3761 			vf_ifp = sc->hn_vf_ifp;
3762 			ifr_vf = *ifr;
3763 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3764 			    sizeof(ifr_vf.ifr_name));
3765 			error = ifhwioctl(SIOCSIFMTU,vf_ifp,
3766 			    (caddr_t)&ifr_vf, curthread);
3767 			if (error) {
3768 				HN_UNLOCK(sc);
3769 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3770 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3771 				break;
3772 			}
3773 		}
3774 
3775 		/*
3776 		 * Suspend this interface before the synthetic parts
3777 		 * are ripped.
3778 		 */
3779 		hn_suspend(sc);
3780 
3781 		/*
3782 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3783 		 */
3784 		hn_synth_detach(sc);
3785 
3786 		/*
3787 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3788 		 * with the new MTU setting.
3789 		 */
3790 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3791 		if (error) {
3792 			HN_UNLOCK(sc);
3793 			break;
3794 		}
3795 
3796 		error = hn_rndis_get_mtu(sc, &mtu);
3797 		if (error)
3798 			mtu = ifr->ifr_mtu;
3799 		else if (bootverbose)
3800 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3801 
3802 		/*
3803 		 * Commit the requested MTU, after the synthetic parts
3804 		 * have been successfully attached.
3805 		 */
3806 		if (mtu >= ifr->ifr_mtu) {
3807 			mtu = ifr->ifr_mtu;
3808 		} else {
3809 			if_printf(ifp, "fixup mtu %d -> %u\n",
3810 			    ifr->ifr_mtu, mtu);
3811 		}
3812 		if_setmtu(ifp, mtu);
3813 
3814 		/*
3815 		 * Synthetic parts' reattach may change the chimney
3816 		 * sending size; update it.
3817 		 */
3818 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3819 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3820 
3821 		/*
3822 		 * Make sure that various parameters based on MTU are
3823 		 * still valid, after the MTU change.
3824 		 */
3825 		hn_mtu_change_fixup(sc);
3826 
3827 		/*
3828 		 * All done!  Resume the interface now.
3829 		 */
3830 		hn_resume(sc);
3831 
3832 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3833 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3834 			/*
3835 			 * Since we have reattached the NVS part,
3836 			 * change the datapath to VF again; in case
3837 			 * that it is lost, after the NVS was detached.
3838 			 */
3839 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3840 		}
3841 
3842 		HN_UNLOCK(sc);
3843 		break;
3844 
3845 	case SIOCSIFFLAGS:
3846 		HN_LOCK(sc);
3847 
3848 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3849 			HN_UNLOCK(sc);
3850 			break;
3851 		}
3852 
3853 		if (hn_xpnt_vf_isready(sc))
3854 			hn_xpnt_vf_saveifflags(sc);
3855 
3856 		if (if_getflags(ifp) & IFF_UP) {
3857 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3858 				/*
3859 				 * Caller meight hold mutex, e.g.
3860 				 * bpf; use busy-wait for the RNDIS
3861 				 * reply.
3862 				 */
3863 				HN_NO_SLEEPING(sc);
3864 				hn_rxfilter_config(sc);
3865 				HN_SLEEPING_OK(sc);
3866 
3867 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3868 					error = hn_xpnt_vf_iocsetflags(sc);
3869 			} else {
3870 				hn_init_locked(sc);
3871 			}
3872 		} else {
3873 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3874 				hn_stop(sc, false);
3875 		}
3876 		sc->hn_if_flags = if_getflags(ifp);
3877 
3878 		HN_UNLOCK(sc);
3879 		break;
3880 
3881 	case SIOCSIFCAP:
3882 		HN_LOCK(sc);
3883 
3884 		if (hn_xpnt_vf_isready(sc)) {
3885 			ifr_vf = *ifr;
3886 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3887 			    sizeof(ifr_vf.ifr_name));
3888 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3889 			HN_UNLOCK(sc);
3890 			break;
3891 		}
3892 
3893 		/*
3894 		 * Fix up requested capabilities w/ supported capabilities,
3895 		 * since the supported capabilities could have been changed.
3896 		 */
3897 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3898 		    if_getcapenable(ifp);
3899 
3900 		if (mask & IFCAP_TXCSUM) {
3901 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3902 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3903 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3904 			else
3905 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3906 		}
3907 		if (mask & IFCAP_TXCSUM_IPV6) {
3908 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3909 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3910 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3911 			else
3912 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3913 		}
3914 
3915 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3916 		if (mask & IFCAP_RXCSUM)
3917 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3918 #ifdef foo
3919 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3920 		if (mask & IFCAP_RXCSUM_IPV6)
3921 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3922 #endif
3923 
3924 		if (mask & IFCAP_LRO)
3925 			if_togglecapenable(ifp, IFCAP_LRO);
3926 
3927 		if (mask & IFCAP_TSO4) {
3928 			if_togglecapenable(ifp, IFCAP_TSO4);
3929 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3930 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3931 			else
3932 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3933 		}
3934 		if (mask & IFCAP_TSO6) {
3935 			if_togglecapenable(ifp, IFCAP_TSO6);
3936 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3937 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3938 			else
3939 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3940 		}
3941 
3942 		HN_UNLOCK(sc);
3943 		break;
3944 
3945 	case SIOCADDMULTI:
3946 	case SIOCDELMULTI:
3947 		HN_LOCK(sc);
3948 
3949 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3950 			HN_UNLOCK(sc);
3951 			break;
3952 		}
3953 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3954 			/*
3955 			 * Multicast uses mutex; use busy-wait for
3956 			 * the RNDIS reply.
3957 			 */
3958 			HN_NO_SLEEPING(sc);
3959 			hn_rxfilter_config(sc);
3960 			HN_SLEEPING_OK(sc);
3961 		}
3962 
3963 		/* XXX vlan(4) style mcast addr maintenance */
3964 		if (hn_xpnt_vf_isready(sc)) {
3965 			int old_if_flags;
3966 
3967 			old_if_flags = if_getflags(sc->hn_vf_ifp);
3968 			hn_xpnt_vf_saveifflags(sc);
3969 
3970 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3971 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
3972 			     IFF_ALLMULTI))
3973 				error = hn_xpnt_vf_iocsetflags(sc);
3974 		}
3975 
3976 		HN_UNLOCK(sc);
3977 		break;
3978 
3979 	case SIOCSIFMEDIA:
3980 	case SIOCGIFMEDIA:
3981 		HN_LOCK(sc);
3982 		if (hn_xpnt_vf_isready(sc)) {
3983 			/*
3984 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3985 			 * create and pass ifr_vf to the VF here; just
3986 			 * replace the ifr_name.
3987 			 */
3988 			vf_ifp = sc->hn_vf_ifp;
3989 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
3990 			    sizeof(ifr->ifr_name));
3991 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
3992 			/* Restore the ifr_name. */
3993 			strlcpy(ifr->ifr_name, if_name(ifp),
3994 			    sizeof(ifr->ifr_name));
3995 			HN_UNLOCK(sc);
3996 			break;
3997 		}
3998 		HN_UNLOCK(sc);
3999 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4000 		break;
4001 
4002 	case SIOCGIFRSSHASH:
4003 		ifrh = (struct ifrsshash *)data;
4004 		HN_LOCK(sc);
4005 		if (sc->hn_rx_ring_inuse == 1) {
4006 			HN_UNLOCK(sc);
4007 			ifrh->ifrh_func = RSS_FUNC_NONE;
4008 			ifrh->ifrh_types = 0;
4009 			break;
4010 		}
4011 
4012 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4013 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4014 		else
4015 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4016 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4017 		HN_UNLOCK(sc);
4018 		break;
4019 
4020 	case SIOCGIFRSSKEY:
4021 		ifrk = (struct ifrsskey *)data;
4022 		HN_LOCK(sc);
4023 		if (sc->hn_rx_ring_inuse == 1) {
4024 			HN_UNLOCK(sc);
4025 			ifrk->ifrk_func = RSS_FUNC_NONE;
4026 			ifrk->ifrk_keylen = 0;
4027 			break;
4028 		}
4029 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4030 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4031 		else
4032 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4033 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4034 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4035 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4036 		HN_UNLOCK(sc);
4037 		break;
4038 
4039 	default:
4040 		error = ether_ioctl(ifp, cmd, data);
4041 		break;
4042 	}
4043 	return (error);
4044 }
4045 
4046 static void
4047 hn_stop(struct hn_softc *sc, bool detaching)
4048 {
4049 	if_t ifp = sc->hn_ifp;
4050 	int i;
4051 
4052 	HN_LOCK_ASSERT(sc);
4053 
4054 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4055 	    ("synthetic parts were not attached"));
4056 
4057 	/* Clear RUNNING bit ASAP. */
4058 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4059 
4060 	/* Disable polling. */
4061 	hn_polling(sc, 0);
4062 
4063 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4064 		KASSERT(sc->hn_vf_ifp != NULL,
4065 		    ("%s: VF is not attached", if_name(ifp)));
4066 
4067 		/* Mark transparent mode VF as disabled. */
4068 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4069 
4070 		/*
4071 		 * NOTE:
4072 		 * Datapath setting must happen _before_ bringing
4073 		 * the VF down.
4074 		 */
4075 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4076 
4077 		/*
4078 		 * Bring the VF down.
4079 		 */
4080 		hn_xpnt_vf_saveifflags(sc);
4081 		if_setflagbits(ifp, 0, IFF_UP);
4082 		hn_xpnt_vf_iocsetflags(sc);
4083 	}
4084 
4085 	/* Suspend data transfers. */
4086 	hn_suspend_data(sc);
4087 
4088 	/* Clear OACTIVE bit. */
4089 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4090 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4091 		sc->hn_tx_ring[i].hn_oactive = 0;
4092 
4093 	/*
4094 	 * If the non-transparent mode VF is active, make sure
4095 	 * that the RX filter still allows packet reception.
4096 	 */
4097 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4098 		hn_rxfilter_config(sc);
4099 }
4100 
4101 static void
4102 hn_init_locked(struct hn_softc *sc)
4103 {
4104 	if_t ifp = sc->hn_ifp;
4105 	int i;
4106 
4107 	HN_LOCK_ASSERT(sc);
4108 
4109 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4110 		return;
4111 
4112 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4113 		return;
4114 
4115 	/* Configure RX filter */
4116 	hn_rxfilter_config(sc);
4117 
4118 	/* Clear OACTIVE bit. */
4119 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4120 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4121 		sc->hn_tx_ring[i].hn_oactive = 0;
4122 
4123 	/* Clear TX 'suspended' bit. */
4124 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4125 
4126 	if (hn_xpnt_vf_isready(sc)) {
4127 		/* Initialize transparent VF. */
4128 		hn_xpnt_vf_init(sc);
4129 	}
4130 
4131 	/* Everything is ready; unleash! */
4132 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4133 
4134 	/* Re-enable polling if requested. */
4135 	if (sc->hn_pollhz > 0)
4136 		hn_polling(sc, sc->hn_pollhz);
4137 }
4138 
4139 static void
4140 hn_init(void *xsc)
4141 {
4142 	struct hn_softc *sc = xsc;
4143 
4144 	HN_LOCK(sc);
4145 	hn_init_locked(sc);
4146 	HN_UNLOCK(sc);
4147 }
4148 
4149 static int
4150 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4151 {
4152 	struct hn_softc *sc = arg1;
4153 	unsigned int lenlim;
4154 	int error;
4155 
4156 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4157 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4158 	if (error || req->newptr == NULL)
4159 		return error;
4160 
4161 	HN_LOCK(sc);
4162 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4163 	    lenlim > TCP_LRO_LENGTH_MAX) {
4164 		HN_UNLOCK(sc);
4165 		return EINVAL;
4166 	}
4167 	hn_set_lro_lenlim(sc, lenlim);
4168 	HN_UNLOCK(sc);
4169 
4170 	return 0;
4171 }
4172 
4173 static int
4174 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4175 {
4176 	struct hn_softc *sc = arg1;
4177 	int ackcnt, error, i;
4178 
4179 	/*
4180 	 * lro_ackcnt_lim is append count limit,
4181 	 * +1 to turn it into aggregation limit.
4182 	 */
4183 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4184 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4185 	if (error || req->newptr == NULL)
4186 		return error;
4187 
4188 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4189 		return EINVAL;
4190 
4191 	/*
4192 	 * Convert aggregation limit back to append
4193 	 * count limit.
4194 	 */
4195 	--ackcnt;
4196 	HN_LOCK(sc);
4197 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4198 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4199 	HN_UNLOCK(sc);
4200 	return 0;
4201 }
4202 
4203 static int
4204 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4205 {
4206 	struct hn_softc *sc = arg1;
4207 	int hcsum = arg2;
4208 	int on, error, i;
4209 
4210 	on = 0;
4211 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4212 		on = 1;
4213 
4214 	error = sysctl_handle_int(oidp, &on, 0, req);
4215 	if (error || req->newptr == NULL)
4216 		return error;
4217 
4218 	HN_LOCK(sc);
4219 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4220 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4221 
4222 		if (on)
4223 			rxr->hn_trust_hcsum |= hcsum;
4224 		else
4225 			rxr->hn_trust_hcsum &= ~hcsum;
4226 	}
4227 	HN_UNLOCK(sc);
4228 	return 0;
4229 }
4230 
4231 static int
4232 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4233 {
4234 	struct hn_softc *sc = arg1;
4235 	int chim_size, error;
4236 
4237 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4238 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4239 	if (error || req->newptr == NULL)
4240 		return error;
4241 
4242 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4243 		return EINVAL;
4244 
4245 	HN_LOCK(sc);
4246 	hn_set_chim_size(sc, chim_size);
4247 	HN_UNLOCK(sc);
4248 	return 0;
4249 }
4250 
4251 static int
4252 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4253 {
4254 	struct hn_softc *sc = arg1;
4255 	int ofs = arg2, i, error;
4256 	struct hn_rx_ring *rxr;
4257 	uint64_t stat;
4258 
4259 	stat = 0;
4260 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4261 		rxr = &sc->hn_rx_ring[i];
4262 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4263 	}
4264 
4265 	error = sysctl_handle_64(oidp, &stat, 0, req);
4266 	if (error || req->newptr == NULL)
4267 		return error;
4268 
4269 	/* Zero out this stat. */
4270 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4271 		rxr = &sc->hn_rx_ring[i];
4272 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4273 	}
4274 	return 0;
4275 }
4276 
4277 static int
4278 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4279 {
4280 	struct hn_softc *sc = arg1;
4281 	int ofs = arg2, i, error;
4282 	struct hn_rx_ring *rxr;
4283 	u_long stat;
4284 
4285 	stat = 0;
4286 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4287 		rxr = &sc->hn_rx_ring[i];
4288 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4289 	}
4290 
4291 	error = sysctl_handle_long(oidp, &stat, 0, req);
4292 	if (error || req->newptr == NULL)
4293 		return error;
4294 
4295 	/* Zero out this stat. */
4296 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4297 		rxr = &sc->hn_rx_ring[i];
4298 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4299 	}
4300 	return 0;
4301 }
4302 
4303 static int
4304 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4305 {
4306 	struct hn_softc *sc = arg1;
4307 	int ofs = arg2, i, error;
4308 	struct hn_tx_ring *txr;
4309 	u_long stat;
4310 
4311 	stat = 0;
4312 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4313 		txr = &sc->hn_tx_ring[i];
4314 		stat += *((u_long *)((uint8_t *)txr + ofs));
4315 	}
4316 
4317 	error = sysctl_handle_long(oidp, &stat, 0, req);
4318 	if (error || req->newptr == NULL)
4319 		return error;
4320 
4321 	/* Zero out this stat. */
4322 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4323 		txr = &sc->hn_tx_ring[i];
4324 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4325 	}
4326 	return 0;
4327 }
4328 
4329 static int
4330 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4331 {
4332 	struct hn_softc *sc = arg1;
4333 	int ofs = arg2, i, error, conf;
4334 	struct hn_tx_ring *txr;
4335 
4336 	txr = &sc->hn_tx_ring[0];
4337 	conf = *((int *)((uint8_t *)txr + ofs));
4338 
4339 	error = sysctl_handle_int(oidp, &conf, 0, req);
4340 	if (error || req->newptr == NULL)
4341 		return error;
4342 
4343 	HN_LOCK(sc);
4344 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4345 		txr = &sc->hn_tx_ring[i];
4346 		*((int *)((uint8_t *)txr + ofs)) = conf;
4347 	}
4348 	HN_UNLOCK(sc);
4349 
4350 	return 0;
4351 }
4352 
4353 static int
4354 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4355 {
4356 	struct hn_softc *sc = arg1;
4357 	int error, size;
4358 
4359 	size = sc->hn_agg_size;
4360 	error = sysctl_handle_int(oidp, &size, 0, req);
4361 	if (error || req->newptr == NULL)
4362 		return (error);
4363 
4364 	HN_LOCK(sc);
4365 	sc->hn_agg_size = size;
4366 	hn_set_txagg(sc);
4367 	HN_UNLOCK(sc);
4368 
4369 	return (0);
4370 }
4371 
4372 static int
4373 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4374 {
4375 	struct hn_softc *sc = arg1;
4376 	int error, pkts;
4377 
4378 	pkts = sc->hn_agg_pkts;
4379 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4380 	if (error || req->newptr == NULL)
4381 		return (error);
4382 
4383 	HN_LOCK(sc);
4384 	sc->hn_agg_pkts = pkts;
4385 	hn_set_txagg(sc);
4386 	HN_UNLOCK(sc);
4387 
4388 	return (0);
4389 }
4390 
4391 static int
4392 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4393 {
4394 	struct hn_softc *sc = arg1;
4395 	int pkts;
4396 
4397 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4398 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4399 }
4400 
4401 static int
4402 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4403 {
4404 	struct hn_softc *sc = arg1;
4405 	int align;
4406 
4407 	align = sc->hn_tx_ring[0].hn_agg_align;
4408 	return (sysctl_handle_int(oidp, &align, 0, req));
4409 }
4410 
4411 static void
4412 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4413 {
4414 	if (pollhz == 0)
4415 		vmbus_chan_poll_disable(chan);
4416 	else
4417 		vmbus_chan_poll_enable(chan, pollhz);
4418 }
4419 
4420 static void
4421 hn_polling(struct hn_softc *sc, u_int pollhz)
4422 {
4423 	int nsubch = sc->hn_rx_ring_inuse - 1;
4424 
4425 	HN_LOCK_ASSERT(sc);
4426 
4427 	if (nsubch > 0) {
4428 		struct vmbus_channel **subch;
4429 		int i;
4430 
4431 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4432 		for (i = 0; i < nsubch; ++i)
4433 			hn_chan_polling(subch[i], pollhz);
4434 		vmbus_subchan_rel(subch, nsubch);
4435 	}
4436 	hn_chan_polling(sc->hn_prichan, pollhz);
4437 }
4438 
4439 static int
4440 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4441 {
4442 	struct hn_softc *sc = arg1;
4443 	int pollhz, error;
4444 
4445 	pollhz = sc->hn_pollhz;
4446 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4447 	if (error || req->newptr == NULL)
4448 		return (error);
4449 
4450 	if (pollhz != 0 &&
4451 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4452 		return (EINVAL);
4453 
4454 	HN_LOCK(sc);
4455 	if (sc->hn_pollhz != pollhz) {
4456 		sc->hn_pollhz = pollhz;
4457 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4458 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4459 			hn_polling(sc, sc->hn_pollhz);
4460 	}
4461 	HN_UNLOCK(sc);
4462 
4463 	return (0);
4464 }
4465 
4466 static int
4467 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4468 {
4469 	struct hn_softc *sc = arg1;
4470 	char verstr[16];
4471 
4472 	snprintf(verstr, sizeof(verstr), "%u.%u",
4473 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4474 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4475 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4476 }
4477 
4478 static int
4479 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4480 {
4481 	struct hn_softc *sc = arg1;
4482 	char caps_str[128];
4483 	uint32_t caps;
4484 
4485 	HN_LOCK(sc);
4486 	caps = sc->hn_caps;
4487 	HN_UNLOCK(sc);
4488 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4489 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4490 }
4491 
4492 static int
4493 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 	struct hn_softc *sc = arg1;
4496 	char assist_str[128];
4497 	uint32_t hwassist;
4498 
4499 	HN_LOCK(sc);
4500 	hwassist = if_gethwassist(sc->hn_ifp);
4501 	HN_UNLOCK(sc);
4502 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4503 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4504 }
4505 
4506 static int
4507 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4508 {
4509 	struct hn_softc *sc = arg1;
4510 	char filter_str[128];
4511 	uint32_t filter;
4512 
4513 	HN_LOCK(sc);
4514 	filter = sc->hn_rx_filter;
4515 	HN_UNLOCK(sc);
4516 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4517 	    NDIS_PACKET_TYPES);
4518 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4519 }
4520 
4521 static int
4522 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4523 {
4524 	struct hn_softc *sc = arg1;
4525 	uint32_t mtu;
4526 	int error;
4527 	HN_LOCK(sc);
4528 	error = hn_rndis_get_mtu(sc, &mtu);
4529 	if (error) {
4530 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4531 		goto back;
4532 	}
4533 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4534 	if (error || req->newptr == NULL)
4535 		goto back;
4536 
4537 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4538 	if (error)
4539 		goto back;
4540 	error = hn_rndis_reconf_offload(sc, mtu);
4541 back:
4542 	HN_UNLOCK(sc);
4543 	return (error);
4544 }
4545 #ifndef RSS
4546 
4547 static int
4548 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4549 {
4550 	struct hn_softc *sc = arg1;
4551 	int error;
4552 
4553 	HN_LOCK(sc);
4554 
4555 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4556 	if (error || req->newptr == NULL)
4557 		goto back;
4558 
4559 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4560 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4561 		/*
4562 		 * RSS key is synchronized w/ VF's, don't allow users
4563 		 * to change it.
4564 		 */
4565 		error = EBUSY;
4566 		goto back;
4567 	}
4568 
4569 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4570 	if (error)
4571 		goto back;
4572 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4573 
4574 	if (sc->hn_rx_ring_inuse > 1) {
4575 		error = hn_rss_reconfig(sc);
4576 	} else {
4577 		/* Not RSS capable, at least for now; just save the RSS key. */
4578 		error = 0;
4579 	}
4580 back:
4581 	HN_UNLOCK(sc);
4582 	return (error);
4583 }
4584 
4585 static int
4586 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4587 {
4588 	struct hn_softc *sc = arg1;
4589 	int error;
4590 
4591 	HN_LOCK(sc);
4592 
4593 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4594 	if (error || req->newptr == NULL)
4595 		goto back;
4596 
4597 	/*
4598 	 * Don't allow RSS indirect table change, if this interface is not
4599 	 * RSS capable currently.
4600 	 */
4601 	if (sc->hn_rx_ring_inuse == 1) {
4602 		error = EOPNOTSUPP;
4603 		goto back;
4604 	}
4605 
4606 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4607 	if (error)
4608 		goto back;
4609 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4610 
4611 	hn_rss_ind_fixup(sc);
4612 	error = hn_rss_reconfig(sc);
4613 back:
4614 	HN_UNLOCK(sc);
4615 	return (error);
4616 }
4617 
4618 #endif	/* !RSS */
4619 
4620 static int
4621 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4622 {
4623 	struct hn_softc *sc = arg1;
4624 	char hash_str[128];
4625 	uint32_t hash;
4626 
4627 	HN_LOCK(sc);
4628 	hash = sc->hn_rss_hash;
4629 	HN_UNLOCK(sc);
4630 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4631 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4632 }
4633 
4634 static int
4635 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4636 {
4637 	struct hn_softc *sc = arg1;
4638 	char hash_str[128];
4639 	uint32_t hash;
4640 
4641 	HN_LOCK(sc);
4642 	hash = sc->hn_rss_hcap;
4643 	HN_UNLOCK(sc);
4644 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4645 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4646 }
4647 
4648 static int
4649 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4650 {
4651 	struct hn_softc *sc = arg1;
4652 	char hash_str[128];
4653 	uint32_t hash;
4654 
4655 	HN_LOCK(sc);
4656 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4657 	HN_UNLOCK(sc);
4658 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4659 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4660 }
4661 
4662 static int
4663 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4664 {
4665 	struct hn_softc *sc = arg1;
4666 	char vf_name[IFNAMSIZ + 1];
4667 	if_t vf_ifp;
4668 
4669 	HN_LOCK(sc);
4670 	vf_name[0] = '\0';
4671 	vf_ifp = sc->hn_vf_ifp;
4672 	if (vf_ifp != NULL)
4673 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4674 	HN_UNLOCK(sc);
4675 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4676 }
4677 
4678 static int
4679 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4680 {
4681 	struct hn_softc *sc = arg1;
4682 	char vf_name[IFNAMSIZ + 1];
4683 	if_t vf_ifp;
4684 
4685 	HN_LOCK(sc);
4686 	vf_name[0] = '\0';
4687 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4688 	if (vf_ifp != NULL)
4689 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4690 	HN_UNLOCK(sc);
4691 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4692 }
4693 
4694 static int
4695 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4696 {
4697 	struct rm_priotracker pt;
4698 	struct sbuf *sb;
4699 	int error, i;
4700 	bool first;
4701 
4702 	error = sysctl_wire_old_buffer(req, 0);
4703 	if (error != 0)
4704 		return (error);
4705 
4706 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4707 	if (sb == NULL)
4708 		return (ENOMEM);
4709 
4710 	rm_rlock(&hn_vfmap_lock, &pt);
4711 
4712 	first = true;
4713 	for (i = 0; i < hn_vfmap_size; ++i) {
4714 		struct epoch_tracker et;
4715 		if_t ifp;
4716 
4717 		if (hn_vfmap[i] == NULL)
4718 			continue;
4719 
4720 		NET_EPOCH_ENTER(et);
4721 		ifp = ifnet_byindex(i);
4722 		if (ifp != NULL) {
4723 			if (first)
4724 				sbuf_printf(sb, "%s", if_name(ifp));
4725 			else
4726 				sbuf_printf(sb, " %s", if_name(ifp));
4727 			first = false;
4728 		}
4729 		NET_EPOCH_EXIT(et);
4730 	}
4731 
4732 	rm_runlock(&hn_vfmap_lock, &pt);
4733 
4734 	error = sbuf_finish(sb);
4735 	sbuf_delete(sb);
4736 	return (error);
4737 }
4738 
4739 static int
4740 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4741 {
4742 	struct rm_priotracker pt;
4743 	struct sbuf *sb;
4744 	int error, i;
4745 	bool first;
4746 
4747 	error = sysctl_wire_old_buffer(req, 0);
4748 	if (error != 0)
4749 		return (error);
4750 
4751 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4752 	if (sb == NULL)
4753 		return (ENOMEM);
4754 
4755 	rm_rlock(&hn_vfmap_lock, &pt);
4756 
4757 	first = true;
4758 	for (i = 0; i < hn_vfmap_size; ++i) {
4759 		struct epoch_tracker et;
4760 		if_t ifp, hn_ifp;
4761 
4762 		hn_ifp = hn_vfmap[i];
4763 		if (hn_ifp == NULL)
4764 			continue;
4765 
4766 		NET_EPOCH_ENTER(et);
4767 		ifp = ifnet_byindex(i);
4768 		if (ifp != NULL) {
4769 			if (first) {
4770 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4771 				    if_name(hn_ifp));
4772 			} else {
4773 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4774 				    if_name(hn_ifp));
4775 			}
4776 			first = false;
4777 		}
4778 		NET_EPOCH_EXIT(et);
4779 	}
4780 
4781 	rm_runlock(&hn_vfmap_lock, &pt);
4782 
4783 	error = sbuf_finish(sb);
4784 	sbuf_delete(sb);
4785 	return (error);
4786 }
4787 
4788 static int
4789 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4790 {
4791 	struct hn_softc *sc = arg1;
4792 	int error, onoff = 0;
4793 
4794 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4795 		onoff = 1;
4796 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4797 	if (error || req->newptr == NULL)
4798 		return (error);
4799 
4800 	HN_LOCK(sc);
4801 	/* NOTE: hn_vf_lock for hn_transmit() */
4802 	rm_wlock(&sc->hn_vf_lock);
4803 	if (onoff)
4804 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4805 	else
4806 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4807 	rm_wunlock(&sc->hn_vf_lock);
4808 	HN_UNLOCK(sc);
4809 
4810 	return (0);
4811 }
4812 
4813 static int
4814 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4815 {
4816 	struct hn_softc *sc = arg1;
4817 	int enabled = 0;
4818 
4819 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4820 		enabled = 1;
4821 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4822 }
4823 
4824 static int
4825 hn_check_iplen(const struct mbuf *m, int hoff)
4826 {
4827 	const struct ip *ip;
4828 	int len, iphlen, iplen;
4829 	const struct tcphdr *th;
4830 	int thoff;				/* TCP data offset */
4831 
4832 	len = hoff + sizeof(struct ip);
4833 
4834 	/* The packet must be at least the size of an IP header. */
4835 	if (m->m_pkthdr.len < len)
4836 		return IPPROTO_DONE;
4837 
4838 	/* The fixed IP header must reside completely in the first mbuf. */
4839 	if (m->m_len < len)
4840 		return IPPROTO_DONE;
4841 
4842 	ip = mtodo(m, hoff);
4843 
4844 	/* Bound check the packet's stated IP header length. */
4845 	iphlen = ip->ip_hl << 2;
4846 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4847 		return IPPROTO_DONE;
4848 
4849 	/* The full IP header must reside completely in the one mbuf. */
4850 	if (m->m_len < hoff + iphlen)
4851 		return IPPROTO_DONE;
4852 
4853 	iplen = ntohs(ip->ip_len);
4854 
4855 	/*
4856 	 * Check that the amount of data in the buffers is as
4857 	 * at least much as the IP header would have us expect.
4858 	 */
4859 	if (m->m_pkthdr.len < hoff + iplen)
4860 		return IPPROTO_DONE;
4861 
4862 	/*
4863 	 * Ignore IP fragments.
4864 	 */
4865 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4866 		return IPPROTO_DONE;
4867 
4868 	/*
4869 	 * The TCP/IP or UDP/IP header must be entirely contained within
4870 	 * the first fragment of a packet.
4871 	 */
4872 	switch (ip->ip_p) {
4873 	case IPPROTO_TCP:
4874 		if (iplen < iphlen + sizeof(struct tcphdr))
4875 			return IPPROTO_DONE;
4876 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4877 			return IPPROTO_DONE;
4878 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4879 		thoff = th->th_off << 2;
4880 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4881 			return IPPROTO_DONE;
4882 		if (m->m_len < hoff + iphlen + thoff)
4883 			return IPPROTO_DONE;
4884 		break;
4885 	case IPPROTO_UDP:
4886 		if (iplen < iphlen + sizeof(struct udphdr))
4887 			return IPPROTO_DONE;
4888 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4889 			return IPPROTO_DONE;
4890 		break;
4891 	default:
4892 		if (iplen < iphlen)
4893 			return IPPROTO_DONE;
4894 		break;
4895 	}
4896 	return ip->ip_p;
4897 }
4898 
4899 static void
4900 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4901 {
4902 	const struct ether_header *eh;
4903 	uint16_t etype;
4904 	int hoff;
4905 
4906 	hoff = sizeof(*eh);
4907 	/* Checked at the beginning of this function. */
4908 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4909 
4910 	eh = mtod(m_new, const struct ether_header *);
4911 	etype = ntohs(eh->ether_type);
4912 	if (etype == ETHERTYPE_VLAN) {
4913 		const struct ether_vlan_header *evl;
4914 
4915 		hoff = sizeof(*evl);
4916 		if (m_new->m_len < hoff)
4917 			return;
4918 		evl = mtod(m_new, const struct ether_vlan_header *);
4919 		etype = ntohs(evl->evl_proto);
4920 	}
4921 	*l3proto = etype;
4922 
4923 	if (etype == ETHERTYPE_IP)
4924 		*l4proto = hn_check_iplen(m_new, hoff);
4925 	else
4926 		*l4proto = IPPROTO_DONE;
4927 }
4928 
4929 static int
4930 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4931 {
4932 	struct sysctl_oid_list *child;
4933 	struct sysctl_ctx_list *ctx;
4934 	device_t dev = sc->hn_dev;
4935 #if defined(INET) || defined(INET6)
4936 	int lroent_cnt;
4937 #endif
4938 	int i;
4939 
4940 	/*
4941 	 * Create RXBUF for reception.
4942 	 *
4943 	 * NOTE:
4944 	 * - It is shared by all channels.
4945 	 * - A large enough buffer is allocated, certain version of NVSes
4946 	 *   may further limit the usable space.
4947 	 */
4948 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4949 	    0ul, ~0ul, PAGE_SIZE, 0);
4950 	if (sc->hn_rxbuf == NULL) {
4951 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4952 		return (ENOMEM);
4953 	}
4954 
4955 	sc->hn_rx_ring_cnt = ring_cnt;
4956 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4957 
4958 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4959 	    M_DEVBUF, M_WAITOK | M_ZERO);
4960 
4961 #if defined(INET) || defined(INET6)
4962 	lroent_cnt = hn_lro_entry_count;
4963 	if (lroent_cnt < TCP_LRO_ENTRIES)
4964 		lroent_cnt = TCP_LRO_ENTRIES;
4965 	if (bootverbose)
4966 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4967 #endif	/* INET || INET6 */
4968 
4969 	ctx = device_get_sysctl_ctx(dev);
4970 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4971 
4972 	/* Create dev.hn.UNIT.rx sysctl tree */
4973 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4974 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4975 
4976 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4977 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4978 
4979 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
4980 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
4981 		if (rxr->hn_br == NULL) {
4982 			device_printf(dev, "allocate bufring failed\n");
4983 			return (ENOMEM);
4984 		}
4985 
4986 		if (hn_trust_hosttcp)
4987 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4988 		if (hn_trust_hostudp)
4989 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4990 		if (hn_trust_hostip)
4991 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4992 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4993 		rxr->hn_ifp = sc->hn_ifp;
4994 		if (i < sc->hn_tx_ring_cnt)
4995 			rxr->hn_txr = &sc->hn_tx_ring[i];
4996 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4997 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4998 		rxr->hn_rx_idx = i;
4999 		rxr->hn_rxbuf = sc->hn_rxbuf;
5000 
5001 		/*
5002 		 * Initialize LRO.
5003 		 */
5004 #if defined(INET) || defined(INET6)
5005 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5006 		    hn_lro_mbufq_depth);
5007 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5008 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5009 #endif	/* INET || INET6 */
5010 
5011 		if (sc->hn_rx_sysctl_tree != NULL) {
5012 			char name[16];
5013 
5014 			/*
5015 			 * Create per RX ring sysctl tree:
5016 			 * dev.hn.UNIT.rx.RINGID
5017 			 */
5018 			snprintf(name, sizeof(name), "%d", i);
5019 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5020 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5021 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5022 
5023 			if (rxr->hn_rx_sysctl_tree != NULL) {
5024 				SYSCTL_ADD_ULONG(ctx,
5025 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5026 				    OID_AUTO, "packets",
5027 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5028 				    "# of packets received");
5029 				SYSCTL_ADD_ULONG(ctx,
5030 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5031 				    OID_AUTO, "rss_pkts",
5032 				    CTLFLAG_RW | CTLFLAG_STATS,
5033 				    &rxr->hn_rss_pkts,
5034 				    "# of packets w/ RSS info received");
5035 				SYSCTL_ADD_ULONG(ctx,
5036 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5037 				    OID_AUTO, "rsc_pkts",
5038 				    CTLFLAG_RW | CTLFLAG_STATS,
5039 				    &rxr->hn_rsc_pkts,
5040 				    "# of RSC packets received");
5041 				SYSCTL_ADD_ULONG(ctx,
5042 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5043 				    OID_AUTO, "rsc_drop",
5044 				    CTLFLAG_RW | CTLFLAG_STATS,
5045 				    &rxr->hn_rsc_drop,
5046 				    "# of RSC fragments dropped");
5047 				SYSCTL_ADD_INT(ctx,
5048 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5049 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5050 				    &rxr->hn_pktbuf_len, 0,
5051 				    "Temporary channel packet buffer length");
5052 			}
5053 		}
5054 	}
5055 
5056 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5057 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5058 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5059 	    hn_rx_stat_u64_sysctl,
5060 	    "LU", "LRO queued");
5061 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5062 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5063 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5064 	    hn_rx_stat_u64_sysctl,
5065 	    "LU", "LRO flushed");
5066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5067 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5068 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5069 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5070 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5071 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5072 	    hn_lro_lenlim_sysctl, "IU",
5073 	    "Max # of data bytes to be aggregated by LRO");
5074 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5075 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5076 	    hn_lro_ackcnt_sysctl, "I",
5077 	    "Max # of ACKs to be aggregated by LRO");
5078 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5079 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5080 	    hn_trust_hcsum_sysctl, "I",
5081 	    "Trust tcp segment verification on host side, "
5082 	    "when csum info is missing");
5083 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5084 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5085 	    hn_trust_hcsum_sysctl, "I",
5086 	    "Trust udp datagram verification on host side, "
5087 	    "when csum info is missing");
5088 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5089 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5090 	    hn_trust_hcsum_sysctl, "I",
5091 	    "Trust ip packet verification on host side, "
5092 	    "when csum info is missing");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5094 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5095 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5096 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5098 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5099 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5100 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5101 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5102 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5103 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5104 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5105 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5106 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5107 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5108 	    hn_rx_stat_ulong_sysctl, "LU",
5109 	    "# of packets that we trust host's csum verification");
5110 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5111 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5112 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5113 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5114 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5115 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5116 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5117 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5118 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5119 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5120 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5121 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5122 
5123 	return (0);
5124 }
5125 
5126 static void
5127 hn_destroy_rx_data(struct hn_softc *sc)
5128 {
5129 	int i;
5130 
5131 	if (sc->hn_rxbuf != NULL) {
5132 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5133 			contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF);
5134 		else
5135 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5136 		sc->hn_rxbuf = NULL;
5137 	}
5138 
5139 	if (sc->hn_rx_ring_cnt == 0)
5140 		return;
5141 
5142 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5143 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5144 
5145 		if (rxr->hn_br == NULL)
5146 			continue;
5147 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5148 			contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE,
5149 			    M_DEVBUF);
5150 		} else {
5151 			device_printf(sc->hn_dev,
5152 			    "%dth channel bufring is referenced", i);
5153 		}
5154 		rxr->hn_br = NULL;
5155 
5156 #if defined(INET) || defined(INET6)
5157 		tcp_lro_free(&rxr->hn_lro);
5158 #endif
5159 		free(rxr->hn_pktbuf, M_DEVBUF);
5160 	}
5161 	free(sc->hn_rx_ring, M_DEVBUF);
5162 	sc->hn_rx_ring = NULL;
5163 
5164 	sc->hn_rx_ring_cnt = 0;
5165 	sc->hn_rx_ring_inuse = 0;
5166 }
5167 
5168 static int
5169 hn_tx_ring_create(struct hn_softc *sc, int id)
5170 {
5171 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5172 	device_t dev = sc->hn_dev;
5173 	bus_dma_tag_t parent_dtag;
5174 	int error, i;
5175 
5176 	txr->hn_sc = sc;
5177 	txr->hn_tx_idx = id;
5178 
5179 #ifndef HN_USE_TXDESC_BUFRING
5180 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5181 #endif
5182 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5183 
5184 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5185 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5186 	    M_DEVBUF, M_WAITOK | M_ZERO);
5187 #ifndef HN_USE_TXDESC_BUFRING
5188 	SLIST_INIT(&txr->hn_txlist);
5189 #else
5190 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5191 	    M_WAITOK, &txr->hn_tx_lock);
5192 #endif
5193 
5194 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5195 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5196 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5197 	} else {
5198 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5199 	}
5200 
5201 #ifdef HN_IFSTART_SUPPORT
5202 	if (hn_use_if_start) {
5203 		txr->hn_txeof = hn_start_txeof;
5204 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5205 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5206 	} else
5207 #endif
5208 	{
5209 		int br_depth;
5210 
5211 		txr->hn_txeof = hn_xmit_txeof;
5212 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5213 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5214 
5215 		br_depth = hn_get_txswq_depth(txr);
5216 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5217 		    M_WAITOK, &txr->hn_tx_lock);
5218 	}
5219 
5220 	txr->hn_direct_tx_size = hn_direct_tx_size;
5221 
5222 	/*
5223 	 * Always schedule transmission instead of trying to do direct
5224 	 * transmission.  This one gives the best performance so far.
5225 	 */
5226 	txr->hn_sched_tx = 1;
5227 
5228 	parent_dtag = bus_get_dma_tag(dev);
5229 
5230 	/* DMA tag for RNDIS packet messages. */
5231 	error = bus_dma_tag_create(parent_dtag, /* parent */
5232 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5233 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5234 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5235 	    BUS_SPACE_MAXADDR,		/* highaddr */
5236 	    NULL, NULL,			/* filter, filterarg */
5237 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5238 	    1,				/* nsegments */
5239 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5240 	    0,				/* flags */
5241 	    NULL,			/* lockfunc */
5242 	    NULL,			/* lockfuncarg */
5243 	    &txr->hn_tx_rndis_dtag);
5244 	if (error) {
5245 		device_printf(dev, "failed to create rndis dmatag\n");
5246 		return error;
5247 	}
5248 
5249 	/* DMA tag for data. */
5250 	error = bus_dma_tag_create(parent_dtag, /* parent */
5251 	    1,				/* alignment */
5252 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5253 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5254 	    BUS_SPACE_MAXADDR,		/* highaddr */
5255 	    NULL, NULL,			/* filter, filterarg */
5256 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5257 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5258 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5259 	    0,				/* flags */
5260 	    NULL,			/* lockfunc */
5261 	    NULL,			/* lockfuncarg */
5262 	    &txr->hn_tx_data_dtag);
5263 	if (error) {
5264 		device_printf(dev, "failed to create data dmatag\n");
5265 		return error;
5266 	}
5267 
5268 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5269 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5270 
5271 		txd->txr = txr;
5272 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5273 		STAILQ_INIT(&txd->agg_list);
5274 
5275 		/*
5276 		 * Allocate and load RNDIS packet message.
5277 		 */
5278         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5279 		    (void **)&txd->rndis_pkt,
5280 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5281 		    &txd->rndis_pkt_dmap);
5282 		if (error) {
5283 			device_printf(dev,
5284 			    "failed to allocate rndis_packet_msg, %d\n", i);
5285 			return error;
5286 		}
5287 
5288 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5289 		    txd->rndis_pkt_dmap,
5290 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5291 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5292 		    BUS_DMA_NOWAIT);
5293 		if (error) {
5294 			device_printf(dev,
5295 			    "failed to load rndis_packet_msg, %d\n", i);
5296 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5297 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5298 			return error;
5299 		}
5300 
5301 		/* DMA map for TX data. */
5302 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5303 		    &txd->data_dmap);
5304 		if (error) {
5305 			device_printf(dev,
5306 			    "failed to allocate tx data dmamap\n");
5307 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5308 			    txd->rndis_pkt_dmap);
5309 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5310 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5311 			return error;
5312 		}
5313 
5314 		/* All set, put it to list */
5315 		txd->flags |= HN_TXD_FLAG_ONLIST;
5316 #ifndef HN_USE_TXDESC_BUFRING
5317 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5318 #else
5319 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5320 #endif
5321 	}
5322 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5323 
5324 	if (sc->hn_tx_sysctl_tree != NULL) {
5325 		struct sysctl_oid_list *child;
5326 		struct sysctl_ctx_list *ctx;
5327 		char name[16];
5328 
5329 		/*
5330 		 * Create per TX ring sysctl tree:
5331 		 * dev.hn.UNIT.tx.RINGID
5332 		 */
5333 		ctx = device_get_sysctl_ctx(dev);
5334 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5335 
5336 		snprintf(name, sizeof(name), "%d", id);
5337 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5338 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5339 
5340 		if (txr->hn_tx_sysctl_tree != NULL) {
5341 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5342 
5343 #ifdef HN_DEBUG
5344 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5345 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5346 			    "# of available TX descs");
5347 #endif
5348 #ifdef HN_IFSTART_SUPPORT
5349 			if (!hn_use_if_start)
5350 #endif
5351 			{
5352 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5353 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5354 				    "over active");
5355 			}
5356 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5357 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5358 			    "# of packets transmitted");
5359 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5360 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5361 			    "# of sends");
5362 		}
5363 	}
5364 
5365 	return 0;
5366 }
5367 
5368 static void
5369 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5370 {
5371 	struct hn_tx_ring *txr = txd->txr;
5372 
5373 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5374 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5375 
5376 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5377 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5378 	    txd->rndis_pkt_dmap);
5379 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5380 }
5381 
5382 static void
5383 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5384 {
5385 
5386 	KASSERT(txd->refs == 0 || txd->refs == 1,
5387 	    ("invalid txd refs %d", txd->refs));
5388 
5389 	/* Aggregated txds will be freed by their aggregating txd. */
5390 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5391 		int freed __diagused;
5392 
5393 		freed = hn_txdesc_put(txr, txd);
5394 		KASSERT(freed, ("can't free txdesc"));
5395 	}
5396 }
5397 
5398 static void
5399 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5400 {
5401 	int i;
5402 
5403 	if (txr->hn_txdesc == NULL)
5404 		return;
5405 
5406 	/*
5407 	 * NOTE:
5408 	 * Because the freeing of aggregated txds will be deferred
5409 	 * to the aggregating txd, two passes are used here:
5410 	 * - The first pass GCes any pending txds.  This GC is necessary,
5411 	 *   since if the channels are revoked, hypervisor will not
5412 	 *   deliver send-done for all pending txds.
5413 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5414 	 *   were freed.
5415 	 */
5416 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5417 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5418 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5419 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5420 
5421 	if (txr->hn_tx_data_dtag != NULL)
5422 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5423 	if (txr->hn_tx_rndis_dtag != NULL)
5424 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5425 
5426 #ifdef HN_USE_TXDESC_BUFRING
5427 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5428 #endif
5429 
5430 	free(txr->hn_txdesc, M_DEVBUF);
5431 	txr->hn_txdesc = NULL;
5432 
5433 	if (txr->hn_mbuf_br != NULL)
5434 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5435 
5436 #ifndef HN_USE_TXDESC_BUFRING
5437 	mtx_destroy(&txr->hn_txlist_spin);
5438 #endif
5439 	mtx_destroy(&txr->hn_tx_lock);
5440 }
5441 
5442 static int
5443 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5444 {
5445 	struct sysctl_oid_list *child;
5446 	struct sysctl_ctx_list *ctx;
5447 	int i;
5448 
5449 	/*
5450 	 * Create TXBUF for chimney sending.
5451 	 *
5452 	 * NOTE: It is shared by all channels.
5453 	 */
5454 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5455 	    0ul, ~0ul, PAGE_SIZE, 0);
5456 	if (sc->hn_chim == NULL) {
5457 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5458 		return (ENOMEM);
5459 	}
5460 
5461 	sc->hn_tx_ring_cnt = ring_cnt;
5462 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5463 
5464 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5465 	    M_DEVBUF, M_WAITOK | M_ZERO);
5466 
5467 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5468 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5469 
5470 	/* Create dev.hn.UNIT.tx sysctl tree */
5471 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5472 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5473 
5474 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5475 		int error;
5476 
5477 		error = hn_tx_ring_create(sc, i);
5478 		if (error)
5479 			return error;
5480 	}
5481 
5482 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5483 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5484 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5485 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5486 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5487 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5488 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5489 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5490 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5491 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5492 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5493 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5495 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5496 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5497 	    hn_tx_stat_ulong_sysctl, "LU",
5498 	    "# of packet transmission aggregation flush failure");
5499 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5500 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5501 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5502 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5503 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5504 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5505 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5506 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5507 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5508 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5509 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5510 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5511 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5512 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5513 	    "# of total TX descs");
5514 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5515 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5516 	    "Chimney send packet size upper boundary");
5517 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5518 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5519 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5520 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5521 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5522 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5523 	    hn_tx_conf_int_sysctl, "I",
5524 	    "Size of the packet for direct transmission");
5525 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5526 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5527 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5528 	    hn_tx_conf_int_sysctl, "I",
5529 	    "Always schedule transmission "
5530 	    "instead of doing direct transmission");
5531 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5532 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5533 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5534 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5535 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5536 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5537 	    "Applied packet transmission aggregation size");
5538 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5539 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5540 	    hn_txagg_pktmax_sysctl, "I",
5541 	    "Applied packet transmission aggregation packets");
5542 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5543 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5544 	    hn_txagg_align_sysctl, "I",
5545 	    "Applied packet transmission aggregation alignment");
5546 
5547 	return 0;
5548 }
5549 
5550 static void
5551 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5552 {
5553 	int i;
5554 
5555 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5556 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5557 }
5558 
5559 static void
5560 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5561 {
5562 	if_t ifp = sc->hn_ifp;
5563 	u_int hw_tsomax;
5564 	int tso_minlen;
5565 
5566 	HN_LOCK_ASSERT(sc);
5567 
5568 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5569 		return;
5570 
5571 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5572 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5573 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5574 
5575 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5576 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5577 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5578 
5579 	if (tso_maxlen < tso_minlen)
5580 		tso_maxlen = tso_minlen;
5581 	else if (tso_maxlen > IP_MAXPACKET)
5582 		tso_maxlen = IP_MAXPACKET;
5583 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5584 		tso_maxlen = sc->hn_ndis_tso_szmax;
5585 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5586 
5587 	if (hn_xpnt_vf_isready(sc)) {
5588 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5589 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5590 	}
5591 	if_sethwtsomax(ifp, hw_tsomax);
5592 	if (bootverbose)
5593 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5594 }
5595 
5596 static void
5597 hn_fixup_tx_data(struct hn_softc *sc)
5598 {
5599 	uint64_t csum_assist;
5600 	int i;
5601 
5602 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5603 	if (hn_tx_chimney_size > 0 &&
5604 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5605 		hn_set_chim_size(sc, hn_tx_chimney_size);
5606 
5607 	csum_assist = 0;
5608 	if (sc->hn_caps & HN_CAP_IPCS)
5609 		csum_assist |= CSUM_IP;
5610 	if (sc->hn_caps & HN_CAP_TCP4CS)
5611 		csum_assist |= CSUM_IP_TCP;
5612 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5613 		csum_assist |= CSUM_IP_UDP;
5614 	if (sc->hn_caps & HN_CAP_TCP6CS)
5615 		csum_assist |= CSUM_IP6_TCP;
5616 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5617 		csum_assist |= CSUM_IP6_UDP;
5618 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5619 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5620 
5621 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5622 		/*
5623 		 * Support HASHVAL pktinfo on TX path.
5624 		 */
5625 		if (bootverbose)
5626 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5627 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5628 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5629 	}
5630 }
5631 
5632 static void
5633 hn_fixup_rx_data(struct hn_softc *sc)
5634 {
5635 
5636 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5637 		int i;
5638 
5639 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5640 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5641 	}
5642 }
5643 
5644 static void
5645 hn_destroy_tx_data(struct hn_softc *sc)
5646 {
5647 	int i;
5648 
5649 	if (sc->hn_chim != NULL) {
5650 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5651 			contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF);
5652 		} else {
5653 			device_printf(sc->hn_dev,
5654 			    "chimney sending buffer is referenced");
5655 		}
5656 		sc->hn_chim = NULL;
5657 	}
5658 
5659 	if (sc->hn_tx_ring_cnt == 0)
5660 		return;
5661 
5662 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5663 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5664 
5665 	free(sc->hn_tx_ring, M_DEVBUF);
5666 	sc->hn_tx_ring = NULL;
5667 
5668 	sc->hn_tx_ring_cnt = 0;
5669 	sc->hn_tx_ring_inuse = 0;
5670 }
5671 
5672 #ifdef HN_IFSTART_SUPPORT
5673 
5674 static void
5675 hn_start_taskfunc(void *xtxr, int pending __unused)
5676 {
5677 	struct hn_tx_ring *txr = xtxr;
5678 
5679 	mtx_lock(&txr->hn_tx_lock);
5680 	hn_start_locked(txr, 0);
5681 	mtx_unlock(&txr->hn_tx_lock);
5682 }
5683 
5684 static int
5685 hn_start_locked(struct hn_tx_ring *txr, int len)
5686 {
5687 	struct hn_softc *sc = txr->hn_sc;
5688 	if_t ifp = sc->hn_ifp;
5689 	int sched = 0;
5690 
5691 	KASSERT(hn_use_if_start,
5692 	    ("hn_start_locked is called, when if_start is disabled"));
5693 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5694 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5695 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5696 
5697 	if (__predict_false(txr->hn_suspended))
5698 		return (0);
5699 
5700 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5701 	    IFF_DRV_RUNNING)
5702 		return (0);
5703 
5704 	while (!if_sendq_empty(ifp)) {
5705 		struct hn_txdesc *txd;
5706 		struct mbuf *m_head;
5707 		int error;
5708 
5709 		m_head = if_dequeue(ifp);
5710 		if (m_head == NULL)
5711 			break;
5712 
5713 		if (len > 0 && m_head->m_pkthdr.len > len) {
5714 			/*
5715 			 * This sending could be time consuming; let callers
5716 			 * dispatch this packet sending (and sending of any
5717 			 * following up packets) to tx taskqueue.
5718 			 */
5719 			if_sendq_prepend(ifp, m_head);
5720 			sched = 1;
5721 			break;
5722 		}
5723 
5724 #if defined(INET6) || defined(INET)
5725 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5726 			m_head = hn_tso_fixup(m_head);
5727 			if (__predict_false(m_head == NULL)) {
5728 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5729 				continue;
5730 			}
5731 		} else if (m_head->m_pkthdr.csum_flags &
5732 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5733 			m_head = hn_set_hlen(m_head);
5734 			if (__predict_false(m_head == NULL)) {
5735 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5736 				continue;
5737 			}
5738 		}
5739 #endif
5740 
5741 		txd = hn_txdesc_get(txr);
5742 		if (txd == NULL) {
5743 			txr->hn_no_txdescs++;
5744 			if_sendq_prepend(ifp, m_head);
5745 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5746 			break;
5747 		}
5748 
5749 		error = hn_encap(ifp, txr, txd, &m_head);
5750 		if (error) {
5751 			/* Both txd and m_head are freed */
5752 			KASSERT(txr->hn_agg_txd == NULL,
5753 			    ("encap failed w/ pending aggregating txdesc"));
5754 			continue;
5755 		}
5756 
5757 		if (txr->hn_agg_pktleft == 0) {
5758 			if (txr->hn_agg_txd != NULL) {
5759 				KASSERT(m_head == NULL,
5760 				    ("pending mbuf for aggregating txdesc"));
5761 				error = hn_flush_txagg(ifp, txr);
5762 				if (__predict_false(error)) {
5763 					if_setdrvflagbits(ifp,
5764 					    IFF_DRV_OACTIVE, 0);
5765 					break;
5766 				}
5767 			} else {
5768 				KASSERT(m_head != NULL, ("mbuf was freed"));
5769 				error = hn_txpkt(ifp, txr, txd);
5770 				if (__predict_false(error)) {
5771 					/* txd is freed, but m_head is not */
5772 					if_sendq_prepend(ifp, m_head);
5773 					if_setdrvflagbits(ifp,
5774 					    IFF_DRV_OACTIVE, 0);
5775 					break;
5776 				}
5777 			}
5778 		}
5779 #ifdef INVARIANTS
5780 		else {
5781 			KASSERT(txr->hn_agg_txd != NULL,
5782 			    ("no aggregating txdesc"));
5783 			KASSERT(m_head == NULL,
5784 			    ("pending mbuf for aggregating txdesc"));
5785 		}
5786 #endif
5787 	}
5788 
5789 	/* Flush pending aggerated transmission. */
5790 	if (txr->hn_agg_txd != NULL)
5791 		hn_flush_txagg(ifp, txr);
5792 	return (sched);
5793 }
5794 
5795 static void
5796 hn_start(if_t ifp)
5797 {
5798 	struct hn_softc *sc = if_getsoftc(ifp);
5799 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5800 
5801 	if (txr->hn_sched_tx)
5802 		goto do_sched;
5803 
5804 	if (mtx_trylock(&txr->hn_tx_lock)) {
5805 		int sched;
5806 
5807 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5808 		mtx_unlock(&txr->hn_tx_lock);
5809 		if (!sched)
5810 			return;
5811 	}
5812 do_sched:
5813 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5814 }
5815 
5816 static void
5817 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5818 {
5819 	struct hn_tx_ring *txr = xtxr;
5820 
5821 	mtx_lock(&txr->hn_tx_lock);
5822 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5823 	hn_start_locked(txr, 0);
5824 	mtx_unlock(&txr->hn_tx_lock);
5825 }
5826 
5827 static void
5828 hn_start_txeof(struct hn_tx_ring *txr)
5829 {
5830 	struct hn_softc *sc = txr->hn_sc;
5831 	if_t ifp = sc->hn_ifp;
5832 
5833 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5834 
5835 	if (txr->hn_sched_tx)
5836 		goto do_sched;
5837 
5838 	if (mtx_trylock(&txr->hn_tx_lock)) {
5839 		int sched;
5840 
5841 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5842 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5843 		mtx_unlock(&txr->hn_tx_lock);
5844 		if (sched) {
5845 			taskqueue_enqueue(txr->hn_tx_taskq,
5846 			    &txr->hn_tx_task);
5847 		}
5848 	} else {
5849 do_sched:
5850 		/*
5851 		 * Release the OACTIVE earlier, with the hope, that
5852 		 * others could catch up.  The task will clear the
5853 		 * flag again with the hn_tx_lock to avoid possible
5854 		 * races.
5855 		 */
5856 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5857 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5858 	}
5859 }
5860 
5861 #endif	/* HN_IFSTART_SUPPORT */
5862 
5863 static int
5864 hn_xmit(struct hn_tx_ring *txr, int len)
5865 {
5866 	struct hn_softc *sc = txr->hn_sc;
5867 	if_t ifp = sc->hn_ifp;
5868 	struct mbuf *m_head;
5869 	int sched = 0;
5870 
5871 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5872 #ifdef HN_IFSTART_SUPPORT
5873 	KASSERT(hn_use_if_start == 0,
5874 	    ("hn_xmit is called, when if_start is enabled"));
5875 #endif
5876 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5877 
5878 	if (__predict_false(txr->hn_suspended))
5879 		return (0);
5880 
5881 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5882 		return (0);
5883 
5884 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5885 		struct hn_txdesc *txd;
5886 		int error;
5887 
5888 		if (len > 0 && m_head->m_pkthdr.len > len) {
5889 			/*
5890 			 * This sending could be time consuming; let callers
5891 			 * dispatch this packet sending (and sending of any
5892 			 * following up packets) to tx taskqueue.
5893 			 */
5894 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5895 			sched = 1;
5896 			break;
5897 		}
5898 
5899 		txd = hn_txdesc_get(txr);
5900 		if (txd == NULL) {
5901 			txr->hn_no_txdescs++;
5902 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5903 			txr->hn_oactive = 1;
5904 			break;
5905 		}
5906 
5907 		error = hn_encap(ifp, txr, txd, &m_head);
5908 		if (error) {
5909 			/* Both txd and m_head are freed; discard */
5910 			KASSERT(txr->hn_agg_txd == NULL,
5911 			    ("encap failed w/ pending aggregating txdesc"));
5912 			drbr_advance(ifp, txr->hn_mbuf_br);
5913 			continue;
5914 		}
5915 
5916 		if (txr->hn_agg_pktleft == 0) {
5917 			if (txr->hn_agg_txd != NULL) {
5918 				KASSERT(m_head == NULL,
5919 				    ("pending mbuf for aggregating txdesc"));
5920 				error = hn_flush_txagg(ifp, txr);
5921 				if (__predict_false(error)) {
5922 					txr->hn_oactive = 1;
5923 					break;
5924 				}
5925 			} else {
5926 				KASSERT(m_head != NULL, ("mbuf was freed"));
5927 				error = hn_txpkt(ifp, txr, txd);
5928 				if (__predict_false(error)) {
5929 					/* txd is freed, but m_head is not */
5930 					drbr_putback(ifp, txr->hn_mbuf_br,
5931 					    m_head);
5932 					txr->hn_oactive = 1;
5933 					break;
5934 				}
5935 			}
5936 		}
5937 #ifdef INVARIANTS
5938 		else {
5939 			KASSERT(txr->hn_agg_txd != NULL,
5940 			    ("no aggregating txdesc"));
5941 			KASSERT(m_head == NULL,
5942 			    ("pending mbuf for aggregating txdesc"));
5943 		}
5944 #endif
5945 
5946 		/* Sent */
5947 		drbr_advance(ifp, txr->hn_mbuf_br);
5948 	}
5949 
5950 	/* Flush pending aggerated transmission. */
5951 	if (txr->hn_agg_txd != NULL)
5952 		hn_flush_txagg(ifp, txr);
5953 	return (sched);
5954 }
5955 
5956 static int
5957 hn_transmit(if_t ifp, struct mbuf *m)
5958 {
5959 	struct hn_softc *sc = if_getsoftc(ifp);
5960 	struct hn_tx_ring *txr;
5961 	int error, idx = 0;
5962 
5963 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5964 		struct rm_priotracker pt;
5965 
5966 		rm_rlock(&sc->hn_vf_lock, &pt);
5967 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5968 			struct mbuf *m_bpf = NULL;
5969 			int obytes, omcast;
5970 
5971 			obytes = m->m_pkthdr.len;
5972 			omcast = (m->m_flags & M_MCAST) != 0;
5973 
5974 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5975 				if (bpf_peers_present(if_getbpf(ifp))) {
5976 					m_bpf = m_copypacket(m, M_NOWAIT);
5977 					if (m_bpf == NULL) {
5978 						/*
5979 						 * Failed to grab a shallow
5980 						 * copy; tap now.
5981 						 */
5982 						ETHER_BPF_MTAP(ifp, m);
5983 					}
5984 				}
5985 			} else {
5986 				ETHER_BPF_MTAP(ifp, m);
5987 			}
5988 
5989 			error = if_transmit(sc->hn_vf_ifp, m);
5990 			rm_runlock(&sc->hn_vf_lock, &pt);
5991 
5992 			if (m_bpf != NULL) {
5993 				if (!error)
5994 					ETHER_BPF_MTAP(ifp, m_bpf);
5995 				m_freem(m_bpf);
5996 			}
5997 
5998 			if (error == ENOBUFS) {
5999 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6000 			} else if (error) {
6001 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6002 			} else {
6003 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6004 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6005 				if (omcast) {
6006 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6007 					    omcast);
6008 				}
6009 			}
6010 			return (error);
6011 		}
6012 		rm_runlock(&sc->hn_vf_lock, &pt);
6013 	}
6014 
6015 #if defined(INET6) || defined(INET)
6016 	/*
6017 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6018 	 * since packet headers should be cache-hot.
6019 	 */
6020 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6021 		m = hn_tso_fixup(m);
6022 		if (__predict_false(m == NULL)) {
6023 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6024 			return EIO;
6025 		}
6026 	} else if (m->m_pkthdr.csum_flags &
6027 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6028 		m = hn_set_hlen(m);
6029 		if (__predict_false(m == NULL)) {
6030 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6031 			return EIO;
6032 		}
6033 	}
6034 #endif
6035 
6036 	/*
6037 	 * Select the TX ring based on flowid
6038 	 */
6039 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6040 #ifdef RSS
6041 		uint32_t bid;
6042 
6043 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6044 		    &bid) == 0)
6045 			idx = bid % sc->hn_tx_ring_inuse;
6046 		else
6047 #endif
6048 		{
6049 #if defined(INET6) || defined(INET)
6050 			int tcpsyn = 0;
6051 
6052 			if (m->m_pkthdr.len < 128 &&
6053 			    (m->m_pkthdr.csum_flags &
6054 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6055 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6056 				m = hn_check_tcpsyn(m, &tcpsyn);
6057 				if (__predict_false(m == NULL)) {
6058 					if_inc_counter(ifp,
6059 					    IFCOUNTER_OERRORS, 1);
6060 					return (EIO);
6061 				}
6062 			}
6063 #else
6064 			const int tcpsyn = 0;
6065 #endif
6066 			if (tcpsyn)
6067 				idx = 0;
6068 			else
6069 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6070 		}
6071 	}
6072 	txr = &sc->hn_tx_ring[idx];
6073 
6074 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6075 	if (error) {
6076 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6077 		return error;
6078 	}
6079 
6080 	if (txr->hn_oactive)
6081 		return 0;
6082 
6083 	if (txr->hn_sched_tx)
6084 		goto do_sched;
6085 
6086 	if (mtx_trylock(&txr->hn_tx_lock)) {
6087 		int sched;
6088 
6089 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6090 		mtx_unlock(&txr->hn_tx_lock);
6091 		if (!sched)
6092 			return 0;
6093 	}
6094 do_sched:
6095 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6096 	return 0;
6097 }
6098 
6099 static void
6100 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6101 {
6102 	struct mbuf *m;
6103 
6104 	mtx_lock(&txr->hn_tx_lock);
6105 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6106 		m_freem(m);
6107 	mtx_unlock(&txr->hn_tx_lock);
6108 }
6109 
6110 static void
6111 hn_xmit_qflush(if_t ifp)
6112 {
6113 	struct hn_softc *sc = if_getsoftc(ifp);
6114 	struct rm_priotracker pt;
6115 	int i;
6116 
6117 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6118 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6119 	if_qflush(ifp);
6120 
6121 	rm_rlock(&sc->hn_vf_lock, &pt);
6122 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6123 		if_qflush(sc->hn_vf_ifp);
6124 	rm_runlock(&sc->hn_vf_lock, &pt);
6125 }
6126 
6127 static void
6128 hn_xmit_txeof(struct hn_tx_ring *txr)
6129 {
6130 
6131 	if (txr->hn_sched_tx)
6132 		goto do_sched;
6133 
6134 	if (mtx_trylock(&txr->hn_tx_lock)) {
6135 		int sched;
6136 
6137 		txr->hn_oactive = 0;
6138 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6139 		mtx_unlock(&txr->hn_tx_lock);
6140 		if (sched) {
6141 			taskqueue_enqueue(txr->hn_tx_taskq,
6142 			    &txr->hn_tx_task);
6143 		}
6144 	} else {
6145 do_sched:
6146 		/*
6147 		 * Release the oactive earlier, with the hope, that
6148 		 * others could catch up.  The task will clear the
6149 		 * oactive again with the hn_tx_lock to avoid possible
6150 		 * races.
6151 		 */
6152 		txr->hn_oactive = 0;
6153 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6154 	}
6155 }
6156 
6157 static void
6158 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6159 {
6160 	struct hn_tx_ring *txr = xtxr;
6161 
6162 	mtx_lock(&txr->hn_tx_lock);
6163 	hn_xmit(txr, 0);
6164 	mtx_unlock(&txr->hn_tx_lock);
6165 }
6166 
6167 static void
6168 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6169 {
6170 	struct hn_tx_ring *txr = xtxr;
6171 
6172 	mtx_lock(&txr->hn_tx_lock);
6173 	txr->hn_oactive = 0;
6174 	hn_xmit(txr, 0);
6175 	mtx_unlock(&txr->hn_tx_lock);
6176 }
6177 
6178 static int
6179 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6180 {
6181 	struct vmbus_chan_br cbr;
6182 	struct hn_rx_ring *rxr;
6183 	struct hn_tx_ring *txr = NULL;
6184 	int idx, error;
6185 
6186 	idx = vmbus_chan_subidx(chan);
6187 
6188 	/*
6189 	 * Link this channel to RX/TX ring.
6190 	 */
6191 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6192 	    ("invalid channel index %d, should > 0 && < %d",
6193 	     idx, sc->hn_rx_ring_inuse));
6194 	rxr = &sc->hn_rx_ring[idx];
6195 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6196 	    ("RX ring %d already attached", idx));
6197 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6198 	rxr->hn_chan = chan;
6199 
6200 	if (bootverbose) {
6201 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6202 		    idx, vmbus_chan_id(chan));
6203 	}
6204 
6205 	if (idx < sc->hn_tx_ring_inuse) {
6206 		txr = &sc->hn_tx_ring[idx];
6207 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6208 		    ("TX ring %d already attached", idx));
6209 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6210 
6211 		txr->hn_chan = chan;
6212 		if (bootverbose) {
6213 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6214 			    idx, vmbus_chan_id(chan));
6215 		}
6216 	}
6217 
6218 	/* Bind this channel to a proper CPU. */
6219 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6220 
6221 	/*
6222 	 * Open this channel
6223 	 */
6224 	cbr.cbr = rxr->hn_br;
6225 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6226 	cbr.cbr_txsz = HN_TXBR_SIZE;
6227 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6228 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6229 	if (error) {
6230 		if (error == EISCONN) {
6231 			if_printf(sc->hn_ifp, "bufring is connected after "
6232 			    "chan%u open failure\n", vmbus_chan_id(chan));
6233 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6234 		} else {
6235 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6236 			    vmbus_chan_id(chan), error);
6237 		}
6238 	}
6239 	return (error);
6240 }
6241 
6242 static void
6243 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6244 {
6245 	struct hn_rx_ring *rxr;
6246 	int idx, error;
6247 
6248 	idx = vmbus_chan_subidx(chan);
6249 
6250 	/*
6251 	 * Link this channel to RX/TX ring.
6252 	 */
6253 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6254 	    ("invalid channel index %d, should > 0 && < %d",
6255 	     idx, sc->hn_rx_ring_inuse));
6256 	rxr = &sc->hn_rx_ring[idx];
6257 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6258 	    ("RX ring %d is not attached", idx));
6259 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6260 
6261 	if (idx < sc->hn_tx_ring_inuse) {
6262 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6263 
6264 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6265 		    ("TX ring %d is not attached attached", idx));
6266 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6267 	}
6268 
6269 	/*
6270 	 * Close this channel.
6271 	 *
6272 	 * NOTE:
6273 	 * Channel closing does _not_ destroy the target channel.
6274 	 */
6275 	error = vmbus_chan_close_direct(chan);
6276 	if (error == EISCONN) {
6277 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6278 		    "after being closed\n", vmbus_chan_id(chan));
6279 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6280 	} else if (error) {
6281 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6282 		    vmbus_chan_id(chan), error);
6283 	}
6284 }
6285 
6286 static int
6287 hn_attach_subchans(struct hn_softc *sc)
6288 {
6289 	struct vmbus_channel **subchans;
6290 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6291 	int i, error = 0;
6292 
6293 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6294 
6295 	/* Attach the sub-channels. */
6296 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6297 	for (i = 0; i < subchan_cnt; ++i) {
6298 		int error1;
6299 
6300 		error1 = hn_chan_attach(sc, subchans[i]);
6301 		if (error1) {
6302 			error = error1;
6303 			/* Move on; all channels will be detached later. */
6304 		}
6305 	}
6306 	vmbus_subchan_rel(subchans, subchan_cnt);
6307 
6308 	if (error) {
6309 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6310 	} else {
6311 		if (bootverbose) {
6312 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6313 			    subchan_cnt);
6314 		}
6315 	}
6316 	return (error);
6317 }
6318 
6319 static void
6320 hn_detach_allchans(struct hn_softc *sc)
6321 {
6322 	struct vmbus_channel **subchans;
6323 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6324 	int i;
6325 
6326 	if (subchan_cnt == 0)
6327 		goto back;
6328 
6329 	/* Detach the sub-channels. */
6330 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6331 	for (i = 0; i < subchan_cnt; ++i)
6332 		hn_chan_detach(sc, subchans[i]);
6333 	vmbus_subchan_rel(subchans, subchan_cnt);
6334 
6335 back:
6336 	/*
6337 	 * Detach the primary channel, _after_ all sub-channels
6338 	 * are detached.
6339 	 */
6340 	hn_chan_detach(sc, sc->hn_prichan);
6341 
6342 	/* Wait for sub-channels to be destroyed, if any. */
6343 	vmbus_subchan_drain(sc->hn_prichan);
6344 
6345 #ifdef INVARIANTS
6346 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6347 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6348 		    HN_RX_FLAG_ATTACHED) == 0,
6349 		    ("%dth RX ring is still attached", i));
6350 	}
6351 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6352 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6353 		    HN_TX_FLAG_ATTACHED) == 0,
6354 		    ("%dth TX ring is still attached", i));
6355 	}
6356 #endif
6357 }
6358 
6359 static int
6360 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6361 {
6362 	struct vmbus_channel **subchans;
6363 	int nchan, rxr_cnt, error;
6364 
6365 	nchan = *nsubch + 1;
6366 	if (nchan == 1) {
6367 		/*
6368 		 * Multiple RX/TX rings are not requested.
6369 		 */
6370 		*nsubch = 0;
6371 		return (0);
6372 	}
6373 
6374 	/*
6375 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6376 	 * table entries.
6377 	 */
6378 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6379 	if (error) {
6380 		/* No RSS; this is benign. */
6381 		*nsubch = 0;
6382 		return (0);
6383 	}
6384 	if (bootverbose) {
6385 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6386 		    rxr_cnt, nchan);
6387 	}
6388 
6389 	if (nchan > rxr_cnt)
6390 		nchan = rxr_cnt;
6391 	if (nchan == 1) {
6392 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6393 		*nsubch = 0;
6394 		return (0);
6395 	}
6396 
6397 	/*
6398 	 * Allocate sub-channels from NVS.
6399 	 */
6400 	*nsubch = nchan - 1;
6401 	error = hn_nvs_alloc_subchans(sc, nsubch);
6402 	if (error || *nsubch == 0) {
6403 		/* Failed to allocate sub-channels. */
6404 		*nsubch = 0;
6405 		return (0);
6406 	}
6407 
6408 	/*
6409 	 * Wait for all sub-channels to become ready before moving on.
6410 	 */
6411 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6412 	vmbus_subchan_rel(subchans, *nsubch);
6413 	return (0);
6414 }
6415 
6416 static bool
6417 hn_synth_attachable(const struct hn_softc *sc)
6418 {
6419 	int i;
6420 
6421 	if (sc->hn_flags & HN_FLAG_ERRORS)
6422 		return (false);
6423 
6424 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6425 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6426 
6427 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6428 			return (false);
6429 	}
6430 	return (true);
6431 }
6432 
6433 /*
6434  * Make sure that the RX filter is zero after the successful
6435  * RNDIS initialization.
6436  *
6437  * NOTE:
6438  * Under certain conditions on certain versions of Hyper-V,
6439  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6440  * after the successful RNDIS initialization, which breaks
6441  * the assumption of any following code (well, it breaks the
6442  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6443  * explicitly, drain packets sneaking through, and drain the
6444  * interrupt taskqueues scheduled due to the stealth packets.
6445  */
6446 static void
6447 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6448 {
6449 
6450 	hn_disable_rx(sc);
6451 	hn_drain_rxtx(sc, nchan);
6452 }
6453 
6454 static int
6455 hn_synth_attach(struct hn_softc *sc, int mtu)
6456 {
6457 #define ATTACHED_NVS		0x0002
6458 #define ATTACHED_RNDIS		0x0004
6459 
6460 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6461 	int error, nsubch, nchan = 1, i, rndis_inited;
6462 	uint32_t old_caps, attached = 0;
6463 
6464 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6465 	    ("synthetic parts were attached"));
6466 
6467 	if (!hn_synth_attachable(sc))
6468 		return (ENXIO);
6469 
6470 	/* Save capabilities for later verification. */
6471 	old_caps = sc->hn_caps;
6472 	sc->hn_caps = 0;
6473 
6474 	/* Clear RSS stuffs. */
6475 	sc->hn_rss_ind_size = 0;
6476 	sc->hn_rss_hash = 0;
6477 	sc->hn_rss_hcap = 0;
6478 
6479 	/*
6480 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6481 	 */
6482 	error = hn_chan_attach(sc, sc->hn_prichan);
6483 	if (error)
6484 		goto failed;
6485 
6486 	/*
6487 	 * Attach NVS.
6488 	 */
6489 	error = hn_nvs_attach(sc, mtu);
6490 	if (error)
6491 		goto failed;
6492 	attached |= ATTACHED_NVS;
6493 
6494 	/*
6495 	 * Attach RNDIS _after_ NVS is attached.
6496 	 */
6497 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6498 	if (rndis_inited)
6499 		attached |= ATTACHED_RNDIS;
6500 	if (error)
6501 		goto failed;
6502 
6503 	/*
6504 	 * Make sure capabilities are not changed.
6505 	 */
6506 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6507 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6508 		    old_caps, sc->hn_caps);
6509 		error = ENXIO;
6510 		goto failed;
6511 	}
6512 
6513 	/*
6514 	 * Allocate sub-channels for multi-TX/RX rings.
6515 	 *
6516 	 * NOTE:
6517 	 * The # of RX rings that can be used is equivalent to the # of
6518 	 * channels to be requested.
6519 	 */
6520 	nsubch = sc->hn_rx_ring_cnt - 1;
6521 	error = hn_synth_alloc_subchans(sc, &nsubch);
6522 	if (error)
6523 		goto failed;
6524 	/* NOTE: _Full_ synthetic parts detach is required now. */
6525 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6526 
6527 	/*
6528 	 * Set the # of TX/RX rings that could be used according to
6529 	 * the # of channels that NVS offered.
6530 	 */
6531 	nchan = nsubch + 1;
6532 	hn_set_ring_inuse(sc, nchan);
6533 	if (nchan == 1) {
6534 		/* Only the primary channel can be used; done */
6535 		goto back;
6536 	}
6537 
6538 	/*
6539 	 * Attach the sub-channels.
6540 	 *
6541 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6542 	 */
6543 	error = hn_attach_subchans(sc);
6544 	if (error)
6545 		goto failed;
6546 
6547 	/*
6548 	 * Configure RSS key and indirect table _after_ all sub-channels
6549 	 * are attached.
6550 	 */
6551 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6552 		/*
6553 		 * RSS key is not set yet; set it to the default RSS key.
6554 		 */
6555 		if (bootverbose)
6556 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6557 #ifdef RSS
6558 		rss_getkey(rss->rss_key);
6559 #else
6560 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6561 #endif
6562 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6563 	}
6564 
6565 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6566 		/*
6567 		 * RSS indirect table is not set yet; set it up in round-
6568 		 * robin fashion.
6569 		 */
6570 		if (bootverbose) {
6571 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6572 			    "table\n");
6573 		}
6574 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6575 			uint32_t subidx;
6576 
6577 #ifdef RSS
6578 			subidx = rss_get_indirection_to_bucket(i);
6579 #else
6580 			subidx = i;
6581 #endif
6582 			rss->rss_ind[i] = subidx % nchan;
6583 		}
6584 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6585 	} else {
6586 		/*
6587 		 * # of usable channels may be changed, so we have to
6588 		 * make sure that all entries in RSS indirect table
6589 		 * are valid.
6590 		 *
6591 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6592 		 */
6593 		hn_rss_ind_fixup(sc);
6594 	}
6595 
6596 	sc->hn_rss_hash = sc->hn_rss_hcap;
6597 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6598 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6599 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6600 		hn_vf_rss_fixup(sc, false);
6601 	}
6602 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6603 	if (error)
6604 		goto failed;
6605 back:
6606 	/*
6607 	 * Fixup transmission aggregation setup.
6608 	 */
6609 	hn_set_txagg(sc);
6610 	hn_rndis_init_fixat(sc, nchan);
6611 	return (0);
6612 
6613 failed:
6614 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6615 		hn_rndis_init_fixat(sc, nchan);
6616 		hn_synth_detach(sc);
6617 	} else {
6618 		if (attached & ATTACHED_RNDIS) {
6619 			hn_rndis_init_fixat(sc, nchan);
6620 			hn_rndis_detach(sc);
6621 		}
6622 		if (attached & ATTACHED_NVS)
6623 			hn_nvs_detach(sc);
6624 		hn_chan_detach(sc, sc->hn_prichan);
6625 		/* Restore old capabilities. */
6626 		sc->hn_caps = old_caps;
6627 	}
6628 	return (error);
6629 
6630 #undef ATTACHED_RNDIS
6631 #undef ATTACHED_NVS
6632 }
6633 
6634 /*
6635  * NOTE:
6636  * The interface must have been suspended though hn_suspend(), before
6637  * this function get called.
6638  */
6639 static void
6640 hn_synth_detach(struct hn_softc *sc)
6641 {
6642 
6643 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6644 	    ("synthetic parts were not attached"));
6645 
6646 	/* Detach the RNDIS first. */
6647 	hn_rndis_detach(sc);
6648 
6649 	/* Detach NVS. */
6650 	hn_nvs_detach(sc);
6651 
6652 	/* Detach all of the channels. */
6653 	hn_detach_allchans(sc);
6654 
6655 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6656 		/*
6657 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6658 		 */
6659 		int error;
6660 
6661 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6662 		    sc->hn_rxbuf_gpadl);
6663 		if (error) {
6664 			if_printf(sc->hn_ifp,
6665 			    "rxbuf gpadl disconn failed: %d\n", error);
6666 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6667 		}
6668 		sc->hn_rxbuf_gpadl = 0;
6669 	}
6670 
6671 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6672 		/*
6673 		 * Host is post-Win2016, disconnect chimney sending buffer from
6674 		 * primary channel here.
6675 		 */
6676 		int error;
6677 
6678 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6679 		    sc->hn_chim_gpadl);
6680 		if (error) {
6681 			if_printf(sc->hn_ifp,
6682 			    "chim gpadl disconn failed: %d\n", error);
6683 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6684 		}
6685 		sc->hn_chim_gpadl = 0;
6686 	}
6687 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6688 }
6689 
6690 static void
6691 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6692 {
6693 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6694 	    ("invalid ring count %d", ring_cnt));
6695 
6696 	if (sc->hn_tx_ring_cnt > ring_cnt)
6697 		sc->hn_tx_ring_inuse = ring_cnt;
6698 	else
6699 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6700 	sc->hn_rx_ring_inuse = ring_cnt;
6701 
6702 #ifdef RSS
6703 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6704 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6705 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6706 		    rss_getnumbuckets());
6707 	}
6708 #endif
6709 
6710 	if (bootverbose) {
6711 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6712 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6713 	}
6714 }
6715 
6716 static void
6717 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6718 {
6719 
6720 	/*
6721 	 * NOTE:
6722 	 * The TX bufring will not be drained by the hypervisor,
6723 	 * if the primary channel is revoked.
6724 	 */
6725 	while (!vmbus_chan_rx_empty(chan) ||
6726 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6727 	     !vmbus_chan_tx_empty(chan)))
6728 		pause("waitch", 1);
6729 	vmbus_chan_intr_drain(chan);
6730 }
6731 
6732 static void
6733 hn_disable_rx(struct hn_softc *sc)
6734 {
6735 
6736 	/*
6737 	 * Disable RX by clearing RX filter forcefully.
6738 	 */
6739 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6740 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6741 
6742 	/*
6743 	 * Give RNDIS enough time to flush all pending data packets.
6744 	 */
6745 	pause("waitrx", (200 * hz) / 1000);
6746 }
6747 
6748 /*
6749  * NOTE:
6750  * RX/TX _must_ have been suspended/disabled, before this function
6751  * is called.
6752  */
6753 static void
6754 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6755 {
6756 	struct vmbus_channel **subch = NULL;
6757 	int nsubch;
6758 
6759 	/*
6760 	 * Drain RX/TX bufrings and interrupts.
6761 	 */
6762 	nsubch = nchan - 1;
6763 	if (nsubch > 0)
6764 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6765 
6766 	if (subch != NULL) {
6767 		int i;
6768 
6769 		for (i = 0; i < nsubch; ++i)
6770 			hn_chan_drain(sc, subch[i]);
6771 	}
6772 	hn_chan_drain(sc, sc->hn_prichan);
6773 
6774 	if (subch != NULL)
6775 		vmbus_subchan_rel(subch, nsubch);
6776 }
6777 
6778 static void
6779 hn_suspend_data(struct hn_softc *sc)
6780 {
6781 	struct hn_tx_ring *txr;
6782 	int i;
6783 
6784 	HN_LOCK_ASSERT(sc);
6785 
6786 	/*
6787 	 * Suspend TX.
6788 	 */
6789 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6790 		txr = &sc->hn_tx_ring[i];
6791 
6792 		mtx_lock(&txr->hn_tx_lock);
6793 		txr->hn_suspended = 1;
6794 		mtx_unlock(&txr->hn_tx_lock);
6795 		/* No one is able send more packets now. */
6796 
6797 		/*
6798 		 * Wait for all pending sends to finish.
6799 		 *
6800 		 * NOTE:
6801 		 * We will _not_ receive all pending send-done, if the
6802 		 * primary channel is revoked.
6803 		 */
6804 		while (hn_tx_ring_pending(txr) &&
6805 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6806 			pause("hnwtx", 1 /* 1 tick */);
6807 	}
6808 
6809 	/*
6810 	 * Disable RX.
6811 	 */
6812 	hn_disable_rx(sc);
6813 
6814 	/*
6815 	 * Drain RX/TX.
6816 	 */
6817 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6818 
6819 	/*
6820 	 * Drain any pending TX tasks.
6821 	 *
6822 	 * NOTE:
6823 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6824 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6825 	 */
6826 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6827 		txr = &sc->hn_tx_ring[i];
6828 
6829 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6830 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6831 	}
6832 }
6833 
6834 static void
6835 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6836 {
6837 
6838 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6839 }
6840 
6841 static void
6842 hn_suspend_mgmt(struct hn_softc *sc)
6843 {
6844 	struct task task;
6845 
6846 	HN_LOCK_ASSERT(sc);
6847 
6848 	/*
6849 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6850 	 * through hn_mgmt_taskq.
6851 	 */
6852 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6853 	vmbus_chan_run_task(sc->hn_prichan, &task);
6854 
6855 	/*
6856 	 * Make sure that all pending management tasks are completed.
6857 	 */
6858 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6859 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6860 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6861 }
6862 
6863 static void
6864 hn_suspend(struct hn_softc *sc)
6865 {
6866 
6867 	/* Disable polling. */
6868 	hn_polling(sc, 0);
6869 
6870 	/*
6871 	 * If the non-transparent mode VF is activated, the synthetic
6872 	 * device is receiving packets, so the data path of the
6873 	 * synthetic device must be suspended.
6874 	 */
6875 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6876 	    (sc->hn_flags & HN_FLAG_RXVF))
6877 		hn_suspend_data(sc);
6878 	hn_suspend_mgmt(sc);
6879 }
6880 
6881 static void
6882 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6883 {
6884 	int i;
6885 
6886 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6887 	    ("invalid TX ring count %d", tx_ring_cnt));
6888 
6889 	for (i = 0; i < tx_ring_cnt; ++i) {
6890 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6891 
6892 		mtx_lock(&txr->hn_tx_lock);
6893 		txr->hn_suspended = 0;
6894 		mtx_unlock(&txr->hn_tx_lock);
6895 	}
6896 }
6897 
6898 static void
6899 hn_resume_data(struct hn_softc *sc)
6900 {
6901 	int i;
6902 
6903 	HN_LOCK_ASSERT(sc);
6904 
6905 	/*
6906 	 * Re-enable RX.
6907 	 */
6908 	hn_rxfilter_config(sc);
6909 
6910 	/*
6911 	 * Make sure to clear suspend status on "all" TX rings,
6912 	 * since hn_tx_ring_inuse can be changed after
6913 	 * hn_suspend_data().
6914 	 */
6915 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6916 
6917 #ifdef HN_IFSTART_SUPPORT
6918 	if (!hn_use_if_start)
6919 #endif
6920 	{
6921 		/*
6922 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6923 		 * reduced.
6924 		 */
6925 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6926 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6927 	}
6928 
6929 	/*
6930 	 * Kick start TX.
6931 	 */
6932 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6933 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6934 
6935 		/*
6936 		 * Use txeof task, so that any pending oactive can be
6937 		 * cleared properly.
6938 		 */
6939 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6940 	}
6941 }
6942 
6943 static void
6944 hn_resume_mgmt(struct hn_softc *sc)
6945 {
6946 
6947 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6948 
6949 	/*
6950 	 * Kick off network change detection, if it was pending.
6951 	 * If no network change was pending, start link status
6952 	 * checks, which is more lightweight than network change
6953 	 * detection.
6954 	 */
6955 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6956 		hn_change_network(sc);
6957 	else
6958 		hn_update_link_status(sc);
6959 }
6960 
6961 static void
6962 hn_resume(struct hn_softc *sc)
6963 {
6964 
6965 	/*
6966 	 * If the non-transparent mode VF is activated, the synthetic
6967 	 * device have to receive packets, so the data path of the
6968 	 * synthetic device must be resumed.
6969 	 */
6970 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6971 	    (sc->hn_flags & HN_FLAG_RXVF))
6972 		hn_resume_data(sc);
6973 
6974 	/*
6975 	 * Don't resume link status change if VF is attached/activated.
6976 	 * - In the non-transparent VF mode, the synthetic device marks
6977 	 *   link down until the VF is deactivated; i.e. VF is down.
6978 	 * - In transparent VF mode, VF's media status is used until
6979 	 *   the VF is detached.
6980 	 */
6981 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6982 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6983 		hn_resume_mgmt(sc);
6984 
6985 	/*
6986 	 * Re-enable polling if this interface is running and
6987 	 * the polling is requested.
6988 	 */
6989 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6990 		hn_polling(sc, sc->hn_pollhz);
6991 }
6992 
6993 static void
6994 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6995 {
6996 	const struct rndis_status_msg *msg;
6997 	int ofs;
6998 
6999 	if (dlen < sizeof(*msg)) {
7000 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7001 		return;
7002 	}
7003 	msg = data;
7004 
7005 	switch (msg->rm_status) {
7006 	case RNDIS_STATUS_MEDIA_CONNECT:
7007 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7008 		hn_update_link_status(sc);
7009 		break;
7010 
7011 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7012 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7013 		/* Not really useful; ignore. */
7014 		break;
7015 
7016 	case RNDIS_STATUS_NETWORK_CHANGE:
7017 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7018 		if (dlen < ofs + msg->rm_stbuflen ||
7019 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7020 			if_printf(sc->hn_ifp, "network changed\n");
7021 		} else {
7022 			uint32_t change;
7023 
7024 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7025 			    sizeof(change));
7026 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7027 			    change);
7028 		}
7029 		hn_change_network(sc);
7030 		break;
7031 
7032 	default:
7033 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7034 		    msg->rm_status);
7035 		break;
7036 	}
7037 }
7038 
7039 static int
7040 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7041 {
7042 	const struct rndis_pktinfo *pi = info_data;
7043 	uint32_t mask = 0;
7044 
7045 	while (info_dlen != 0) {
7046 		const void *data;
7047 		uint32_t dlen;
7048 
7049 		if (__predict_false(info_dlen < sizeof(*pi)))
7050 			return (EINVAL);
7051 		if (__predict_false(info_dlen < pi->rm_size))
7052 			return (EINVAL);
7053 		info_dlen -= pi->rm_size;
7054 
7055 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7056 			return (EINVAL);
7057 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7058 			return (EINVAL);
7059 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7060 		data = pi->rm_data;
7061 
7062 		if (pi->rm_internal == 1) {
7063 			switch (pi->rm_type) {
7064 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7065 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7066 					return (EINVAL);
7067 				info->pktinfo_id =
7068 				    (const struct packet_info_id *)data;
7069 				mask |= HN_RXINFO_PKTINFO_ID;
7070 				break;
7071 
7072 			default:
7073 				goto next;
7074 			}
7075 		} else {
7076 			switch (pi->rm_type) {
7077 			case NDIS_PKTINFO_TYPE_VLAN:
7078 				if (__predict_false(dlen
7079 				    < NDIS_VLAN_INFO_SIZE))
7080 					return (EINVAL);
7081 				info->vlan_info = (const uint32_t *)data;
7082 				mask |= HN_RXINFO_VLAN;
7083 				break;
7084 
7085 			case NDIS_PKTINFO_TYPE_CSUM:
7086 				if (__predict_false(dlen
7087 				    < NDIS_RXCSUM_INFO_SIZE))
7088 					return (EINVAL);
7089 				info->csum_info = (const uint32_t *)data;
7090 				mask |= HN_RXINFO_CSUM;
7091 				break;
7092 
7093 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7094 				if (__predict_false(dlen
7095 				    < HN_NDIS_HASH_VALUE_SIZE))
7096 					return (EINVAL);
7097 				info->hash_value = (const uint32_t *)data;
7098 				mask |= HN_RXINFO_HASHVAL;
7099 				break;
7100 
7101 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7102 				if (__predict_false(dlen
7103 				    < HN_NDIS_HASH_INFO_SIZE))
7104 					return (EINVAL);
7105 				info->hash_info = (const uint32_t *)data;
7106 				mask |= HN_RXINFO_HASHINF;
7107 				break;
7108 
7109 			default:
7110 				goto next;
7111 			}
7112 		}
7113 
7114 		if (mask == HN_RXINFO_ALL) {
7115 			/* All found; done */
7116 			break;
7117 		}
7118 next:
7119 		pi = (const struct rndis_pktinfo *)
7120 		    ((const uint8_t *)pi + pi->rm_size);
7121 	}
7122 
7123 	/*
7124 	 * Final fixup.
7125 	 * - If there is no hash value, invalidate the hash info.
7126 	 */
7127 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7128 		info->hash_info = NULL;
7129 	return (0);
7130 }
7131 
7132 static __inline bool
7133 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7134 {
7135 
7136 	if (off < check_off) {
7137 		if (__predict_true(off + len <= check_off))
7138 			return (false);
7139 	} else if (off > check_off) {
7140 		if (__predict_true(check_off + check_len <= off))
7141 			return (false);
7142 	}
7143 	return (true);
7144 }
7145 
7146 static __inline void
7147 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7148 		uint32_t len, struct hn_rxinfo *info)
7149 {
7150 	uint32_t cnt = rxr->rsc.cnt;
7151 
7152 	if (cnt) {
7153 		rxr->rsc.pktlen += len;
7154 	} else {
7155 		rxr->rsc.vlan_info = info->vlan_info;
7156 		rxr->rsc.csum_info = info->csum_info;
7157 		rxr->rsc.hash_info = info->hash_info;
7158 		rxr->rsc.hash_value = info->hash_value;
7159 		rxr->rsc.pktlen = len;
7160 	}
7161 
7162 	rxr->rsc.frag_data[cnt] = data;
7163 	rxr->rsc.frag_len[cnt] = len;
7164 	rxr->rsc.cnt++;
7165 }
7166 
7167 static void
7168 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7169 {
7170 	const struct rndis_packet_msg *pkt;
7171 	struct hn_rxinfo info;
7172 	int data_off, pktinfo_off, data_len, pktinfo_len;
7173 	bool rsc_more= false;
7174 
7175 	/*
7176 	 * Check length.
7177 	 */
7178 	if (__predict_false(dlen < sizeof(*pkt))) {
7179 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7180 		return;
7181 	}
7182 	pkt = data;
7183 
7184 	if (__predict_false(dlen < pkt->rm_len)) {
7185 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7186 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7187 		return;
7188 	}
7189 	if (__predict_false(pkt->rm_len <
7190 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7191 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7192 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7193 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7194 		    pkt->rm_pktinfolen);
7195 		return;
7196 	}
7197 	if (__predict_false(pkt->rm_datalen == 0)) {
7198 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7199 		return;
7200 	}
7201 
7202 	/*
7203 	 * Check offests.
7204 	 */
7205 #define IS_OFFSET_INVALID(ofs)			\
7206 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7207 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7208 
7209 	/* XXX Hyper-V does not meet data offset alignment requirement */
7210 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7211 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7212 		    "data offset %u\n", pkt->rm_dataoffset);
7213 		return;
7214 	}
7215 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7216 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7217 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7218 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7219 		return;
7220 	}
7221 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7222 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7223 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7224 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7225 		return;
7226 	}
7227 
7228 #undef IS_OFFSET_INVALID
7229 
7230 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7231 	data_len = pkt->rm_datalen;
7232 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7233 	pktinfo_len = pkt->rm_pktinfolen;
7234 
7235 	/*
7236 	 * Check OOB coverage.
7237 	 */
7238 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7239 		int oob_off, oob_len;
7240 
7241 		if_printf(rxr->hn_ifp, "got oobdata\n");
7242 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7243 		oob_len = pkt->rm_oobdatalen;
7244 
7245 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7246 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7247 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7248 			    pkt->rm_len, oob_off, oob_len);
7249 			return;
7250 		}
7251 
7252 		/*
7253 		 * Check against data.
7254 		 */
7255 		if (hn_rndis_check_overlap(oob_off, oob_len,
7256 		    data_off, data_len)) {
7257 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7258 			    "oob overlaps data, oob abs %d len %d, "
7259 			    "data abs %d len %d\n",
7260 			    oob_off, oob_len, data_off, data_len);
7261 			return;
7262 		}
7263 
7264 		/*
7265 		 * Check against pktinfo.
7266 		 */
7267 		if (pktinfo_len != 0 &&
7268 		    hn_rndis_check_overlap(oob_off, oob_len,
7269 		    pktinfo_off, pktinfo_len)) {
7270 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7271 			    "oob overlaps pktinfo, oob abs %d len %d, "
7272 			    "pktinfo abs %d len %d\n",
7273 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7274 			return;
7275 		}
7276 	}
7277 
7278 	/*
7279 	 * Check per-packet-info coverage and find useful per-packet-info.
7280 	 */
7281 	info.vlan_info = NULL;
7282 	info.csum_info = NULL;
7283 	info.hash_info = NULL;
7284 	info.pktinfo_id = NULL;
7285 
7286 	if (__predict_true(pktinfo_len != 0)) {
7287 		bool overlap;
7288 		int error;
7289 
7290 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7291 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7292 			    "pktinfo overflow, msglen %u, "
7293 			    "pktinfo abs %d len %d\n",
7294 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7295 			return;
7296 		}
7297 
7298 		/*
7299 		 * Check packet info coverage.
7300 		 */
7301 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7302 		    data_off, data_len);
7303 		if (__predict_false(overlap)) {
7304 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7305 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7306 			    "data abs %d len %d\n",
7307 			    pktinfo_off, pktinfo_len, data_off, data_len);
7308 			return;
7309 		}
7310 
7311 		/*
7312 		 * Find useful per-packet-info.
7313 		 */
7314 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7315 		    pktinfo_len, &info);
7316 		if (__predict_false(error)) {
7317 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7318 			    "pktinfo\n");
7319 			return;
7320 		}
7321 	}
7322 
7323 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7324 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7325 		    "data overflow, msglen %u, data abs %d len %d\n",
7326 		    pkt->rm_len, data_off, data_len);
7327 		return;
7328 	}
7329 
7330 	/* Identify RSC fragments, drop invalid packets */
7331 	if ((info.pktinfo_id != NULL) &&
7332 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7333 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7334 			rxr->rsc.cnt = 0;
7335 			rxr->hn_rsc_pkts++;
7336 		} else if (rxr->rsc.cnt == 0)
7337 			goto drop;
7338 
7339 		rsc_more = true;
7340 
7341 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7342 			rsc_more = false;
7343 
7344 		if (rsc_more && rxr->rsc.is_last)
7345 			goto drop;
7346 	} else {
7347 		rxr->rsc.cnt = 0;
7348 	}
7349 
7350 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7351 		goto drop;
7352 
7353 	/* Store data in per rx ring structure */
7354 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7355 	    data_len, &info);
7356 
7357 	if (rsc_more)
7358 		return;
7359 
7360 	hn_rxpkt(rxr);
7361 	rxr->rsc.cnt = 0;
7362 	return;
7363 drop:
7364 	rxr->hn_rsc_drop++;
7365 	return;
7366 }
7367 
7368 static __inline void
7369 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7370 {
7371 	const struct rndis_msghdr *hdr;
7372 
7373 	if (__predict_false(dlen < sizeof(*hdr))) {
7374 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7375 		return;
7376 	}
7377 	hdr = data;
7378 
7379 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7380 		/* Hot data path. */
7381 		hn_rndis_rx_data(rxr, data, dlen);
7382 		/* Done! */
7383 		return;
7384 	}
7385 
7386 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7387 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7388 	else
7389 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7390 }
7391 
7392 static void
7393 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7394 {
7395 	const struct hn_nvs_hdr *hdr;
7396 
7397 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7398 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7399 		return;
7400 	}
7401 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7402 
7403 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7404 		/* Useless; ignore */
7405 		return;
7406 	}
7407 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7408 }
7409 
7410 static void
7411 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7412     const struct vmbus_chanpkt_hdr *pkt)
7413 {
7414 	struct hn_nvs_sendctx *sndc;
7415 
7416 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7417 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7418 	    VMBUS_CHANPKT_DATALEN(pkt));
7419 	/*
7420 	 * NOTE:
7421 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7422 	 * its callback.
7423 	 */
7424 }
7425 
7426 static void
7427 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7428     const struct vmbus_chanpkt_hdr *pkthdr)
7429 {
7430 	struct epoch_tracker et;
7431 	const struct vmbus_chanpkt_rxbuf *pkt;
7432 	const struct hn_nvs_hdr *nvs_hdr;
7433 	int count, i, hlen;
7434 
7435 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7436 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7437 		return;
7438 	}
7439 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7440 
7441 	/* Make sure that this is a RNDIS message. */
7442 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7443 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7444 		    nvs_hdr->nvs_type);
7445 		return;
7446 	}
7447 
7448 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7449 	if (__predict_false(hlen < sizeof(*pkt))) {
7450 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7451 		return;
7452 	}
7453 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7454 
7455 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7456 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7457 		    pkt->cp_rxbuf_id);
7458 		return;
7459 	}
7460 
7461 	count = pkt->cp_rxbuf_cnt;
7462 	if (__predict_false(hlen <
7463 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7464 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7465 		return;
7466 	}
7467 
7468 	NET_EPOCH_ENTER(et);
7469 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7470 	for (i = 0; i < count; ++i) {
7471 		int ofs, len;
7472 
7473 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7474 		len = pkt->cp_rxbuf[i].rb_len;
7475 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7476 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7477 			    "ofs %d, len %d\n", i, ofs, len);
7478 			continue;
7479 		}
7480 
7481 		rxr->rsc.is_last = (i == (count - 1));
7482 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7483 	}
7484 	NET_EPOCH_EXIT(et);
7485 
7486 	/*
7487 	 * Ack the consumed RXBUF associated w/ this channel packet,
7488 	 * so that this RXBUF can be recycled by the hypervisor.
7489 	 */
7490 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7491 }
7492 
7493 static void
7494 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7495     uint64_t tid)
7496 {
7497 	struct hn_nvs_rndis_ack ack;
7498 	int retries, error;
7499 
7500 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7501 	ack.nvs_status = HN_NVS_STATUS_OK;
7502 
7503 	retries = 0;
7504 again:
7505 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7506 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7507 	if (__predict_false(error == EAGAIN)) {
7508 		/*
7509 		 * NOTE:
7510 		 * This should _not_ happen in real world, since the
7511 		 * consumption of the TX bufring from the TX path is
7512 		 * controlled.
7513 		 */
7514 		if (rxr->hn_ack_failed == 0)
7515 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7516 		rxr->hn_ack_failed++;
7517 		retries++;
7518 		if (retries < 10) {
7519 			DELAY(100);
7520 			goto again;
7521 		}
7522 		/* RXBUF leaks! */
7523 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7524 	}
7525 }
7526 
7527 static void
7528 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7529 {
7530 	struct hn_rx_ring *rxr = xrxr;
7531 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7532 
7533 	for (;;) {
7534 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7535 		int error, pktlen;
7536 
7537 		pktlen = rxr->hn_pktbuf_len;
7538 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7539 		if (__predict_false(error == ENOBUFS)) {
7540 			void *nbuf;
7541 			int nlen;
7542 
7543 			/*
7544 			 * Expand channel packet buffer.
7545 			 *
7546 			 * XXX
7547 			 * Use M_WAITOK here, since allocation failure
7548 			 * is fatal.
7549 			 */
7550 			nlen = rxr->hn_pktbuf_len * 2;
7551 			while (nlen < pktlen)
7552 				nlen *= 2;
7553 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7554 
7555 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7556 			    rxr->hn_pktbuf_len, nlen);
7557 
7558 			free(rxr->hn_pktbuf, M_DEVBUF);
7559 			rxr->hn_pktbuf = nbuf;
7560 			rxr->hn_pktbuf_len = nlen;
7561 			/* Retry! */
7562 			continue;
7563 		} else if (__predict_false(error == EAGAIN)) {
7564 			/* No more channel packets; done! */
7565 			break;
7566 		}
7567 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7568 
7569 		switch (pkt->cph_type) {
7570 		case VMBUS_CHANPKT_TYPE_COMP:
7571 			hn_nvs_handle_comp(sc, chan, pkt);
7572 			break;
7573 
7574 		case VMBUS_CHANPKT_TYPE_RXBUF:
7575 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7576 			break;
7577 
7578 		case VMBUS_CHANPKT_TYPE_INBAND:
7579 			hn_nvs_handle_notify(sc, pkt);
7580 			break;
7581 
7582 		default:
7583 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7584 			    pkt->cph_type);
7585 			break;
7586 		}
7587 	}
7588 	hn_chan_rollup(rxr, rxr->hn_txr);
7589 }
7590 
7591 static void
7592 hn_sysinit(void *arg __unused)
7593 {
7594 	int i;
7595 
7596 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7597 
7598 #ifdef HN_IFSTART_SUPPORT
7599 	/*
7600 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7601 	 * mainly due to the IFF_DRV_OACTIVE flag.
7602 	 */
7603 	if (hn_xpnt_vf && hn_use_if_start) {
7604 		hn_use_if_start = 0;
7605 		printf("hn: tranparent VF mode, if_transmit will be used, "
7606 		    "instead of if_start\n");
7607 	}
7608 #endif
7609 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7610 		printf("hn: invalid transparent VF attach routing "
7611 		    "wait timeout %d, reset to %d\n",
7612 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7613 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7614 	}
7615 
7616 	/*
7617 	 * Initialize VF map.
7618 	 */
7619 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7620 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7621 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7622 	    M_WAITOK | M_ZERO);
7623 
7624 	/*
7625 	 * Fix the # of TX taskqueues.
7626 	 */
7627 	if (hn_tx_taskq_cnt <= 0)
7628 		hn_tx_taskq_cnt = 1;
7629 	else if (hn_tx_taskq_cnt > mp_ncpus)
7630 		hn_tx_taskq_cnt = mp_ncpus;
7631 
7632 	/*
7633 	 * Fix the TX taskqueue mode.
7634 	 */
7635 	switch (hn_tx_taskq_mode) {
7636 	case HN_TX_TASKQ_M_INDEP:
7637 	case HN_TX_TASKQ_M_GLOBAL:
7638 	case HN_TX_TASKQ_M_EVTTQ:
7639 		break;
7640 	default:
7641 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7642 		break;
7643 	}
7644 
7645 	if (vm_guest != VM_GUEST_HV)
7646 		return;
7647 
7648 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7649 		return;
7650 
7651 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7652 	    M_DEVBUF, M_WAITOK);
7653 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7654 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7655 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7656 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7657 		    "hn tx%d", i);
7658 	}
7659 }
7660 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7661 
7662 static void
7663 hn_sysuninit(void *arg __unused)
7664 {
7665 
7666 	if (hn_tx_taskque != NULL) {
7667 		int i;
7668 
7669 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7670 			taskqueue_free(hn_tx_taskque[i]);
7671 		free(hn_tx_taskque, M_DEVBUF);
7672 	}
7673 
7674 	if (hn_vfmap != NULL)
7675 		free(hn_vfmap, M_DEVBUF);
7676 	rm_destroy(&hn_vfmap_lock);
7677 
7678 	counter_u64_free(hn_udpcs_fixup);
7679 }
7680 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7681