1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
389 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
390 static int			hn_rxfilter_config(struct hn_softc *);
391 static int			hn_rss_reconfig(struct hn_softc *);
392 static void			hn_rss_ind_fixup(struct hn_softc *);
393 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
394 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
395 				    int, const struct hn_rxinfo *);
396 static uint32_t			hn_rss_type_fromndis(uint32_t);
397 static uint32_t			hn_rss_type_tondis(uint32_t);
398 
399 static int			hn_tx_ring_create(struct hn_softc *, int);
400 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
401 static int			hn_create_tx_data(struct hn_softc *, int);
402 static void			hn_fixup_tx_data(struct hn_softc *);
403 static void			hn_fixup_rx_data(struct hn_softc *);
404 static void			hn_destroy_tx_data(struct hn_softc *);
405 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void			hn_txdesc_gc(struct hn_tx_ring *,
407 				    struct hn_txdesc *);
408 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *, struct mbuf **);
410 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 				    struct hn_txdesc *);
412 static void			hn_set_chim_size(struct hn_softc *, int);
413 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
415 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void			hn_resume_tx(struct hn_softc *, int);
417 static void			hn_set_txagg(struct hn_softc *);
418 static void			*hn_try_txagg(struct ifnet *,
419 				    struct hn_tx_ring *, struct hn_txdesc *,
420 				    int);
421 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
423 				    struct hn_softc *, struct vmbus_channel *,
424 				    const void *, int);
425 static int			hn_txpkt_sglist(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_txpkt_chim(struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static int			hn_xmit(struct hn_tx_ring *, int);
430 static void			hn_xmit_taskfunc(void *, int);
431 static void			hn_xmit_txeof(struct hn_tx_ring *);
432 static void			hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int			hn_start_locked(struct hn_tx_ring *, int);
435 static void			hn_start_taskfunc(void *, int);
436 static void			hn_start_txeof(struct hn_tx_ring *);
437 static void			hn_start_txeof_taskfunc(void *, int);
438 #endif
439 
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441     "Hyper-V network interface");
442 
443 /* Trust tcp segment verification on host side. */
444 static int			hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446     &hn_trust_hosttcp, 0,
447     "Trust tcp segement verification on host side, "
448     "when csum info is missing (global setting)");
449 
450 /* Trust udp datagrams verification on host side. */
451 static int			hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453     &hn_trust_hostudp, 0,
454     "Trust udp datagram verification on host side, "
455     "when csum info is missing (global setting)");
456 
457 /* Trust ip packets verification on host side. */
458 static int			hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460     &hn_trust_hostip, 0,
461     "Trust ip packet verification on host side, "
462     "when csum info is missing (global setting)");
463 
464 /*
465  * Offload UDP/IPv4 checksum.
466  */
467 static int			hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 
471 /*
472  * Offload UDP/IPv6 checksum.
473  */
474 static int			hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 
478 /* Stats. */
479 static counter_u64_t		hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481     &hn_udpcs_fixup, "# of UDP checksum fixup");
482 
483 /*
484  * See hn_set_hlen().
485  *
486  * This value is for Azure.  For Hyper-V, set this above
487  * 65536 to disable UDP datagram checksum fixup.
488  */
489 static int			hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
492 
493 /* Limit TSO burst size */
494 static int			hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496     &hn_tso_maxlen, 0, "TSO burst limit");
497 
498 /* Limit chimney send size */
499 static int			hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502 
503 /* Limit the size of packet for direct transmission */
504 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507 
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513     &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516 
517 static int			hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520 
521 #define HN_TX_TASKQ_M_INDEP	0
522 #define HN_TX_TASKQ_M_GLOBAL	1
523 #define HN_TX_TASKQ_M_EVTTQ	2
524 
525 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529 
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int			hn_use_txdesc_bufring = 0;
532 #else
533 static int			hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537 
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int			hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542     &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544 
545 /* # of channels to use */
546 static int			hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548     &hn_chan_cnt, 0,
549     "# of channels to use; each channel has one RX ring and one TX ring");
550 
551 /* # of transmit rings to use */
552 static int			hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554     &hn_tx_ring_cnt, 0, "# of TX rings to use");
555 
556 /* Software TX ring depth */
557 static int			hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560 
561 /* Enable sorted LRO, and set the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int			hn_lro_mbufq_depth = 0;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567 
568 /* Packet transmission aggregation size limit */
569 static int			hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572 
573 /* Packet transmission aggregation count limit */
574 static int			hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 
578 /* VF list */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
580     0, 0, hn_vflist_sysctl, "A", "VF list");
581 
582 /* VF mapping */
583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
584     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585 
586 /* Transparent VF */
587 static int			hn_xpnt_vf = 1;
588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
589     &hn_xpnt_vf, 0, "Transparent VF mod");
590 
591 /* Accurate BPF support for Transparent VF */
592 static int			hn_xpnt_vf_accbpf = 0;
593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
594     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
595 
596 /* Extra wait for the transparent VF attach routine; unit: seconds. */
597 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
599     &hn_xpnt_vf_attwait, 0,
600     "Extra wait for transparent VF attach routing; unit: seconds");
601 
602 static u_int			hn_cpu_index;	/* next CPU for channel */
603 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
604 
605 static struct rmlock		hn_vfmap_lock;
606 static int			hn_vfmap_size;
607 static struct ifnet		**hn_vfmap;
608 
609 #ifndef RSS
610 static const uint8_t
611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
612 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
613 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
614 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
615 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
616 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
617 };
618 #endif	/* !RSS */
619 
620 static const struct hyperv_guid	hn_guid = {
621 	.hv_guid = {
622 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
623 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
624 };
625 
626 static device_method_t hn_methods[] = {
627 	/* Device interface */
628 	DEVMETHOD(device_probe,		hn_probe),
629 	DEVMETHOD(device_attach,	hn_attach),
630 	DEVMETHOD(device_detach,	hn_detach),
631 	DEVMETHOD(device_shutdown,	hn_shutdown),
632 	DEVMETHOD_END
633 };
634 
635 static driver_t hn_driver = {
636 	"hn",
637 	hn_methods,
638 	sizeof(struct hn_softc)
639 };
640 
641 static devclass_t hn_devclass;
642 
643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
647 #if __FreeBSD_version >= 1100099
648 static void
649 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
650 {
651 	int i;
652 
653 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
654 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
655 }
656 #endif
657 
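/*
 * Transmit an RNDIS packet described by the TX ring's GPA
 * (scatter/gather) list; the chimney sending buffer must not be used.
 */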
658 static int
659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661 
662 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
664 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667 
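/*
 * Transmit an RNDIS packet that has been copied into the chimney
 * sending buffer slot recorded in the TX descriptor.
 */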
668 static int
669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 	struct hn_nvs_rndis rndis;
672 
673 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size > 0, ("invalid rndis chim txd"));
675 
676 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 	rndis.nvs_chim_idx = txd->chim_index;
679 	rndis.nvs_chim_sz = txd->chim_size;
680 
681 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 	    &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684 
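/*
 * Atomically allocate a free chimney sending buffer slot from the
 * bitmap; returns HN_NVS_CHIM_IDX_INVALID if no slot is available.
 */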
685 static __inline uint32_t
686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 	u_long *bmap = sc->hn_chim_bmap;
690 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691 
692 	for (i = 0; i < bmap_cnt; ++i) {
693 		int idx;
694 
695 		idx = ffsl(~bmap[i]);
696 		if (idx == 0)
697 			continue;
698 
699 		--idx; /* ffsl is 1-based */
700 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 		    ("invalid i %d and idx %d", i, idx));
702 
703 		if (atomic_testandset_long(&bmap[i], idx))
704 			continue;
705 
706 		ret = i * LONG_BIT + idx;
707 		break;
708 	}
709 	return (ret);
710 }
711 
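/*
 * Return a chimney sending buffer slot to the bitmap.
 */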
712 static __inline void
713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 	u_long mask;
716 	uint32_t idx;
717 
718 	idx = chim_idx / LONG_BIT;
719 	KASSERT(idx < sc->hn_chim_bmap_cnt,
720 	    ("invalid chimney index 0x%x", chim_idx));
721 
722 	mask = 1UL << (chim_idx % LONG_BIT);
723 	KASSERT(sc->hn_chim_bmap[idx] & mask,
724 	    ("index bitmap 0x%lx, chimney index %u, "
725 	     "bitmap idx %d, bitmask 0x%lx",
726 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727 
728 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730 
731 #if defined(INET6) || defined(INET)
732 
733 #define PULLUP_HDR(m, len)				\
734 do {							\
735 	if (__predict_false((m)->m_len < (len))) {	\
736 		(m) = m_pullup((m), (len));		\
737 		if ((m) == NULL)			\
738 			return (NULL);			\
739 	}						\
740 } while (0)
741 
742 /*
743  * NOTE: If this function fails, m_head will be freed.
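 *
 * Fix up a TSO packet for the host: record the L2/L3 header lengths,
 * zero the IP length field and seed the TCP pseudo-header checksum.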
744  */
745 static __inline struct mbuf *
746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 	struct ether_vlan_header *evl;
749 	struct tcphdr *th;
750 	int ehlen;
751 
752 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753 
754 	PULLUP_HDR(m_head, sizeof(*evl));
755 	evl = mtod(m_head, struct ether_vlan_header *);
756 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 	else
759 		ehlen = ETHER_HDR_LEN;
760 	m_head->m_pkthdr.l2hlen = ehlen;
761 
762 #ifdef INET
763 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 		struct ip *ip;
765 		int iphlen;
766 
767 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 		ip = mtodo(m_head, ehlen);
769 		iphlen = ip->ip_hl << 2;
770 		m_head->m_pkthdr.l3hlen = iphlen;
771 
772 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 		th = mtodo(m_head, ehlen + iphlen);
774 
775 		ip->ip_len = 0;
776 		ip->ip_sum = 0;
777 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 	}
780 #endif
781 #if defined(INET6) && defined(INET)
782 	else
783 #endif
784 #ifdef INET6
785 	{
786 		struct ip6_hdr *ip6;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 		ip6 = mtodo(m_head, ehlen);
790 		if (ip6->ip6_nxt != IPPROTO_TCP) {
791 			m_freem(m_head);
792 			return (NULL);
793 		}
794 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795 
796 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 		th = mtodo(m_head, ehlen + sizeof(*ip6));
798 
799 		ip6->ip6_plen = 0;
800 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 	}
802 #endif
803 	return (m_head);
804 }
805 
806 /*
807  * NOTE: If this function fails, m_head will be freed.
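 *
 * Record the L2/L3 header lengths in the packet header, and fall back
 * to a software UDP checksum when the Azure limitation described in
 * the function body applies.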
808  */
809 static __inline struct mbuf *
810 hn_set_hlen(struct mbuf *m_head)
811 {
812 	const struct ether_vlan_header *evl;
813 	int ehlen;
814 
815 	PULLUP_HDR(m_head, sizeof(*evl));
816 	evl = mtod(m_head, const struct ether_vlan_header *);
817 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
818 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
819 	else
820 		ehlen = ETHER_HDR_LEN;
821 	m_head->m_pkthdr.l2hlen = ehlen;
822 
823 #ifdef INET
824 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
825 		const struct ip *ip;
826 		int iphlen;
827 
828 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
829 		ip = mtodo(m_head, ehlen);
830 		iphlen = ip->ip_hl << 2;
831 		m_head->m_pkthdr.l3hlen = iphlen;
832 
833 		/*
834 		 * UDP checksum offload does not work in Azure if the
835 		 * following conditions are met:
836 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
837 		 * - IP_DF is not set in the IP hdr.
838 		 *
839 		 * Fallback to software checksum for these UDP datagrams.
840 		 */
841 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
842 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
843 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
844 			uint16_t off = ehlen + iphlen;
845 
846 			counter_u64_add(hn_udpcs_fixup, 1);
847 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
848 			*(uint16_t *)(m_head->m_data + off +
849                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
850 			    m_head, m_head->m_pkthdr.len, off);
851 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
852 		}
853 	}
854 #endif
855 #if defined(INET6) && defined(INET)
856 	else
857 #endif
858 #ifdef INET6
859 	{
860 		const struct ip6_hdr *ip6;
861 
862 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
863 		ip6 = mtodo(m_head, ehlen);
864 		if (ip6->ip6_nxt != IPPROTO_TCP) {
865 			m_freem(m_head);
866 			return (NULL);
867 		}
868 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
869 	}
870 #endif
871 	return (m_head);
872 }
873 
874 /*
875  * NOTE: If this function fails, m_head will be freed.
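 *
 * Set *tcpsyn to 1 if the TCP segment carries the SYN flag.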
876  */
877 static __inline struct mbuf *
878 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
879 {
880 	const struct tcphdr *th;
881 	int ehlen, iphlen;
882 
883 	*tcpsyn = 0;
884 	ehlen = m_head->m_pkthdr.l2hlen;
885 	iphlen = m_head->m_pkthdr.l3hlen;
886 
887 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
888 	th = mtodo(m_head, ehlen + iphlen);
889 	if (th->th_flags & TH_SYN)
890 		*tcpsyn = 1;
891 	return (m_head);
892 }
893 
894 #undef PULLUP_HDR
895 
896 #endif	/* INET6 || INET */
897 
898 static int
899 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
900 {
901 	int error = 0;
902 
903 	HN_LOCK_ASSERT(sc);
904 
905 	if (sc->hn_rx_filter != filter) {
906 		error = hn_rndis_set_rxfilter(sc, filter);
907 		if (!error)
908 			sc->hn_rx_filter = filter;
909 	}
910 	return (error);
911 }
912 
913 static int
914 hn_rxfilter_config(struct hn_softc *sc)
915 {
916 	struct ifnet *ifp = sc->hn_ifp;
917 	uint32_t filter;
918 
919 	HN_LOCK_ASSERT(sc);
920 
921 	/*
922 	 * If the non-transparent mode VF is activated, we don't know how
923 	 * its RX filter is configured, so put the synthetic device in
924 	 * promiscuous mode.
925 	 */
926 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
927 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
928 	} else {
929 		filter = NDIS_PACKET_TYPE_DIRECTED;
930 		if (ifp->if_flags & IFF_BROADCAST)
931 			filter |= NDIS_PACKET_TYPE_BROADCAST;
932 		/* TODO: support multicast list */
933 		if ((ifp->if_flags & IFF_ALLMULTI) ||
934 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
935 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
936 	}
937 	return (hn_set_rxfilter(sc, filter));
938 }
939 
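/*
 * Propagate the transmission aggregation size/packet limits, clamped
 * by what the host offers and by the chimney sending buffer size, to
 * all TX rings.
 */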
940 static void
941 hn_set_txagg(struct hn_softc *sc)
942 {
943 	uint32_t size, pkts;
944 	int i;
945 
946 	/*
947 	 * Setup aggregation size.
948 	 */
949 	if (sc->hn_agg_size < 0)
950 		size = UINT32_MAX;
951 	else
952 		size = sc->hn_agg_size;
953 
954 	if (sc->hn_rndis_agg_size < size)
955 		size = sc->hn_rndis_agg_size;
956 
957 	/* NOTE: We only aggregate packets using chimney sending buffers. */
958 	if (size > (uint32_t)sc->hn_chim_szmax)
959 		size = sc->hn_chim_szmax;
960 
961 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
962 		/* Disable */
963 		size = 0;
964 		pkts = 0;
965 		goto done;
966 	}
967 
968 	/* NOTE: Type of the per TX ring setting is 'int'. */
969 	if (size > INT_MAX)
970 		size = INT_MAX;
971 
972 	/*
973 	 * Setup aggregation packet count.
974 	 */
975 	if (sc->hn_agg_pkts < 0)
976 		pkts = UINT32_MAX;
977 	else
978 		pkts = sc->hn_agg_pkts;
979 
980 	if (sc->hn_rndis_agg_pkts < pkts)
981 		pkts = sc->hn_rndis_agg_pkts;
982 
983 	if (pkts <= 1) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'short'. */
991 	if (pkts > SHRT_MAX)
992 		pkts = SHRT_MAX;
993 
994 done:
995 	/* NOTE: Type of the per TX ring setting is 'short'. */
996 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
997 		/* Disable */
998 		size = 0;
999 		pkts = 0;
1000 	}
1001 
1002 	if (bootverbose) {
1003 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1004 		    size, pkts, sc->hn_rndis_agg_align);
1005 	}
1006 
1007 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1008 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1009 
1010 		mtx_lock(&txr->hn_tx_lock);
1011 		txr->hn_agg_szmax = size;
1012 		txr->hn_agg_pktmax = pkts;
1013 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1014 		mtx_unlock(&txr->hn_tx_lock);
1015 	}
1016 }
1017 
1018 static int
1019 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1020 {
1021 
1022 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1023 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1024 		return txr->hn_txdesc_cnt;
1025 	return hn_tx_swq_depth;
1026 }
1027 
1028 static int
1029 hn_rss_reconfig(struct hn_softc *sc)
1030 {
1031 	int error;
1032 
1033 	HN_LOCK_ASSERT(sc);
1034 
1035 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1036 		return (ENXIO);
1037 
1038 	/*
1039 	 * Disable RSS first.
1040 	 *
1041 	 * NOTE:
1042 	 * Direct reconfiguration by setting the UNCHG flags does
1043 	 * _not_ work properly.
1044 	 */
1045 	if (bootverbose)
1046 		if_printf(sc->hn_ifp, "disable RSS\n");
1047 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1048 	if (error) {
1049 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1050 		return (error);
1051 	}
1052 
1053 	/*
1054 	 * Reenable the RSS w/ the updated RSS key or indirect
1055 	 * table.
1056 	 */
1057 	if (bootverbose)
1058 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1059 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1060 	if (error) {
1061 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1062 		return (error);
1063 	}
1064 	return (0);
1065 }
1066 
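/*
 * Clamp RSS indirect table entries to the number of channels
 * currently in use.
 */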
1067 static void
1068 hn_rss_ind_fixup(struct hn_softc *sc)
1069 {
1070 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1071 	int i, nchan;
1072 
1073 	nchan = sc->hn_rx_ring_inuse;
1074 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1075 
1076 	/*
1077 	 * Check indirect table to make sure that all channels in it
1078 	 * can be used.
1079 	 */
1080 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1081 		if (rss->rss_ind[i] >= nchan) {
1082 			if_printf(sc->hn_ifp,
1083 			    "RSS indirect table %d fixup: %u -> %d\n",
1084 			    i, rss->rss_ind[i], nchan - 1);
1085 			rss->rss_ind[i] = nchan - 1;
1086 		}
1087 	}
1088 }
1089 
1090 static int
1091 hn_ifmedia_upd(struct ifnet *ifp __unused)
1092 {
1093 
1094 	return EOPNOTSUPP;
1095 }
1096 
1097 static void
1098 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1099 {
1100 	struct hn_softc *sc = ifp->if_softc;
1101 
1102 	ifmr->ifm_status = IFM_AVALID;
1103 	ifmr->ifm_active = IFM_ETHER;
1104 
1105 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1106 		ifmr->ifm_active |= IFM_NONE;
1107 		return;
1108 	}
1109 	ifmr->ifm_status |= IFM_ACTIVE;
1110 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1111 }
1112 
1113 static void
1114 hn_rxvf_set_task(void *xarg, int pending __unused)
1115 {
1116 	struct hn_rxvf_setarg *arg = xarg;
1117 
1118 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1119 }
1120 
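/*
 * Point each RX ring at the given VF ifnet.  Rings that are in use
 * are updated from their channel's task thread to avoid racing with
 * the RX path.
 */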
1121 static void
1122 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1123 {
1124 	struct hn_rx_ring *rxr;
1125 	struct hn_rxvf_setarg arg;
1126 	struct task task;
1127 	int i;
1128 
1129 	HN_LOCK_ASSERT(sc);
1130 
1131 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1132 
1133 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1134 		rxr = &sc->hn_rx_ring[i];
1135 
1136 		if (i < sc->hn_rx_ring_inuse) {
1137 			arg.rxr = rxr;
1138 			arg.vf_ifp = vf_ifp;
1139 			vmbus_chan_run_task(rxr->hn_chan, &task);
1140 		} else {
1141 			rxr->hn_rxvf_ifp = vf_ifp;
1142 		}
1143 	}
1144 }
1145 
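/*
 * Check whether the given ifnet is the VF paired with this synthetic
 * device, i.e. an Ethernet interface sharing our MAC address.
 */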
1146 static bool
1147 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1148 {
1149 	const struct ifnet *hn_ifp;
1150 
1151 	hn_ifp = sc->hn_ifp;
1152 
1153 	if (ifp == hn_ifp)
1154 		return (false);
1155 
1156 	if (ifp->if_alloctype != IFT_ETHER)
1157 		return (false);
1158 
1159 	/* Ignore lagg/vlan interfaces */
1160 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1161 	    strcmp(ifp->if_dname, "vlan") == 0)
1162 		return (false);
1163 
1164 	/*
1165 	 * During detach events ifp->if_addr might be NULL.
1166 	 * Make sure the bcmp() below doesn't panic on that:
1167 	 */
1168 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1169 		return (false);
1170 
1171 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1172 		return (false);
1173 
1174 	return (true);
1175 }
1176 
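/*
 * Switch the RX data path between the synthetic device and the
 * non-transparent mode VF: update the RX filter, the NVS data path,
 * and the per-ring VF ifnet, and suspend/resume link management
 * accordingly.
 */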
1177 static void
1178 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1179 {
1180 	struct ifnet *hn_ifp;
1181 
1182 	HN_LOCK(sc);
1183 
1184 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1185 		goto out;
1186 
1187 	if (!hn_ismyvf(sc, ifp))
1188 		goto out;
1189 	hn_ifp = sc->hn_ifp;
1190 
1191 	if (rxvf) {
1192 		if (sc->hn_flags & HN_FLAG_RXVF)
1193 			goto out;
1194 
1195 		sc->hn_flags |= HN_FLAG_RXVF;
1196 		hn_rxfilter_config(sc);
1197 	} else {
1198 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1199 			goto out;
1200 
1201 		sc->hn_flags &= ~HN_FLAG_RXVF;
1202 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1203 			hn_rxfilter_config(sc);
1204 		else
1205 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1206 	}
1207 
1208 	hn_nvs_set_datapath(sc,
1209 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1210 
1211 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1212 
1213 	if (rxvf) {
1214 		hn_vf_rss_fixup(sc, true);
1215 		hn_suspend_mgmt(sc);
1216 		sc->hn_link_flags &=
1217 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1218 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1219 	} else {
1220 		hn_vf_rss_restore(sc);
1221 		hn_resume_mgmt(sc);
1222 	}
1223 
1224 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1225 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1226 
1227 	if (bootverbose) {
1228 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1229 		    rxvf ? "to" : "from", ifp->if_xname);
1230 	}
1231 out:
1232 	HN_UNLOCK(sc);
1233 }
1234 
1235 static void
1236 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1237 {
1238 
1239 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1240 		return;
1241 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1242 }
1243 
1244 static void
1245 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1246 {
1247 
1248 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1249 }
1250 
1251 static int
1252 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1253 {
1254 	struct ifnet *ifp, *vf_ifp;
1255 	uint64_t tmp;
1256 	int error;
1257 
1258 	HN_LOCK_ASSERT(sc);
1259 	ifp = sc->hn_ifp;
1260 	vf_ifp = sc->hn_vf_ifp;
1261 
1262 	/*
1263 	 * Fix up requested capabilities w/ supported capabilities,
1264 	 * since the supported capabilities could have been changed.
1265 	 */
1266 	ifr->ifr_reqcap &= ifp->if_capabilities;
1267 	/* Pass SIOCSIFCAP to VF. */
1268 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1269 
1270 	/*
1271 	 * NOTE:
1272 	 * The error will be propagated to the callers; however, it
1273 	 * is _not_ useful here.
1274 	 */
1275 
1276 	/*
1277 	 * Merge VF's enabled capabilities.
1278 	 */
1279 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1280 
1281 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1282 	if (ifp->if_capenable & IFCAP_TXCSUM)
1283 		ifp->if_hwassist |= tmp;
1284 	else
1285 		ifp->if_hwassist &= ~tmp;
1286 
1287 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1288 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1289 		ifp->if_hwassist |= tmp;
1290 	else
1291 		ifp->if_hwassist &= ~tmp;
1292 
1293 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1294 	if (ifp->if_capenable & IFCAP_TSO4)
1295 		ifp->if_hwassist |= tmp;
1296 	else
1297 		ifp->if_hwassist &= ~tmp;
1298 
1299 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1300 	if (ifp->if_capenable & IFCAP_TSO6)
1301 		ifp->if_hwassist |= tmp;
1302 	else
1303 		ifp->if_hwassist &= ~tmp;
1304 
1305 	return (error);
1306 }
1307 
1308 static int
1309 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1310 {
1311 	struct ifnet *vf_ifp;
1312 	struct ifreq ifr;
1313 
1314 	HN_LOCK_ASSERT(sc);
1315 	vf_ifp = sc->hn_vf_ifp;
1316 
1317 	memset(&ifr, 0, sizeof(ifr));
1318 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1319 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1320 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1321 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1322 }
1323 
1324 static void
1325 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1326 {
1327 	struct ifnet *ifp = sc->hn_ifp;
1328 	int allmulti = 0;
1329 
1330 	HN_LOCK_ASSERT(sc);
1331 
1332 	/* XXX vlan(4) style mcast addr maintenance */
1333 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1334 		allmulti = IFF_ALLMULTI;
1335 
1336 	/* Always set the VF's if_flags */
1337 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1338 }
1339 
1340 static void
1341 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1342 {
1343 	struct rm_priotracker pt;
1344 	struct ifnet *hn_ifp = NULL;
1345 	struct mbuf *mn;
1346 
1347 	/*
1348 	 * XXX racy, if hn(4) ever detached.
1349 	 */
1350 	rm_rlock(&hn_vfmap_lock, &pt);
1351 	if (vf_ifp->if_index < hn_vfmap_size)
1352 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1353 	rm_runlock(&hn_vfmap_lock, &pt);
1354 
1355 	if (hn_ifp != NULL) {
1356 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1357 			/*
1358 			 * Allow tapping on the VF.
1359 			 */
1360 			ETHER_BPF_MTAP(vf_ifp, mn);
1361 
1362 			/*
1363 			 * Update VF stats.
1364 			 */
1365 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1366 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1367 				    mn->m_pkthdr.len);
1368 			}
1369 			/*
1370 			 * XXX IFCOUNTER_IMCAST
1371 			 * This stat updating is kinda invasive, since it
1372 			 * requires two checks on the mbuf: the length check
1373 			 * and the ethernet header check.  As of this writing,
1374 			 * all multicast packets go directly to hn(4), which
1375 			 * makes imcast stat updating in the VF pointless.
1376 			 */
1377 
1378 			/*
1379 			 * Fix up rcvif and increase hn(4)'s ipackets.
1380 			 */
1381 			mn->m_pkthdr.rcvif = hn_ifp;
1382 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1383 		}
1384 		/*
1385 		 * Go through hn(4)'s if_input.
1386 		 */
1387 		hn_ifp->if_input(hn_ifp, m);
1388 	} else {
1389 		/*
1390 		 * In the middle of the transition; free this
1391 		 * mbuf chain.
1392 		 */
1393 		while (m != NULL) {
1394 			mn = m->m_nextpkt;
1395 			m->m_nextpkt = NULL;
1396 			m_freem(m);
1397 			m = mn;
1398 		}
1399 	}
1400 }
1401 
1402 static void
1403 hn_mtu_change_fixup(struct hn_softc *sc)
1404 {
1405 	struct ifnet *ifp;
1406 
1407 	HN_LOCK_ASSERT(sc);
1408 	ifp = sc->hn_ifp;
1409 
1410 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1411 #if __FreeBSD_version >= 1100099
1412 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1413 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1414 #endif
1415 }
1416 
1417 static uint32_t
1418 hn_rss_type_fromndis(uint32_t rss_hash)
1419 {
1420 	uint32_t types = 0;
1421 
1422 	if (rss_hash & NDIS_HASH_IPV4)
1423 		types |= RSS_TYPE_IPV4;
1424 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1425 		types |= RSS_TYPE_TCP_IPV4;
1426 	if (rss_hash & NDIS_HASH_IPV6)
1427 		types |= RSS_TYPE_IPV6;
1428 	if (rss_hash & NDIS_HASH_IPV6_EX)
1429 		types |= RSS_TYPE_IPV6_EX;
1430 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1431 		types |= RSS_TYPE_TCP_IPV6;
1432 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1433 		types |= RSS_TYPE_TCP_IPV6_EX;
1434 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1435 		types |= RSS_TYPE_UDP_IPV4;
1436 	return (types);
1437 }
1438 
1439 static uint32_t
1440 hn_rss_type_tondis(uint32_t types)
1441 {
1442 	uint32_t rss_hash = 0;
1443 
1444 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1445 	    ("UDP6 and UDP6EX are not supported"));
1446 
1447 	if (types & RSS_TYPE_IPV4)
1448 		rss_hash |= NDIS_HASH_IPV4;
1449 	if (types & RSS_TYPE_TCP_IPV4)
1450 		rss_hash |= NDIS_HASH_TCP_IPV4;
1451 	if (types & RSS_TYPE_IPV6)
1452 		rss_hash |= NDIS_HASH_IPV6;
1453 	if (types & RSS_TYPE_IPV6_EX)
1454 		rss_hash |= NDIS_HASH_IPV6_EX;
1455 	if (types & RSS_TYPE_TCP_IPV6)
1456 		rss_hash |= NDIS_HASH_TCP_IPV6;
1457 	if (types & RSS_TYPE_TCP_IPV6_EX)
1458 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1459 	if (types & RSS_TYPE_UDP_IPV4)
1460 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1461 	return (rss_hash);
1462 }
1463 
1464 static void
1465 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1466 {
1467 	int i;
1468 
1469 	HN_LOCK_ASSERT(sc);
1470 
1471 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1472 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1473 }
1474 
1475 static void
1476 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1477 {
1478 	struct ifnet *ifp, *vf_ifp;
1479 	struct ifrsshash ifrh;
1480 	struct ifrsskey ifrk;
1481 	int error;
1482 	uint32_t my_types, diff_types, mbuf_types = 0;
1483 
1484 	HN_LOCK_ASSERT(sc);
1485 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1486 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1487 
1488 	if (sc->hn_rx_ring_inuse == 1) {
1489 		/* No RSS on synthetic parts; done. */
1490 		return;
1491 	}
1492 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1493 		/* Synthetic parts do not support Toeplitz; done. */
1494 		return;
1495 	}
1496 
1497 	ifp = sc->hn_ifp;
1498 	vf_ifp = sc->hn_vf_ifp;
1499 
1500 	/*
1501 	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1502 	 * supported.
1503 	 */
1504 	memset(&ifrk, 0, sizeof(ifrk));
1505 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1506 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1507 	if (error) {
1508 		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
1509 		    vf_ifp->if_xname, error);
1510 		goto done;
1511 	}
1512 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1513 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1514 		    vf_ifp->if_xname, ifrk.ifrk_func);
1515 		goto done;
1516 	}
1517 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1518 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1519 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1520 		goto done;
1521 	}
1522 
1523 	/*
1524 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1525 	 */
1526 	memset(&ifrh, 0, sizeof(ifrh));
1527 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1528 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1529 	if (error) {
1530 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1531 		    vf_ifp->if_xname, error);
1532 		goto done;
1533 	}
1534 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1535 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1536 		    vf_ifp->if_xname, ifrh.ifrh_func);
1537 		goto done;
1538 	}
1539 
1540 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1541 	if ((ifrh.ifrh_types & my_types) == 0) {
1542 		/* An empty intersection would disable RSS; ignore the VF's types. */
1543 		if_printf(ifp, "%s intersection of RSS types failed.  "
1544 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1545 		    ifrh.ifrh_types, my_types);
1546 		goto done;
1547 	}
1548 
1549 	diff_types = my_types ^ ifrh.ifrh_types;
1550 	my_types &= ifrh.ifrh_types;
1551 	mbuf_types = my_types;
1552 
1553 	/*
1554 	 * Detect RSS hash value/type conflicts.
1555 	 *
1556 	 * NOTE:
1557 	 * We don't disable the hash type, but we stop delivering the hash
1558 	 * value/type through mbufs on the RX path.
1559 	 *
1560 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1561 	 * hash is delivered with type of TCP_IPV4.  This means if
1562 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1563 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1564 	 * NICs implement TCP_IPV4, this will _not_ cause any issues
1565 	 * here.
1566 	 */
1567 	if ((my_types & RSS_TYPE_IPV4) &&
1568 	    (diff_types & ifrh.ifrh_types &
1569 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1570 		/* Conflict; disable IPV4 hash type/value delivery. */
1571 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1572 		mbuf_types &= ~RSS_TYPE_IPV4;
1573 	}
1574 	if ((my_types & RSS_TYPE_IPV6) &&
1575 	    (diff_types & ifrh.ifrh_types &
1576 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1577 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1578 	      RSS_TYPE_IPV6_EX))) {
1579 		/* Conflict; disable IPV6 hash type/value delivery. */
1580 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1581 		mbuf_types &= ~RSS_TYPE_IPV6;
1582 	}
1583 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1584 	    (diff_types & ifrh.ifrh_types &
1585 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1586 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1587 	      RSS_TYPE_IPV6))) {
1588 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1589 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1590 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1591 	}
1592 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1593 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1594 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1595 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1596 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1597 	}
1598 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1599 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1600 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1601 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1602 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1603 	}
1604 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1605 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1606 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1607 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1608 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1609 	}
1610 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1611 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1612 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1613 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1614 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1615 	}
1616 
1617 	/*
1618 	 * Indirect table does not matter.
1619 	 */
1620 
1621 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1622 	    hn_rss_type_tondis(my_types);
1623 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1624 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1625 
1626 	if (reconf) {
1627 		error = hn_rss_reconfig(sc);
1628 		if (error) {
1629 			/* XXX roll-back? */
1630 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1631 			/* XXX keep going. */
1632 		}
1633 	}
1634 done:
1635 	/* Hash deliverability for mbufs. */
1636 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1637 }
1638 
1639 static void
1640 hn_vf_rss_restore(struct hn_softc *sc)
1641 {
1642 
1643 	HN_LOCK_ASSERT(sc);
1644 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1645 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1646 
1647 	if (sc->hn_rx_ring_inuse == 1)
1648 		goto done;
1649 
1650 	/*
1651 	 * Restore hash types.  Key does _not_ matter.
1652 	 */
1653 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1654 		int error;
1655 
1656 		sc->hn_rss_hash = sc->hn_rss_hcap;
1657 		error = hn_rss_reconfig(sc);
1658 		if (error) {
1659 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1660 			    error);
1661 			/* XXX keep going. */
1662 		}
1663 	}
1664 done:
1665 	/* Hash deliverability for mbufs. */
1666 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1667 }
1668 
1669 static void
1670 hn_xpnt_vf_setready(struct hn_softc *sc)
1671 {
1672 	struct ifnet *ifp, *vf_ifp;
1673 	struct ifreq ifr;
1674 
1675 	HN_LOCK_ASSERT(sc);
1676 	ifp = sc->hn_ifp;
1677 	vf_ifp = sc->hn_vf_ifp;
1678 
1679 	/*
1680 	 * Mark the VF ready.
1681 	 */
1682 	sc->hn_vf_rdytick = 0;
1683 
1684 	/*
1685 	 * Save information for restoration.
1686 	 */
1687 	sc->hn_saved_caps = ifp->if_capabilities;
1688 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1689 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1690 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1691 
1692 	/*
1693 	 * Intersect supported/enabled capabilities.
1694 	 *
1695 	 * NOTE:
1696 	 * if_hwassist is not changed here.
1697 	 */
1698 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1699 	ifp->if_capenable &= ifp->if_capabilities;
1700 
1701 	/*
1702 	 * Fix TSO settings.
1703 	 */
1704 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1705 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1706 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1707 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1708 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1709 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1710 
1711 	/*
1712 	 * Change VF's enabled capabilities.
1713 	 */
1714 	memset(&ifr, 0, sizeof(ifr));
1715 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1716 	ifr.ifr_reqcap = ifp->if_capenable;
1717 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1718 
1719 	if (ifp->if_mtu != ETHERMTU) {
1720 		int error;
1721 
1722 		/*
1723 		 * Change VF's MTU.
1724 		 */
1725 		memset(&ifr, 0, sizeof(ifr));
1726 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1727 		ifr.ifr_mtu = ifp->if_mtu;
1728 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1729 		if (error) {
1730 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1731 			    vf_ifp->if_xname, ifp->if_mtu);
1732 			if (ifp->if_mtu > ETHERMTU) {
1733 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1734 
1735 				/*
1736 				 * XXX
1737 				 * No need to adjust the synthetic parts' MTU;
1738 				 * failure of the adjustment will cause us
1739 				 * infinite headache.
1740 				 */
1741 				ifp->if_mtu = ETHERMTU;
1742 				hn_mtu_change_fixup(sc);
1743 			}
1744 		}
1745 	}
1746 }
1747 
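/*
 * Check whether the transparent mode VF is ready for use: the VF is
 * ready once hn_vf_rdytick has been cleared, or once the attach-wait
 * deadline recorded in hn_vf_rdytick has passed, in which case the VF
 * is marked ready here.
 */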
1748 static bool
1749 hn_xpnt_vf_isready(struct hn_softc *sc)
1750 {
1751 
1752 	HN_LOCK_ASSERT(sc);
1753 
1754 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1755 		return (false);
1756 
1757 	if (sc->hn_vf_rdytick == 0)
1758 		return (true);
1759 
1760 	if (sc->hn_vf_rdytick > ticks)
1761 		return (false);
1762 
1763 	/* Mark VF as ready. */
1764 	hn_xpnt_vf_setready(sc);
1765 	return (true);
1766 }
1767 
1768 static void
1769 hn_xpnt_vf_setenable(struct hn_softc *sc)
1770 {
1771 	int i;
1772 
1773 	HN_LOCK_ASSERT(sc);
1774 
1775 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1776 	rm_wlock(&sc->hn_vf_lock);
1777 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1778 	rm_wunlock(&sc->hn_vf_lock);
1779 
1780 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1781 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1782 }
1783 
1784 static void
1785 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1786 {
1787 	int i;
1788 
1789 	HN_LOCK_ASSERT(sc);
1790 
1791 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1792 	rm_wlock(&sc->hn_vf_lock);
1793 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1794 	if (clear_vf)
1795 		sc->hn_vf_ifp = NULL;
1796 	rm_wunlock(&sc->hn_vf_lock);
1797 
1798 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1799 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1800 }
1801 
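/*
 * Bring the transparent mode VF up: set IFF_UP on the VF, switch the
 * NVS datapath to the VF, fix up the RSS settings, and mark transparent
 * VF mode as enabled on all RX rings.
 */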
1802 static void
1803 hn_xpnt_vf_init(struct hn_softc *sc)
1804 {
1805 	int error;
1806 
1807 	HN_LOCK_ASSERT(sc);
1808 
1809 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1810 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1811 
1812 	if (bootverbose) {
1813 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1814 		    sc->hn_vf_ifp->if_xname);
1815 	}
1816 
1817 	/*
1818 	 * Bring the VF up.
1819 	 */
1820 	hn_xpnt_vf_saveifflags(sc);
1821 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1822 	error = hn_xpnt_vf_iocsetflags(sc);
1823 	if (error) {
1824 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1825 		    sc->hn_vf_ifp->if_xname, error);
1826 		return;
1827 	}
1828 
1829 	/*
1830 	 * NOTE:
1831 	 * Datapath setting must happen _after_ bringing the VF up.
1832 	 */
1833 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1834 
1835 	/*
1836 	 * NOTE:
1837 	 * Fixup RSS related bits _after_ the VF is brought up, since
1838 	 * many VFs generate their RSS key during initialization.
1839 	 */
1840 	hn_vf_rss_fixup(sc, true);
1841 
1842 	/* Mark transparent mode VF as enabled. */
1843 	hn_xpnt_vf_setenable(sc);
1844 }
1845 
1846 static void
1847 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1848 {
1849 	struct hn_softc *sc = xsc;
1850 
1851 	HN_LOCK(sc);
1852 
1853 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1854 		goto done;
1855 	if (sc->hn_vf_ifp == NULL)
1856 		goto done;
1857 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1858 		goto done;
1859 
1860 	if (sc->hn_vf_rdytick != 0) {
1861 		/* Mark VF as ready. */
1862 		hn_xpnt_vf_setready(sc);
1863 	}
1864 
1865 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1866 		/*
1867 		 * Delayed VF initialization.
1868 		 */
1869 		if (bootverbose) {
1870 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1871 			    sc->hn_vf_ifp->if_xname);
1872 		}
1873 		hn_xpnt_vf_init(sc);
1874 	}
1875 done:
1876 	HN_UNLOCK(sc);
1877 }
1878 
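/*
 * ether_ifattach event handler.  If the newly attached ifnet is this
 * device's VF, record the mapping in hn_vfmap and sc->hn_vf_ifp.  In
 * transparent VF mode, also hook the VF's if_input, suspend the
 * synthetic link status management, and schedule the delayed VF
 * initialization after the attach-wait period.
 */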
1879 static void
1880 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1881 {
1882 	struct hn_softc *sc = xsc;
1883 
1884 	HN_LOCK(sc);
1885 
1886 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1887 		goto done;
1888 
1889 	if (!hn_ismyvf(sc, ifp))
1890 		goto done;
1891 
1892 	if (sc->hn_vf_ifp != NULL) {
1893 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1894 		    sc->hn_vf_ifp->if_xname);
1895 		goto done;
1896 	}
1897 
1898 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1899 		/*
1900 		 * ifnet.if_start is _not_ supported by transparent
1901 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1902 		 */
1903 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1904 		    "in transparent VF mode.\n", ifp->if_xname);
1905 		goto done;
1906 	}
1907 
1908 	rm_wlock(&hn_vfmap_lock);
1909 
1910 	if (ifp->if_index >= hn_vfmap_size) {
1911 		struct ifnet **newmap;
1912 		int newsize;
1913 
1914 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1915 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1916 		    M_WAITOK | M_ZERO);
1917 
1918 		memcpy(newmap, hn_vfmap,
1919 		    sizeof(struct ifnet *) * hn_vfmap_size);
1920 		free(hn_vfmap, M_DEVBUF);
1921 		hn_vfmap = newmap;
1922 		hn_vfmap_size = newsize;
1923 	}
1924 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1925 	    ("%s: ifindex %d was mapped to %s",
1926 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1927 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1928 
1929 	rm_wunlock(&hn_vfmap_lock);
1930 
1931 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1932 	rm_wlock(&sc->hn_vf_lock);
1933 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1934 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1935 	sc->hn_vf_ifp = ifp;
1936 	rm_wunlock(&sc->hn_vf_lock);
1937 
1938 	if (hn_xpnt_vf) {
1939 		int wait_ticks;
1940 
1941 		/*
1942 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1943 		 * Save vf_ifp's current if_input for later restoration.
1944 		 */
1945 		sc->hn_vf_input = ifp->if_input;
1946 		ifp->if_input = hn_xpnt_vf_input;
1947 
1948 		/*
1949 		 * Stop link status management; use the VF's.
1950 		 */
1951 		hn_suspend_mgmt(sc);
1952 
1953 		/*
1954 		 * Give the VF some time to complete its attach routine.
1955 		 */
1956 		wait_ticks = hn_xpnt_vf_attwait * hz;
1957 		sc->hn_vf_rdytick = ticks + wait_ticks;
1958 
1959 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1960 		    wait_ticks);
1961 	}
1962 done:
1963 	HN_UNLOCK(sc);
1964 }
1965 
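/*
 * ifnet_departure event handler; undoes hn_ifnet_attevent().  For a
 * transparent mode VF this restores the VF's if_input, switches the
 * datapath back to the synthetic parts, restores the saved
 * capabilities/TSO and RSS settings, and resumes link status
 * management.  Finally the VF is disabled and the hn_vfmap entry is
 * cleared.
 */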
1966 static void
1967 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1968 {
1969 	struct hn_softc *sc = xsc;
1970 
1971 	HN_LOCK(sc);
1972 
1973 	if (sc->hn_vf_ifp == NULL)
1974 		goto done;
1975 
1976 	if (!hn_ismyvf(sc, ifp))
1977 		goto done;
1978 
1979 	if (hn_xpnt_vf) {
1980 		/*
1981 		 * Make sure that the delayed initialization is not running.
1982 		 *
1983 		 * NOTE:
1984 		 * - This lock _must_ be released, since the hn_vf_init task
1985 		 *   will try holding this lock.
1986 		 * - It is safe to release this lock here, since the
1987 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1988 		 *
1989 		 * XXX racy, if hn(4) ever detached.
1990 		 */
1991 		HN_UNLOCK(sc);
1992 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1993 		HN_LOCK(sc);
1994 
1995 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1996 		    sc->hn_ifp->if_xname));
1997 		ifp->if_input = sc->hn_vf_input;
1998 		sc->hn_vf_input = NULL;
1999 
2000 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2001 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2002 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2003 
2004 		if (sc->hn_vf_rdytick == 0) {
2005 			/*
2006 			 * The VF was ready; restore some settings.
2007 			 */
2008 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2009 			/*
2010 			 * NOTE:
2011 			 * There is _no_ need to fixup if_capenable and
2012 			 * if_hwassist, since the if_capabilities before
2013 			 * restoration was an intersection of the VF's
2014 			 * if_capabilities and the synthetic device's
2015 			 * if_capabilities.
2016 			 */
2017 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2018 			sc->hn_ifp->if_hw_tsomaxsegcount =
2019 			    sc->hn_saved_tsosegcnt;
2020 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2021 		}
2022 
2023 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2024 			/*
2025 			 * Restore RSS settings.
2026 			 */
2027 			hn_vf_rss_restore(sc);
2028 
2029 			/*
2030 			 * Resume link status management, which was suspended
2031 			 * by hn_ifnet_attevent().
2032 			 */
2033 			hn_resume_mgmt(sc);
2034 		}
2035 	}
2036 
2037 	/* Mark transparent mode VF as disabled. */
2038 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2039 
2040 	rm_wlock(&hn_vfmap_lock);
2041 
2042 	KASSERT(ifp->if_index < hn_vfmap_size,
2043 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2044 	if (hn_vfmap[ifp->if_index] != NULL) {
2045 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2046 		    ("%s: ifindex %d was mapped to %s",
2047 		     ifp->if_xname, ifp->if_index,
2048 		     hn_vfmap[ifp->if_index]->if_xname));
2049 		hn_vfmap[ifp->if_index] = NULL;
2050 	}
2051 
2052 	rm_wunlock(&hn_vfmap_lock);
2053 done:
2054 	HN_UNLOCK(sc);
2055 }
2056 
2057 static void
2058 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2059 {
2060 	struct hn_softc *sc = xsc;
2061 
2062 	if (sc->hn_vf_ifp == ifp)
2063 		if_link_state_change(sc->hn_ifp, link_state);
2064 }
2065 
2066 static int
2067 hn_probe(device_t dev)
2068 {
2069 
2070 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2071 		device_set_desc(dev, "Hyper-V Network Interface");
2072 		return BUS_PROBE_DEFAULT;
2073 	}
2074 	return ENXIO;
2075 }
2076 
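/*
 * Device attach: create the TX/RX rings and taskqueues, attach the
 * synthetic parts (NVS and RNDIS), set up sysctl nodes, ifmedia and
 * the ifnet, and register the VF related event handlers.
 */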
2077 static int
2078 hn_attach(device_t dev)
2079 {
2080 	struct hn_softc *sc = device_get_softc(dev);
2081 	struct sysctl_oid_list *child;
2082 	struct sysctl_ctx_list *ctx;
2083 	uint8_t eaddr[ETHER_ADDR_LEN];
2084 	struct ifnet *ifp = NULL;
2085 	int error, ring_cnt, tx_ring_cnt;
2086 	uint32_t mtu;
2087 
2088 	sc->hn_dev = dev;
2089 	sc->hn_prichan = vmbus_get_channel(dev);
2090 	HN_LOCK_INIT(sc);
2091 	rm_init(&sc->hn_vf_lock, "hnvf");
2092 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2093 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2094 
2095 	/*
2096 	 * Initialize these tunables once.
2097 	 */
2098 	sc->hn_agg_size = hn_tx_agg_size;
2099 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2100 
2101 	/*
2102 	 * Setup taskqueue for transmission.
2103 	 */
2104 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2105 		int i;
2106 
2107 		sc->hn_tx_taskqs =
2108 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2109 		    M_DEVBUF, M_WAITOK);
2110 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2111 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2112 			    M_WAITOK, taskqueue_thread_enqueue,
2113 			    &sc->hn_tx_taskqs[i]);
2114 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2115 			    "%s tx%d", device_get_nameunit(dev), i);
2116 		}
2117 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2118 		sc->hn_tx_taskqs = hn_tx_taskque;
2119 	}
2120 
2121 	/*
2122 	 * Setup taskqueue for management tasks, e.g. link status.
2123 	 */
2124 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2125 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2126 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2127 	    device_get_nameunit(dev));
2128 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2129 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2130 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2131 	    hn_netchg_status_taskfunc, sc);
2132 
2133 	if (hn_xpnt_vf) {
2134 		/*
2135 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2136 		 */
2137 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2138 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2139 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2140 		    device_get_nameunit(dev));
2141 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2142 		    hn_xpnt_vf_init_taskfunc, sc);
2143 	}
2144 
2145 	/*
2146 	 * Allocate ifnet and setup its name earlier, so that if_printf
2147 	 * can be used by functions that will be called after
2148 	 * ether_ifattach().
2149 	 */
2150 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2151 	ifp->if_softc = sc;
2152 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2153 
2154 	/*
2155 	 * Initialize ifmedia earlier so that it can be unconditionally
2156 	 * destroyed if an error happens later on.
2157 	 */
2158 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2159 
2160 	/*
2161 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2162 	 * to use (tx_ring_cnt).
2163 	 *
2164 	 * NOTE:
2165 	 * The # of RX rings to use is the same as the # of channels to use.
2166 	 */
2167 	ring_cnt = hn_chan_cnt;
2168 	if (ring_cnt <= 0) {
2169 		/* Default */
2170 		ring_cnt = mp_ncpus;
2171 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2172 			ring_cnt = HN_RING_CNT_DEF_MAX;
2173 	} else if (ring_cnt > mp_ncpus) {
2174 		ring_cnt = mp_ncpus;
2175 	}
2176 #ifdef RSS
2177 	if (ring_cnt > rss_getnumbuckets())
2178 		ring_cnt = rss_getnumbuckets();
2179 #endif
2180 
2181 	tx_ring_cnt = hn_tx_ring_cnt;
2182 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2183 		tx_ring_cnt = ring_cnt;
2184 #ifdef HN_IFSTART_SUPPORT
2185 	if (hn_use_if_start) {
2186 		/* ifnet.if_start only needs one TX ring. */
2187 		tx_ring_cnt = 1;
2188 	}
2189 #endif
2190 
2191 	/*
2192 	 * Set the leader CPU for channels.
2193 	 */
2194 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2195 
2196 	/*
2197 	 * Create enough TX/RX rings, even if only a limited number of
2198 	 * channels can be allocated.
2199 	 */
2200 	error = hn_create_tx_data(sc, tx_ring_cnt);
2201 	if (error)
2202 		goto failed;
2203 	error = hn_create_rx_data(sc, ring_cnt);
2204 	if (error)
2205 		goto failed;
2206 
2207 	/*
2208 	 * Create transaction context for NVS and RNDIS transactions.
2209 	 */
2210 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2211 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2212 	if (sc->hn_xact == NULL) {
2213 		error = ENXIO;
2214 		goto failed;
2215 	}
2216 
2217 	/*
2218 	 * Install orphan handler for the revocation of this device's
2219 	 * primary channel.
2220 	 *
2221 	 * NOTE:
2222 	 * The processing order is critical here:
2223 	 * Install the orphan handler, _before_ testing whether this
2224 	 * device's primary channel has been revoked or not.
2225 	 */
2226 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2227 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2228 		error = ENXIO;
2229 		goto failed;
2230 	}
2231 
2232 	/*
2233 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2234 	 */
2235 	error = hn_synth_attach(sc, ETHERMTU);
2236 	if (error)
2237 		goto failed;
2238 
2239 	error = hn_rndis_get_eaddr(sc, eaddr);
2240 	if (error)
2241 		goto failed;
2242 
2243 	error = hn_rndis_get_mtu(sc, &mtu);
2244 	if (error)
2245 		mtu = ETHERMTU;
2246 	else if (bootverbose)
2247 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2248 
2249 #if __FreeBSD_version >= 1100099
2250 	if (sc->hn_rx_ring_inuse > 1) {
2251 		/*
2252 		 * Reduce TCP segment aggregation limit for multiple
2253 		 * RX rings to increase ACK timeliness.
2254 		 */
2255 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2256 	}
2257 #endif
2258 
2259 	/*
2260 	 * Fix up TX/RX settings after the synthetic parts are attached.
2261 	 */
2262 	hn_fixup_tx_data(sc);
2263 	hn_fixup_rx_data(sc);
2264 
2265 	ctx = device_get_sysctl_ctx(dev);
2266 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2267 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2268 	    &sc->hn_nvs_ver, 0, "NVS version");
2269 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2270 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2271 	    hn_ndis_version_sysctl, "A", "NDIS version");
2272 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2273 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2274 	    hn_caps_sysctl, "A", "capabilities");
2275 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2276 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2277 	    hn_hwassist_sysctl, "A", "hwassist");
2278 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2279 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2280 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2281 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2282 	    "max # of TSO segments");
2283 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2284 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2285 	    "max size of TSO segment");
2286 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2287 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2288 	    hn_rxfilter_sysctl, "A", "rxfilter");
2289 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2290 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2291 	    hn_rss_hash_sysctl, "A", "RSS hash");
2292 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2293 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2294 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2295 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2296 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2297 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2298 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2299 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2300 #ifndef RSS
2301 	/*
2302 	 * Don't allow RSS key/indirect table changes if RSS is defined.
2303 	 */
2304 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2305 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2306 	    hn_rss_key_sysctl, "IU", "RSS key");
2307 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2308 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2309 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2310 #endif
2311 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2312 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2313 	    "RNDIS offered packet transmission aggregation size limit");
2314 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2315 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2316 	    "RNDIS offered packet transmission aggregation count limit");
2317 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2318 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2319 	    "RNDIS packet transmission aggregation alignment");
2320 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2321 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2322 	    hn_txagg_size_sysctl, "I",
2323 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2324 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2325 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2326 	    hn_txagg_pkts_sysctl, "I",
2327 	    "Packet transmission aggregation packets, "
2328 	    "0 -- disable, -1 -- auto");
2329 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2330 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 	    hn_polling_sysctl, "I",
2332 	    "Polling frequency: [100,1000000], 0 disable polling");
2333 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2334 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2335 	    hn_vf_sysctl, "A", "Virtual Function's name");
2336 	if (!hn_xpnt_vf) {
2337 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2338 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2339 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2340 	} else {
2341 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2342 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2343 		    hn_xpnt_vf_enabled_sysctl, "I",
2344 		    "Transparent VF enabled");
2345 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2346 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2347 		    hn_xpnt_vf_accbpf_sysctl, "I",
2348 		    "Accurate BPF for transparent VF");
2349 	}
2350 
2351 	/*
2352 	 * Setup the ifmedia, which has been initialized earlier.
2353 	 */
2354 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2355 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2356 	/* XXX ifmedia_set really should do this for us */
2357 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2358 
2359 	/*
2360 	 * Setup the ifnet for this interface.
2361 	 */
2362 
2363 	ifp->if_baudrate = IF_Gbps(10);
2364 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2365 	ifp->if_ioctl = hn_ioctl;
2366 	ifp->if_init = hn_init;
2367 #ifdef HN_IFSTART_SUPPORT
2368 	if (hn_use_if_start) {
2369 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2370 
2371 		ifp->if_start = hn_start;
2372 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2373 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2374 		IFQ_SET_READY(&ifp->if_snd);
2375 	} else
2376 #endif
2377 	{
2378 		ifp->if_transmit = hn_transmit;
2379 		ifp->if_qflush = hn_xmit_qflush;
2380 	}
2381 
2382 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2383 #ifdef foo
2384 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2385 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2386 #endif
2387 	if (sc->hn_caps & HN_CAP_VLAN) {
2388 		/* XXX not sure about VLAN_MTU. */
2389 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2390 	}
2391 
2392 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2393 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2394 		ifp->if_capabilities |= IFCAP_TXCSUM;
2395 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2396 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2397 	if (sc->hn_caps & HN_CAP_TSO4) {
2398 		ifp->if_capabilities |= IFCAP_TSO4;
2399 		ifp->if_hwassist |= CSUM_IP_TSO;
2400 	}
2401 	if (sc->hn_caps & HN_CAP_TSO6) {
2402 		ifp->if_capabilities |= IFCAP_TSO6;
2403 		ifp->if_hwassist |= CSUM_IP6_TSO;
2404 	}
2405 
2406 	/* Enable all available capabilities by default. */
2407 	ifp->if_capenable = ifp->if_capabilities;
2408 
2409 	/*
2410 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2411 	 * be enabled through SIOCSIFCAP.
2412 	 */
2413 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2414 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2415 
2416 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2417 		/*
2418 		 * Lock hn_set_tso_maxsize() to simplify its
2419 		 * internal logic.
2420 		 */
2421 		HN_LOCK(sc);
2422 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2423 		HN_UNLOCK(sc);
2424 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2425 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2426 	}
2427 
2428 	ether_ifattach(ifp, eaddr);
2429 
2430 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2431 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2432 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2433 	}
2434 	if (mtu < ETHERMTU) {
2435 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2436 		ifp->if_mtu = mtu;
2437 	}
2438 
2439 	/* Inform the upper layer about the long frame support. */
2440 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2441 
2442 	/*
2443 	 * Kick off link status check.
2444 	 */
2445 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2446 	hn_update_link_status(sc);
2447 
2448 	if (!hn_xpnt_vf) {
2449 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2450 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2451 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2452 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2453 	} else {
2454 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2455 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2456 	}
2457 
2458 	/*
2459 	 * NOTE:
2460 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2461 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2462 	 * available when the ifnet_arrival event is triggered.
2463 	 */
2464 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2465 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2466 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2467 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2468 
2469 	return (0);
2470 failed:
2471 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2472 		hn_synth_detach(sc);
2473 	hn_detach(dev);
2474 	return (error);
2475 }
2476 
2477 static int
2478 hn_detach(device_t dev)
2479 {
2480 	struct hn_softc *sc = device_get_softc(dev);
2481 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2482 
2483 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2484 		/*
2485 		 * In case the vmbus missed the orphan handler
2486 		 * installation.
2487 		 */
2488 		vmbus_xact_ctx_orphan(sc->hn_xact);
2489 	}
2490 
2491 	if (sc->hn_ifaddr_evthand != NULL)
2492 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2493 	if (sc->hn_ifnet_evthand != NULL)
2494 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2495 	if (sc->hn_ifnet_atthand != NULL) {
2496 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2497 		    sc->hn_ifnet_atthand);
2498 	}
2499 	if (sc->hn_ifnet_dethand != NULL) {
2500 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2501 		    sc->hn_ifnet_dethand);
2502 	}
2503 	if (sc->hn_ifnet_lnkhand != NULL)
2504 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2505 
2506 	vf_ifp = sc->hn_vf_ifp;
2507 	__compiler_membar();
2508 	if (vf_ifp != NULL)
2509 		hn_ifnet_detevent(sc, vf_ifp);
2510 
2511 	if (device_is_attached(dev)) {
2512 		HN_LOCK(sc);
2513 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2514 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2515 				hn_stop(sc, true);
2516 			/*
2517 			 * NOTE:
2518 			 * hn_stop() only suspends the data path, so management
2519 			 * tasks have to be suspended manually here.
2520 			 */
2521 			hn_suspend_mgmt(sc);
2522 			hn_synth_detach(sc);
2523 		}
2524 		HN_UNLOCK(sc);
2525 		ether_ifdetach(ifp);
2526 	}
2527 
2528 	ifmedia_removeall(&sc->hn_media);
2529 	hn_destroy_rx_data(sc);
2530 	hn_destroy_tx_data(sc);
2531 
2532 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2533 		int i;
2534 
2535 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2536 			taskqueue_free(sc->hn_tx_taskqs[i]);
2537 		free(sc->hn_tx_taskqs, M_DEVBUF);
2538 	}
2539 	taskqueue_free(sc->hn_mgmt_taskq0);
2540 	if (sc->hn_vf_taskq != NULL)
2541 		taskqueue_free(sc->hn_vf_taskq);
2542 
2543 	if (sc->hn_xact != NULL) {
2544 		/*
2545 		 * Uninstall the orphan handler _before_ the xact is
2546 		 * destroyed.
2547 		 */
2548 		vmbus_chan_unset_orphan(sc->hn_prichan);
2549 		vmbus_xact_ctx_destroy(sc->hn_xact);
2550 	}
2551 
2552 	if_free(ifp);
2553 
2554 	HN_LOCK_DESTROY(sc);
2555 	rm_destroy(&sc->hn_vf_lock);
2556 	return (0);
2557 }
2558 
2559 static int
2560 hn_shutdown(device_t dev)
2561 {
2562 
2563 	return (0);
2564 }
2565 
2566 static void
2567 hn_link_status(struct hn_softc *sc)
2568 {
2569 	uint32_t link_status;
2570 	int error;
2571 
2572 	error = hn_rndis_get_linkstatus(sc, &link_status);
2573 	if (error) {
2574 		/* XXX what to do? */
2575 		return;
2576 	}
2577 
2578 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2579 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2580 	else
2581 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2582 	if_link_state_change(sc->hn_ifp,
2583 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2584 	    LINK_STATE_UP : LINK_STATE_DOWN);
2585 }
2586 
2587 static void
2588 hn_link_taskfunc(void *xsc, int pending __unused)
2589 {
2590 	struct hn_softc *sc = xsc;
2591 
2592 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2593 		return;
2594 	hn_link_status(sc);
2595 }
2596 
2597 static void
2598 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2599 {
2600 	struct hn_softc *sc = xsc;
2601 
2602 	/* Prevent any link status checks from running. */
2603 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2604 
2605 	/*
2606 	 * Fake up a [link down --> link up] state change; a 5 second
2607 	 * delay is used, which closely simulates the miibus reaction
2608 	 * upon a link down event.
2609 	 */
2610 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2611 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2612 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2613 	    &sc->hn_netchg_status, 5 * hz);
2614 }
2615 
2616 static void
2617 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2618 {
2619 	struct hn_softc *sc = xsc;
2620 
2621 	/* Re-allow link status checks. */
2622 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2623 	hn_link_status(sc);
2624 }
2625 
2626 static void
2627 hn_update_link_status(struct hn_softc *sc)
2628 {
2629 
2630 	if (sc->hn_mgmt_taskq != NULL)
2631 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2632 }
2633 
2634 static void
2635 hn_change_network(struct hn_softc *sc)
2636 {
2637 
2638 	if (sc->hn_mgmt_taskq != NULL)
2639 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2640 }
2641 
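/*
 * Load the mbuf chain into the txdesc's DMA map for scatter/gather
 * transmission.  On EFBIG the chain is collapsed to at most
 * HN_TX_DATA_SEGCNT_MAX segments and the load is retried once.
 */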
2642 static __inline int
2643 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2644     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2645 {
2646 	struct mbuf *m = *m_head;
2647 	int error;
2648 
2649 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2650 
2651 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2652 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2653 	if (error == EFBIG) {
2654 		struct mbuf *m_new;
2655 
2656 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2657 		if (m_new == NULL)
2658 			return ENOBUFS;
2659 		else
2660 			*m_head = m = m_new;
2661 		txr->hn_tx_collapsed++;
2662 
2663 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2664 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2665 	}
2666 	if (!error) {
2667 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2668 		    BUS_DMASYNC_PREWRITE);
2669 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2670 	}
2671 	return error;
2672 }
2673 
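/*
 * Drop a reference on the txdesc.  When the last reference goes away,
 * free any aggregated txdescs, release the chimney sending buffer or
 * unload the DMA map, free the associated mbuf, and return the txdesc
 * to the TX ring's free list (or buf_ring).  Returns 1 if the txdesc
 * was freed, 0 otherwise.
 */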
2674 static __inline int
2675 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2676 {
2677 
2678 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2679 	    ("put an onlist txd %#x", txd->flags));
2680 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2681 	    ("put an onagg txd %#x", txd->flags));
2682 
2683 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2684 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2685 		return 0;
2686 
2687 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2688 		struct hn_txdesc *tmp_txd;
2689 
2690 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2691 			int freed;
2692 
2693 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2694 			    ("recursive aggregation on aggregated txdesc"));
2695 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2696 			    ("not aggregated txdesc"));
2697 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2698 			    ("aggregated txdesc uses dmamap"));
2699 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2700 			    ("aggregated txdesc consumes "
2701 			     "chimney sending buffer"));
2702 			KASSERT(tmp_txd->chim_size == 0,
2703 			    ("aggregated txdesc has non-zero "
2704 			     "chimney sending size"));
2705 
2706 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2707 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2708 			freed = hn_txdesc_put(txr, tmp_txd);
2709 			KASSERT(freed, ("failed to free aggregated txdesc"));
2710 		}
2711 	}
2712 
2713 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2714 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2715 		    ("chim txd uses dmamap"));
2716 		hn_chim_free(txr->hn_sc, txd->chim_index);
2717 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2718 		txd->chim_size = 0;
2719 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2720 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2721 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2722 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2723 		    txd->data_dmap);
2724 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2725 	}
2726 
2727 	if (txd->m != NULL) {
2728 		m_freem(txd->m);
2729 		txd->m = NULL;
2730 	}
2731 
2732 	txd->flags |= HN_TXD_FLAG_ONLIST;
2733 #ifndef HN_USE_TXDESC_BUFRING
2734 	mtx_lock_spin(&txr->hn_txlist_spin);
2735 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2736 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2737 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2738 	txr->hn_txdesc_avail++;
2739 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2740 	mtx_unlock_spin(&txr->hn_txlist_spin);
2741 #else	/* HN_USE_TXDESC_BUFRING */
2742 #ifdef HN_DEBUG
2743 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2744 #endif
2745 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2746 #endif	/* !HN_USE_TXDESC_BUFRING */
2747 
2748 	return 1;
2749 }
2750 
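/*
 * Allocate a txdesc from the TX ring's free list (or buf_ring);
 * returns NULL if none is available.  The returned txdesc holds a
 * single reference.
 */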
2751 static __inline struct hn_txdesc *
2752 hn_txdesc_get(struct hn_tx_ring *txr)
2753 {
2754 	struct hn_txdesc *txd;
2755 
2756 #ifndef HN_USE_TXDESC_BUFRING
2757 	mtx_lock_spin(&txr->hn_txlist_spin);
2758 	txd = SLIST_FIRST(&txr->hn_txlist);
2759 	if (txd != NULL) {
2760 		KASSERT(txr->hn_txdesc_avail > 0,
2761 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2762 		txr->hn_txdesc_avail--;
2763 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2764 	}
2765 	mtx_unlock_spin(&txr->hn_txlist_spin);
2766 #else
2767 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2768 #endif
2769 
2770 	if (txd != NULL) {
2771 #ifdef HN_USE_TXDESC_BUFRING
2772 #ifdef HN_DEBUG
2773 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2774 #endif
2775 #endif	/* HN_USE_TXDESC_BUFRING */
2776 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2777 		    STAILQ_EMPTY(&txd->agg_list) &&
2778 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2779 		    txd->chim_size == 0 &&
2780 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2781 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2782 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2783 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2784 		txd->refs = 1;
2785 	}
2786 	return txd;
2787 }
2788 
2789 static __inline void
2790 hn_txdesc_hold(struct hn_txdesc *txd)
2791 {
2792 
2793 	/* 0->1 transition will never work */
2794 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2795 	atomic_add_int(&txd->refs, 1);
2796 }
2797 
2798 static __inline void
2799 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2800 {
2801 
2802 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2803 	    ("recursive aggregation on aggregating txdesc"));
2804 
2805 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2806 	    ("already aggregated"));
2807 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2808 	    ("recursive aggregation on to-be-aggregated txdesc"));
2809 
2810 	txd->flags |= HN_TXD_FLAG_ONAGG;
2811 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2812 }
2813 
2814 static bool
2815 hn_tx_ring_pending(struct hn_tx_ring *txr)
2816 {
2817 	bool pending = false;
2818 
2819 #ifndef HN_USE_TXDESC_BUFRING
2820 	mtx_lock_spin(&txr->hn_txlist_spin);
2821 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2822 		pending = true;
2823 	mtx_unlock_spin(&txr->hn_txlist_spin);
2824 #else
2825 	if (!buf_ring_full(txr->hn_txdesc_br))
2826 		pending = true;
2827 #endif
2828 	return (pending);
2829 }
2830 
2831 static __inline void
2832 hn_txeof(struct hn_tx_ring *txr)
2833 {
2834 	txr->hn_has_txeof = 0;
2835 	txr->hn_txeof(txr);
2836 }
2837 
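/*
 * NVS send-completion callback: release the txdesc and, after
 * HN_EARLY_TXEOF_THRESH completions on an oactive TX ring, kick the
 * TX completion processing early.
 */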
2838 static void
2839 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2840     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2841 {
2842 	struct hn_txdesc *txd = sndc->hn_cbarg;
2843 	struct hn_tx_ring *txr;
2844 
2845 	txr = txd->txr;
2846 	KASSERT(txr->hn_chan == chan,
2847 	    ("channel mismatch, on chan%u, should be chan%u",
2848 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2849 
2850 	txr->hn_has_txeof = 1;
2851 	hn_txdesc_put(txr, txd);
2852 
2853 	++txr->hn_txdone_cnt;
2854 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2855 		txr->hn_txdone_cnt = 0;
2856 		if (txr->hn_oactive)
2857 			hn_txeof(txr);
2858 	}
2859 }
2860 
2861 static void
2862 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2863 {
2864 #if defined(INET) || defined(INET6)
2865 	tcp_lro_flush_all(&rxr->hn_lro);
2866 #endif
2867 
2868 	/*
2869 	 * NOTE:
2870 	 * 'txr' could be NULL, if multiple channels and
2871 	 * ifnet.if_start method are enabled.
2872 	 */
2873 	if (txr == NULL || !txr->hn_has_txeof)
2874 		return;
2875 
2876 	txr->hn_txdone_cnt = 0;
2877 	hn_txeof(txr);
2878 }
2879 
2880 static __inline uint32_t
2881 hn_rndis_pktmsg_offset(uint32_t ofs)
2882 {
2883 
2884 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2885 	    ("invalid RNDIS packet msg offset %u", ofs));
2886 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2887 }
2888 
2889 static __inline void *
2890 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2891     size_t pi_dlen, uint32_t pi_type)
2892 {
2893 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2894 	struct rndis_pktinfo *pi;
2895 
2896 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2897 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2898 
2899 	/*
2900 	 * Per-packet-info does not move; it only grows.
2901 	 *
2902 	 * NOTE:
2903 	 * rm_pktinfooffset in this phase counts from the beginning
2904 	 * of rndis_packet_msg.
2905 	 */
2906 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2907 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2908 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2909 	    pkt->rm_pktinfolen);
2910 	pkt->rm_pktinfolen += pi_size;
2911 
2912 	pi->rm_size = pi_size;
2913 	pi->rm_type = pi_type;
2914 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2915 
2916 	return (pi->rm_data);
2917 }
2918 
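/*
 * Send the currently aggregating txdesc and reset the TX ring's
 * aggregation state.  On failure the aggregated mbuf is freed and
 * if_oerrors is charged with the number of aggregated packets.
 */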
2919 static __inline int
2920 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2921 {
2922 	struct hn_txdesc *txd;
2923 	struct mbuf *m;
2924 	int error, pkts;
2925 
2926 	txd = txr->hn_agg_txd;
2927 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2928 
2929 	/*
2930 	 * Since hn_txpkt() will reset this temporary stat, save
2931 	 * it now, so that oerrors can be updated properly, if
2932 	 * hn_txpkt() ever fails.
2933 	 */
2934 	pkts = txr->hn_stat_pkts;
2935 
2936 	/*
2937 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2938 	 * failure, save it for later freeing, if hn_txpkt() ever
2939 	 * fails.
2940 	 */
2941 	m = txd->m;
2942 	error = hn_txpkt(ifp, txr, txd);
2943 	if (__predict_false(error)) {
2944 		/* txd is freed, but m is not. */
2945 		m_freem(m);
2946 
2947 		txr->hn_flush_failed++;
2948 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2949 	}
2950 
2951 	/* Reset all aggregation states. */
2952 	txr->hn_agg_txd = NULL;
2953 	txr->hn_agg_szleft = 0;
2954 	txr->hn_agg_pktleft = 0;
2955 	txr->hn_agg_prevpkt = NULL;
2956 
2957 	return (error);
2958 }
2959 
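/*
 * Try to obtain chimney sending buffer space for a packet of 'pktsize'
 * bytes: append to the currently aggregating txdesc if it still has
 * room, otherwise flush it and allocate a new chimney buffer, possibly
 * starting a new aggregation.  Returns a pointer into the chimney
 * buffer, or NULL if no chimney buffer could be allocated.
 */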
2960 static void *
2961 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2962     int pktsize)
2963 {
2964 	void *chim;
2965 
2966 	if (txr->hn_agg_txd != NULL) {
2967 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2968 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2969 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2970 			int olen;
2971 
2972 			/*
2973 			 * Update the previous RNDIS packet's total length;
2974 			 * it can be increased due to the mandatory alignment
2975 			 * padding for this RNDIS packet.  And update the
2976 			 * aggregating txdesc's chimney sending buffer size
2977 			 * accordingly.
2978 			 *
2979 			 * XXX
2980 			 * Zero-out the padding, as required by the RNDIS spec.
2981 			 */
2982 			olen = pkt->rm_len;
2983 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2984 			agg_txd->chim_size += pkt->rm_len - olen;
2985 
2986 			/* Link this txdesc to the parent. */
2987 			hn_txdesc_agg(agg_txd, txd);
2988 
2989 			chim = (uint8_t *)pkt + pkt->rm_len;
2990 			/* Save the current packet for later fixup. */
2991 			txr->hn_agg_prevpkt = chim;
2992 
2993 			txr->hn_agg_pktleft--;
2994 			txr->hn_agg_szleft -= pktsize;
2995 			if (txr->hn_agg_szleft <=
2996 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2997 				/*
2998 				 * Probably can't aggregate more packets,
2999 				 * flush this aggregating txdesc proactively.
3000 				 */
3001 				txr->hn_agg_pktleft = 0;
3002 			}
3003 			/* Done! */
3004 			return (chim);
3005 		}
3006 		hn_flush_txagg(ifp, txr);
3007 	}
3008 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3009 
3010 	txr->hn_tx_chimney_tried++;
3011 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3012 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3013 		return (NULL);
3014 	txr->hn_tx_chimney++;
3015 
3016 	chim = txr->hn_sc->hn_chim +
3017 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3018 
3019 	if (txr->hn_agg_pktmax > 1 &&
3020 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3021 		txr->hn_agg_txd = txd;
3022 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3023 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3024 		txr->hn_agg_prevpkt = chim;
3025 	}
3026 	return (chim);
3027 }
3028 
3029 /*
3030  * NOTE:
3031  * If this function fails, then both txd and m_head0 will be freed.
3032  */
3033 static int
3034 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3035     struct mbuf **m_head0)
3036 {
3037 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3038 	int error, nsegs, i;
3039 	struct mbuf *m_head = *m_head0;
3040 	struct rndis_packet_msg *pkt;
3041 	uint32_t *pi_data;
3042 	void *chim = NULL;
3043 	int pkt_hlen, pkt_size;
3044 
3045 	pkt = txd->rndis_pkt;
3046 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3047 	if (pkt_size < txr->hn_chim_size) {
3048 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3049 		if (chim != NULL)
3050 			pkt = chim;
3051 	} else {
3052 		if (txr->hn_agg_txd != NULL)
3053 			hn_flush_txagg(ifp, txr);
3054 	}
3055 
3056 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3057 	pkt->rm_len = m_head->m_pkthdr.len;
3058 	pkt->rm_dataoffset = 0;
3059 	pkt->rm_datalen = m_head->m_pkthdr.len;
3060 	pkt->rm_oobdataoffset = 0;
3061 	pkt->rm_oobdatalen = 0;
3062 	pkt->rm_oobdataelements = 0;
3063 	pkt->rm_pktinfooffset = sizeof(*pkt);
3064 	pkt->rm_pktinfolen = 0;
3065 	pkt->rm_vchandle = 0;
3066 	pkt->rm_reserved = 0;
3067 
3068 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3069 		/*
3070 		 * Set the hash value for this packet, so that the host could
3071 		 * dispatch the TX done event for this packet back to this TX
3072 		 * ring's channel.
3073 		 */
3074 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3075 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3076 		*pi_data = txr->hn_tx_idx;
3077 	}
3078 
3079 	if (m_head->m_flags & M_VLANTAG) {
3080 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3081 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3082 		*pi_data = NDIS_VLAN_INFO_MAKE(
3083 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3084 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3085 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3086 	}
3087 
3088 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3089 #if defined(INET6) || defined(INET)
3090 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3091 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3092 #ifdef INET
3093 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3094 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3095 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3096 			    m_head->m_pkthdr.tso_segsz);
3097 		}
3098 #endif
3099 #if defined(INET6) && defined(INET)
3100 		else
3101 #endif
3102 #ifdef INET6
3103 		{
3104 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3105 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3106 			    m_head->m_pkthdr.tso_segsz);
3107 		}
3108 #endif
3109 #endif	/* INET6 || INET */
3110 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3111 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3112 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3113 		if (m_head->m_pkthdr.csum_flags &
3114 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3115 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3116 		} else {
3117 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3118 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3119 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3120 		}
3121 
3122 		if (m_head->m_pkthdr.csum_flags &
3123 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3124 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3125 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3126 		} else if (m_head->m_pkthdr.csum_flags &
3127 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3128 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3129 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3130 		}
3131 	}
3132 
3133 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3134 	/* Fixup RNDIS packet message total length */
3135 	pkt->rm_len += pkt_hlen;
3136 	/* Convert RNDIS packet message offsets */
3137 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3138 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3139 
3140 	/*
3141 	 * Fast path: Chimney sending.
3142 	 */
3143 	if (chim != NULL) {
3144 		struct hn_txdesc *tgt_txd = txd;
3145 
3146 		if (txr->hn_agg_txd != NULL) {
3147 			tgt_txd = txr->hn_agg_txd;
3148 #ifdef INVARIANTS
3149 			*m_head0 = NULL;
3150 #endif
3151 		}
3152 
3153 		KASSERT(pkt == chim,
3154 		    ("RNDIS pkt not in chimney sending buffer"));
3155 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3156 		    ("chimney sending buffer is not used"));
3157 		tgt_txd->chim_size += pkt->rm_len;
3158 
3159 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3160 		    ((uint8_t *)chim) + pkt_hlen);
3161 
3162 		txr->hn_gpa_cnt = 0;
3163 		txr->hn_sendpkt = hn_txpkt_chim;
3164 		goto done;
3165 	}
3166 
3167 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3168 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3169 	    ("chimney buffer is used"));
3170 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3171 
3172 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3173 	if (__predict_false(error)) {
3174 		int freed;
3175 
3176 		/*
3177 		 * This mbuf is not linked w/ the txd yet, so free it now.
3178 		 */
3179 		m_freem(m_head);
3180 		*m_head0 = NULL;
3181 
3182 		freed = hn_txdesc_put(txr, txd);
3183 		KASSERT(freed != 0,
3184 		    ("fail to free txd upon txdma error"));
3185 
3186 		txr->hn_txdma_failed++;
3187 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3188 		return error;
3189 	}
3190 	*m_head0 = m_head;
3191 
3192 	/* +1 RNDIS packet message */
3193 	txr->hn_gpa_cnt = nsegs + 1;
3194 
3195 	/* send packet with page buffer */
3196 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3197 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3198 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3199 
3200 	/*
3201 	 * Fill the page buffers with mbuf info after the page
3202 	 * buffer for RNDIS packet message.
3203 	 */
3204 	for (i = 0; i < nsegs; ++i) {
3205 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3206 
3207 		gpa->gpa_page = atop(segs[i].ds_addr);
3208 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3209 		gpa->gpa_len = segs[i].ds_len;
3210 	}
3211 
3212 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3213 	txd->chim_size = 0;
3214 	txr->hn_sendpkt = hn_txpkt_sglist;
3215 done:
3216 	txd->m = m_head;
3217 
3218 	/* Set the completion routine */
3219 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3220 
3221 	/* Update temporary stats for later use. */
3222 	txr->hn_stat_pkts++;
3223 	txr->hn_stat_size += m_head->m_pkthdr.len;
3224 	if (m_head->m_flags & M_MCAST)
3225 		txr->hn_stat_mcasts++;
3226 
3227 	return 0;
3228 }
3229 
3230 /*
3231  * NOTE:
3232  * If this function fails, then txd will be freed, but the mbuf
3233  * associated w/ the txd will _not_ be freed.
3234  */
3235 static int
3236 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3237 {
3238 	int error, send_failed = 0, has_bpf;
3239 
3240 again:
3241 	has_bpf = bpf_peers_present(ifp->if_bpf);
3242 	if (has_bpf) {
3243 		/*
3244 		 * Make sure that this txd and any aggregated txds are not
3245 		 * freed before ETHER_BPF_MTAP.
3246 		 */
3247 		hn_txdesc_hold(txd);
3248 	}
3249 	error = txr->hn_sendpkt(txr, txd);
3250 	if (!error) {
3251 		if (has_bpf) {
3252 			const struct hn_txdesc *tmp_txd;
3253 
3254 			ETHER_BPF_MTAP(ifp, txd->m);
3255 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3256 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3257 		}
3258 
3259 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3260 #ifdef HN_IFSTART_SUPPORT
3261 		if (!hn_use_if_start)
3262 #endif
3263 		{
3264 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3265 			    txr->hn_stat_size);
3266 			if (txr->hn_stat_mcasts != 0) {
3267 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3268 				    txr->hn_stat_mcasts);
3269 			}
3270 		}
3271 		txr->hn_pkts += txr->hn_stat_pkts;
3272 		txr->hn_sends++;
3273 	}
3274 	if (has_bpf)
3275 		hn_txdesc_put(txr, txd);
3276 
3277 	if (__predict_false(error)) {
3278 		int freed;
3279 
3280 		/*
3281 		 * This should "really rarely" happen.
3282 		 *
3283 		 * XXX Too many RX to be acked or too many sideband
3284 		 * commands to run?  Ask netvsc_channel_rollup()
3285 		 * to kick start later.
3286 		 */
3287 		txr->hn_has_txeof = 1;
3288 		if (!send_failed) {
3289 			txr->hn_send_failed++;
3290 			send_failed = 1;
3291 			/*
3292 			 * Try sending again after set hn_has_txeof;
3293 			 * in case that we missed the last
3294 			 * netvsc_channel_rollup().
3295 			 */
3296 			goto again;
3297 		}
3298 		if_printf(ifp, "send failed\n");
3299 
3300 		/*
3301 		 * Caller will perform further processing on the
3302 		 * associated mbuf, so don't free it in hn_txdesc_put();
3303 		 * only unload it from the DMA map in hn_txdesc_put(),
3304 		 * if it was loaded.
3305 		 */
3306 		txd->m = NULL;
3307 		freed = hn_txdesc_put(txr, txd);
3308 		KASSERT(freed != 0,
3309 		    ("fail to free txd upon send error"));
3310 
3311 		txr->hn_send_failed++;
3312 	}
3313 
3314 	/* Reset temporary stats, after this sending is done. */
3315 	txr->hn_stat_size = 0;
3316 	txr->hn_stat_pkts = 0;
3317 	txr->hn_stat_mcasts = 0;
3318 
3319 	return (error);
3320 }
3321 
3322 /*
3323  * Append the specified data to the indicated mbuf chain.
3324  * Extend the mbuf chain if the new data does not fit in
3325  * existing space.
3326  *
3327  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3328  * There should be an equivalent in the kernel mbuf code,
3329  * but there does not appear to be one yet.
3330  *
3331  * Differs from m_append() in that additional mbufs are
3332  * allocated with cluster size MJUMPAGESIZE, and filled
3333  * accordingly.
3334  *
3335  * Return 1 if able to complete the job; otherwise 0.
3336  */
3337 static int
3338 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3339 {
3340 	struct mbuf *m, *n;
3341 	int remainder, space;
3342 
3343 	for (m = m0; m->m_next != NULL; m = m->m_next)
3344 		;
3345 	remainder = len;
3346 	space = M_TRAILINGSPACE(m);
3347 	if (space > 0) {
3348 		/*
3349 		 * Copy into available space.
3350 		 */
3351 		if (space > remainder)
3352 			space = remainder;
3353 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3354 		m->m_len += space;
3355 		cp += space;
3356 		remainder -= space;
3357 	}
3358 	while (remainder > 0) {
3359 		/*
3360 		 * Allocate a new mbuf; could check space
3361 		 * and allocate a cluster instead.
3362 		 */
3363 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3364 		if (n == NULL)
3365 			break;
3366 		n->m_len = min(MJUMPAGESIZE, remainder);
3367 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3368 		cp += n->m_len;
3369 		remainder -= n->m_len;
3370 		m->m_next = n;
3371 		m = n;
3372 	}
3373 	if (m0->m_flags & M_PKTHDR)
3374 		m0->m_pkthdr.len += len - remainder;
3375 
3376 	return (remainder == 0);
3377 }
3378 
3379 #if defined(INET) || defined(INET6)
3380 static __inline int
3381 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3382 {
3383 #if __FreeBSD_version >= 1100095
3384 	if (hn_lro_mbufq_depth) {
3385 		tcp_lro_queue_mbuf(lc, m);
3386 		return 0;
3387 	}
3388 #endif
3389 	return tcp_lro_rx(lc, m, 0);
3390 }
3391 #endif
3392 
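/*
 * RX path: copy the received RNDIS data into an mbuf (small packets go
 * into a plain mbuf, larger ones into a 2K or 4K cluster), apply the
 * host supplied checksum, VLAN and RSS hash information, and hand the
 * packet up, either through hn(4) itself or through the active VF
 * interface.
 */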
3393 static int
3394 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3395     const struct hn_rxinfo *info)
3396 {
3397 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3398 	struct mbuf *m_new;
3399 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3400 	int hash_type = M_HASHTYPE_NONE;
3401 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3402 
3403 	ifp = hn_ifp;
3404 	if (rxr->hn_rxvf_ifp != NULL) {
3405 		/*
3406 		 * Non-transparent mode VF; pretend this packet is from
3407 		 * the VF.
3408 		 */
3409 		ifp = rxr->hn_rxvf_ifp;
3410 		is_vf = 1;
3411 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3412 		/* Transparent mode VF. */
3413 		is_vf = 1;
3414 	}
3415 
3416 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3417 		/*
3418 		 * NOTE:
3419 		 * See the NOTE of hn_rndis_init_fixat().  This
3419 		 * function can be reached immediately after the
3420 		 * RNDIS is initialized but before the ifnet is
3421 		 * set up on the hn_attach() path; drop the unexpected
3423 		 * packets.
3424 		 */
3425 		return (0);
3426 	}
3427 
3428 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3429 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3430 		return (0);
3431 	}
3432 
3433 	if (dlen <= MHLEN) {
3434 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3435 		if (m_new == NULL) {
3436 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3437 			return (0);
3438 		}
3439 		memcpy(mtod(m_new, void *), data, dlen);
3440 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3441 		rxr->hn_small_pkts++;
3442 	} else {
3443 		/*
3444 		 * Get an mbuf with a cluster.  For packets 2K or less,
3445 		 * get a standard 2K cluster.  For anything larger, get a
3446 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3447 		 * if looped around to the Hyper-V TX channel, so avoid them.
3448 		 */
3449 		size = MCLBYTES;
3450 		if (dlen > MCLBYTES) {
3451 			/* 4096 */
3452 			size = MJUMPAGESIZE;
3453 		}
3454 
3455 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3456 		if (m_new == NULL) {
3457 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3458 			return (0);
3459 		}
3460 
3461 		hv_m_append(m_new, dlen, data);
3462 	}
3463 	m_new->m_pkthdr.rcvif = ifp;
3464 
3465 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3466 		do_csum = 0;
3467 
3468 	/* receive side checksum offload */
3469 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3470 		/* IP csum offload */
3471 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3472 			m_new->m_pkthdr.csum_flags |=
3473 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3474 			rxr->hn_csum_ip++;
3475 		}
3476 
3477 		/* TCP/UDP csum offload */
3478 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3479 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3480 			m_new->m_pkthdr.csum_flags |=
3481 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3482 			m_new->m_pkthdr.csum_data = 0xffff;
3483 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3484 				rxr->hn_csum_tcp++;
3485 			else
3486 				rxr->hn_csum_udp++;
3487 		}
3488 
3489 		/*
3490 		 * XXX
3491 		 * As of this writing (Oct 28th, 2016), the host side turns
3492 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3493 		 * the do_lro setting here is actually _not_ accurate.  We
3494 		 * depend on the RSS hash type check to reset do_lro.
3495 		 */
3496 		if ((info->csum_info &
3497 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3498 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3499 			do_lro = 1;
3500 	} else {
3501 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3502 		if (l3proto == ETHERTYPE_IP) {
3503 			if (l4proto == IPPROTO_TCP) {
3504 				if (do_csum &&
3505 				    (rxr->hn_trust_hcsum &
3506 				     HN_TRUST_HCSUM_TCP)) {
3507 					rxr->hn_csum_trusted++;
3508 					m_new->m_pkthdr.csum_flags |=
3509 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3510 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3511 					m_new->m_pkthdr.csum_data = 0xffff;
3512 				}
3513 				do_lro = 1;
3514 			} else if (l4proto == IPPROTO_UDP) {
3515 				if (do_csum &&
3516 				    (rxr->hn_trust_hcsum &
3517 				     HN_TRUST_HCSUM_UDP)) {
3518 					rxr->hn_csum_trusted++;
3519 					m_new->m_pkthdr.csum_flags |=
3520 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3521 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3522 					m_new->m_pkthdr.csum_data = 0xffff;
3523 				}
3524 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3525 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3526 				rxr->hn_csum_trusted++;
3527 				m_new->m_pkthdr.csum_flags |=
3528 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3529 			}
3530 		}
3531 	}
3532 
3533 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3534 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3535 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3536 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3537 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3538 		m_new->m_flags |= M_VLANTAG;
3539 	}
3540 
3541 	/*
3542 	 * If VF is activated (transparent/non-transparent mode does not
3543 	 * matter here).
3544 	 *
3545 	 * - Disable LRO
3546 	 *
3547 	 *   hn(4) will only receive broadcast packets, multicast packets,
3548 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3549 	 *   packet types.
3550 	 *
3551 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3552 	 *   all, since the LRO flush will use hn(4) as the receiving
3553 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3554 	 */
3555 	if (is_vf)
3556 		do_lro = 0;
3557 
3558 	/*
3559 	 * If VF is activated (transparent/non-transparent mode does not
3560 	 * matter here), do _not_ mess with unsupported hash types or
3561 	 * functions.
3562 	 */
3563 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3564 		rxr->hn_rss_pkts++;
3565 		m_new->m_pkthdr.flowid = info->hash_value;
3566 		if (!is_vf)
3567 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3568 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3569 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3570 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3571 			    rxr->hn_mbuf_hash);
3572 
3573 			/*
3574 			 * NOTE:
3575 			 * do_lro is reset if the hash types are not TCP
3576 			 * related.  See the comment in the above csum_flags
3577 			 * setup section.
3578 			 */
3579 			switch (type) {
3580 			case NDIS_HASH_IPV4:
3581 				hash_type = M_HASHTYPE_RSS_IPV4;
3582 				do_lro = 0;
3583 				break;
3584 
3585 			case NDIS_HASH_TCP_IPV4:
3586 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3587 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3588 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3589 
3590 					if (is_vf)
3591 						def_htype = M_HASHTYPE_NONE;
3592 
3593 					/*
3594 					 * UDP 4-tuple hash is delivered as
3595 					 * TCP 4-tuple hash.
3596 					 */
3597 					if (l3proto == ETHERTYPE_MAX) {
3598 						hn_rxpkt_proto(m_new,
3599 						    &l3proto, &l4proto);
3600 					}
3601 					if (l3proto == ETHERTYPE_IP) {
3602 						if (l4proto == IPPROTO_UDP &&
3603 						    (rxr->hn_mbuf_hash &
3604 						     NDIS_HASH_UDP_IPV4_X)) {
3605 							hash_type =
3606 							M_HASHTYPE_RSS_UDP_IPV4;
3607 							do_lro = 0;
3608 						} else if (l4proto !=
3609 						    IPPROTO_TCP) {
3610 							hash_type = def_htype;
3611 							do_lro = 0;
3612 						}
3613 					} else {
3614 						hash_type = def_htype;
3615 						do_lro = 0;
3616 					}
3617 				}
3618 				break;
3619 
3620 			case NDIS_HASH_IPV6:
3621 				hash_type = M_HASHTYPE_RSS_IPV6;
3622 				do_lro = 0;
3623 				break;
3624 
3625 			case NDIS_HASH_IPV6_EX:
3626 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3627 				do_lro = 0;
3628 				break;
3629 
3630 			case NDIS_HASH_TCP_IPV6:
3631 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3632 				break;
3633 
3634 			case NDIS_HASH_TCP_IPV6_EX:
3635 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3636 				break;
3637 			}
3638 		}
3639 	} else if (!is_vf) {
3640 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3641 		hash_type = M_HASHTYPE_OPAQUE;
3642 	}
3643 	M_HASHTYPE_SET(m_new, hash_type);
3644 
3645 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3646 	if (hn_ifp != ifp) {
3647 		const struct ether_header *eh;
3648 
3649 		/*
3650 		 * Non-transparent mode VF is activated.
3651 		 */
3652 
3653 		/*
3654 		 * Allow tapping on hn(4).
3655 		 */
3656 		ETHER_BPF_MTAP(hn_ifp, m_new);
3657 
3658 		/*
3659 		 * Update hn(4)'s stats.
3660 		 */
3661 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3662 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3663 		/* Checked at the beginning of this function. */
3664 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3665 		eh = mtod(m_new, struct ether_header *);
3666 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3667 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3668 	}
3669 	rxr->hn_pkts++;
3670 
3671 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3672 #if defined(INET) || defined(INET6)
3673 		struct lro_ctrl *lro = &rxr->hn_lro;
3674 
3675 		if (lro->lro_cnt) {
3676 			rxr->hn_lro_tried++;
3677 			if (hn_lro_rx(lro, m_new) == 0) {
3678 				/* DONE! */
3679 				return 0;
3680 			}
3681 		}
3682 #endif
3683 	}
3684 	ifp->if_input(ifp, m_new);
3685 
3686 	return (0);
3687 }
3688 
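/*
 * Ioctl handler for hn(4): handles MTU, interface flags, capabilities,
 * multicast filter, media and RSS key/hash requests, forwarding the
 * request to the transparent-mode VF when one is ready.
 */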
3689 static int
3690 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3691 {
3692 	struct hn_softc *sc = ifp->if_softc;
3693 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3694 	struct ifnet *vf_ifp;
3695 	int mask, error = 0;
3696 	struct ifrsskey *ifrk;
3697 	struct ifrsshash *ifrh;
3698 	uint32_t mtu;
3699 
3700 	switch (cmd) {
3701 	case SIOCSIFMTU:
3702 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3703 			error = EINVAL;
3704 			break;
3705 		}
3706 
3707 		HN_LOCK(sc);
3708 
3709 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3710 			HN_UNLOCK(sc);
3711 			break;
3712 		}
3713 
3714 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3715 			/* Can't change MTU */
3716 			HN_UNLOCK(sc);
3717 			error = EOPNOTSUPP;
3718 			break;
3719 		}
3720 
3721 		if (ifp->if_mtu == ifr->ifr_mtu) {
3722 			HN_UNLOCK(sc);
3723 			break;
3724 		}
3725 
3726 		if (hn_xpnt_vf_isready(sc)) {
3727 			vf_ifp = sc->hn_vf_ifp;
3728 			ifr_vf = *ifr;
3729 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3730 			    sizeof(ifr_vf.ifr_name));
3731 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3732 			    (caddr_t)&ifr_vf);
3733 			if (error) {
3734 				HN_UNLOCK(sc);
3735 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3736 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3737 				break;
3738 			}
3739 		}
3740 
3741 		/*
3742 		 * Suspend this interface before the synthetic parts
3743 		 * are ripped out.
3744 		 */
3745 		hn_suspend(sc);
3746 
3747 		/*
3748 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3749 		 */
3750 		hn_synth_detach(sc);
3751 
3752 		/*
3753 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3754 		 * with the new MTU setting.
3755 		 */
3756 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3757 		if (error) {
3758 			HN_UNLOCK(sc);
3759 			break;
3760 		}
3761 
3762 		error = hn_rndis_get_mtu(sc, &mtu);
3763 		if (error)
3764 			mtu = ifr->ifr_mtu;
3765 		else if (bootverbose)
3766 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3767 
3768 		/*
3769 		 * Commit the requested MTU, after the synthetic parts
3770 		 * have been successfully attached.
3771 		 */
3772 		if (mtu >= ifr->ifr_mtu) {
3773 			mtu = ifr->ifr_mtu;
3774 		} else {
3775 			if_printf(ifp, "fixup mtu %d -> %u\n",
3776 			    ifr->ifr_mtu, mtu);
3777 		}
3778 		ifp->if_mtu = mtu;
3779 
3780 		/*
3781 		 * Synthetic parts' reattach may change the chimney
3782 		 * sending size; update it.
3783 		 */
3784 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3785 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3786 
3787 		/*
3788 		 * Make sure that various parameters based on MTU are
3789 		 * still valid, after the MTU change.
3790 		 */
3791 		hn_mtu_change_fixup(sc);
3792 
3793 		/*
3794 		 * All done!  Resume the interface now.
3795 		 */
3796 		hn_resume(sc);
3797 
3798 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3799 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3800 			/*
3801 			 * change the datapath to VF again, in case
3802 			 * it was lost when the NVS was detached.
3803 			 * that it is lost, after the NVS was detached.
3804 			 */
3805 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3806 		}
3807 
3808 		HN_UNLOCK(sc);
3809 		break;
3810 
3811 	case SIOCSIFFLAGS:
3812 		HN_LOCK(sc);
3813 
3814 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3815 			HN_UNLOCK(sc);
3816 			break;
3817 		}
3818 
3819 		if (hn_xpnt_vf_isready(sc))
3820 			hn_xpnt_vf_saveifflags(sc);
3821 
3822 		if (ifp->if_flags & IFF_UP) {
3823 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3824 				/*
3825 				 * Caller might hold a mutex, e.g.
3826 				 * bpf; use busy-wait for the RNDIS
3827 				 * reply.
3828 				 */
3829 				HN_NO_SLEEPING(sc);
3830 				hn_rxfilter_config(sc);
3831 				HN_SLEEPING_OK(sc);
3832 
3833 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3834 					error = hn_xpnt_vf_iocsetflags(sc);
3835 			} else {
3836 				hn_init_locked(sc);
3837 			}
3838 		} else {
3839 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3840 				hn_stop(sc, false);
3841 		}
3842 		sc->hn_if_flags = ifp->if_flags;
3843 
3844 		HN_UNLOCK(sc);
3845 		break;
3846 
3847 	case SIOCSIFCAP:
3848 		HN_LOCK(sc);
3849 
3850 		if (hn_xpnt_vf_isready(sc)) {
3851 			ifr_vf = *ifr;
3852 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3853 			    sizeof(ifr_vf.ifr_name));
3854 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3855 			HN_UNLOCK(sc);
3856 			break;
3857 		}
3858 
3859 		/*
3860 		 * Fix up requested capabilities w/ supported capabilities,
3861 		 * since the supported capabilities could have been changed.
3862 		 */
3863 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3864 		    ifp->if_capenable;
3865 
3866 		if (mask & IFCAP_TXCSUM) {
3867 			ifp->if_capenable ^= IFCAP_TXCSUM;
3868 			if (ifp->if_capenable & IFCAP_TXCSUM)
3869 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3870 			else
3871 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3872 		}
3873 		if (mask & IFCAP_TXCSUM_IPV6) {
3874 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3875 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3876 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3877 			else
3878 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3879 		}
3880 
3881 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3882 		if (mask & IFCAP_RXCSUM)
3883 			ifp->if_capenable ^= IFCAP_RXCSUM;
3884 #ifdef foo
3885 		/* We can't distinguish IPv6 packets from IPv4 packets on RX path. */
3886 		if (mask & IFCAP_RXCSUM_IPV6)
3887 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3888 #endif
3889 
3890 		if (mask & IFCAP_LRO)
3891 			ifp->if_capenable ^= IFCAP_LRO;
3892 
3893 		if (mask & IFCAP_TSO4) {
3894 			ifp->if_capenable ^= IFCAP_TSO4;
3895 			if (ifp->if_capenable & IFCAP_TSO4)
3896 				ifp->if_hwassist |= CSUM_IP_TSO;
3897 			else
3898 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3899 		}
3900 		if (mask & IFCAP_TSO6) {
3901 			ifp->if_capenable ^= IFCAP_TSO6;
3902 			if (ifp->if_capenable & IFCAP_TSO6)
3903 				ifp->if_hwassist |= CSUM_IP6_TSO;
3904 			else
3905 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3906 		}
3907 
3908 		HN_UNLOCK(sc);
3909 		break;
3910 
3911 	case SIOCADDMULTI:
3912 	case SIOCDELMULTI:
3913 		HN_LOCK(sc);
3914 
3915 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3916 			HN_UNLOCK(sc);
3917 			break;
3918 		}
3919 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3920 			/*
3921 			 * Multicast uses a mutex; use busy-wait for
3922 			 * the RNDIS reply.
3923 			 */
3924 			HN_NO_SLEEPING(sc);
3925 			hn_rxfilter_config(sc);
3926 			HN_SLEEPING_OK(sc);
3927 		}
3928 
3929 		/* XXX vlan(4) style mcast addr maintenance */
3930 		if (hn_xpnt_vf_isready(sc)) {
3931 			int old_if_flags;
3932 
3933 			old_if_flags = sc->hn_vf_ifp->if_flags;
3934 			hn_xpnt_vf_saveifflags(sc);
3935 
3936 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3937 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3938 			     IFF_ALLMULTI))
3939 				error = hn_xpnt_vf_iocsetflags(sc);
3940 		}
3941 
3942 		HN_UNLOCK(sc);
3943 		break;
3944 
3945 	case SIOCSIFMEDIA:
3946 	case SIOCGIFMEDIA:
3947 		HN_LOCK(sc);
3948 		if (hn_xpnt_vf_isready(sc)) {
3949 			/*
3950 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3951 			 * create and pass ifr_vf to the VF here; just
3952 			 * replace the ifr_name.
3953 			 */
3954 			vf_ifp = sc->hn_vf_ifp;
3955 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3956 			    sizeof(ifr->ifr_name));
3957 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3958 			/* Restore the ifr_name. */
3959 			strlcpy(ifr->ifr_name, ifp->if_xname,
3960 			    sizeof(ifr->ifr_name));
3961 			HN_UNLOCK(sc);
3962 			break;
3963 		}
3964 		HN_UNLOCK(sc);
3965 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3966 		break;
3967 
3968 	case SIOCGIFRSSHASH:
3969 		ifrh = (struct ifrsshash *)data;
3970 		HN_LOCK(sc);
3971 		if (sc->hn_rx_ring_inuse == 1) {
3972 			HN_UNLOCK(sc);
3973 			ifrh->ifrh_func = RSS_FUNC_NONE;
3974 			ifrh->ifrh_types = 0;
3975 			break;
3976 		}
3977 
3978 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3979 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3980 		else
3981 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3982 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3983 		HN_UNLOCK(sc);
3984 		break;
3985 
3986 	case SIOCGIFRSSKEY:
3987 		ifrk = (struct ifrsskey *)data;
3988 		HN_LOCK(sc);
3989 		if (sc->hn_rx_ring_inuse == 1) {
3990 			HN_UNLOCK(sc);
3991 			ifrk->ifrk_func = RSS_FUNC_NONE;
3992 			ifrk->ifrk_keylen = 0;
3993 			break;
3994 		}
3995 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3996 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3997 		else
3998 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3999 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4000 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4001 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4002 		HN_UNLOCK(sc);
4003 		break;
4004 
4005 	default:
4006 		error = ether_ioctl(ifp, cmd, data);
4007 		break;
4008 	}
4009 	return (error);
4010 }
4011 
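/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable polling, switch the
 * datapath back to the synthetic device and bring a transparent-mode VF
 * down if one is enabled, then suspend data transfers.
 */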
4012 static void
4013 hn_stop(struct hn_softc *sc, bool detaching)
4014 {
4015 	struct ifnet *ifp = sc->hn_ifp;
4016 	int i;
4017 
4018 	HN_LOCK_ASSERT(sc);
4019 
4020 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4021 	    ("synthetic parts were not attached"));
4022 
4023 	/* Clear RUNNING bit ASAP. */
4024 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4025 
4026 	/* Disable polling. */
4027 	hn_polling(sc, 0);
4028 
4029 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4030 		KASSERT(sc->hn_vf_ifp != NULL,
4031 		    ("%s: VF is not attached", ifp->if_xname));
4032 
4033 		/* Mark transparent mode VF as disabled. */
4034 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4035 
4036 		/*
4037 		 * NOTE:
4038 		 * Datapath setting must happen _before_ bringing
4039 		 * the VF down.
4040 		 */
4041 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4042 
4043 		/*
4044 		 * Bring the VF down.
4045 		 */
4046 		hn_xpnt_vf_saveifflags(sc);
4047 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4048 		hn_xpnt_vf_iocsetflags(sc);
4049 	}
4050 
4051 	/* Suspend data transfers. */
4052 	hn_suspend_data(sc);
4053 
4054 	/* Clear OACTIVE bit. */
4055 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4056 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4057 		sc->hn_tx_ring[i].hn_oactive = 0;
4058 
4059 	/*
4060 	 * If the non-transparent mode VF is active, make sure
4061 	 * that the RX filter still allows packet reception.
4062 	 */
4063 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4064 		hn_rxfilter_config(sc);
4065 }
4066 
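/*
 * Bring the interface up with HN_LOCK held: program the RX filter, clear
 * the OACTIVE and TX-suspended state, initialize the transparent-mode VF
 * if it is ready, and re-enable polling if a polling rate is configured.
 */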
4067 static void
4068 hn_init_locked(struct hn_softc *sc)
4069 {
4070 	struct ifnet *ifp = sc->hn_ifp;
4071 	int i;
4072 
4073 	HN_LOCK_ASSERT(sc);
4074 
4075 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4076 		return;
4077 
4078 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4079 		return;
4080 
4081 	/* Configure RX filter */
4082 	hn_rxfilter_config(sc);
4083 
4084 	/* Clear OACTIVE bit. */
4085 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4086 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4087 		sc->hn_tx_ring[i].hn_oactive = 0;
4088 
4089 	/* Clear TX 'suspended' bit. */
4090 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4091 
4092 	if (hn_xpnt_vf_isready(sc)) {
4093 		/* Initialize transparent VF. */
4094 		hn_xpnt_vf_init(sc);
4095 	}
4096 
4097 	/* Everything is ready; unleash! */
4098 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4099 
4100 	/* Re-enable polling if requested. */
4101 	if (sc->hn_pollhz > 0)
4102 		hn_polling(sc, sc->hn_pollhz);
4103 }
4104 
4105 static void
4106 hn_init(void *xsc)
4107 {
4108 	struct hn_softc *sc = xsc;
4109 
4110 	HN_LOCK(sc);
4111 	hn_init_locked(sc);
4112 	HN_UNLOCK(sc);
4113 }
4114 
4115 #if __FreeBSD_version >= 1100099
4116 
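/*
 * Sysctl handler for the LRO aggregation length limit, exposed below as
 * dev.hn.UNIT.lro_length_lim.  For example (unit number and value are
 * illustrative only):
 *	sysctl dev.hn.0.lro_length_lim=57344
 * The new value must pass the range check below, i.e. be within
 * [HN_LRO_LENLIM_MIN(ifp), TCP_LRO_LENGTH_MAX].
 */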
4117 static int
4118 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4119 {
4120 	struct hn_softc *sc = arg1;
4121 	unsigned int lenlim;
4122 	int error;
4123 
4124 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4125 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4126 	if (error || req->newptr == NULL)
4127 		return error;
4128 
4129 	HN_LOCK(sc);
4130 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4131 	    lenlim > TCP_LRO_LENGTH_MAX) {
4132 		HN_UNLOCK(sc);
4133 		return EINVAL;
4134 	}
4135 	hn_set_lro_lenlim(sc, lenlim);
4136 	HN_UNLOCK(sc);
4137 
4138 	return 0;
4139 }
4140 
4141 static int
4142 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4143 {
4144 	struct hn_softc *sc = arg1;
4145 	int ackcnt, error, i;
4146 
4147 	/*
4148 	 * lro_ackcnt_lim is the append count limit;
4149 	 * +1 turns it into the aggregation limit.
4150 	 */
4151 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4152 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4153 	if (error || req->newptr == NULL)
4154 		return error;
4155 
4156 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4157 		return EINVAL;
4158 
4159 	/*
4160 	 * Convert aggregation limit back to append
4161 	 * count limit.
4162 	 */
4163 	--ackcnt;
4164 	HN_LOCK(sc);
4165 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4166 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4167 	HN_UNLOCK(sc);
4168 	return 0;
4169 }
4170 
4171 #endif
4172 
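/*
 * Sysctl handler backing dev.hn.UNIT.trust_hosttcp/trust_hostudp/
 * trust_hostip: toggles whether host-side checksum verification
 * (selected by arg2) is trusted when RXCSUM info is missing, for all
 * RX rings.
 */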
4173 static int
4174 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4175 {
4176 	struct hn_softc *sc = arg1;
4177 	int hcsum = arg2;
4178 	int on, error, i;
4179 
4180 	on = 0;
4181 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4182 		on = 1;
4183 
4184 	error = sysctl_handle_int(oidp, &on, 0, req);
4185 	if (error || req->newptr == NULL)
4186 		return error;
4187 
4188 	HN_LOCK(sc);
4189 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4190 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4191 
4192 		if (on)
4193 			rxr->hn_trust_hcsum |= hcsum;
4194 		else
4195 			rxr->hn_trust_hcsum &= ~hcsum;
4196 	}
4197 	HN_UNLOCK(sc);
4198 	return 0;
4199 }
4200 
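/*
 * Sysctl handler for dev.hn.UNIT.tx_chimney_size: the chimney (TXBUF)
 * send size limit.  The new value must be within (0, hn_chim_szmax] and
 * is applied to all TX rings.
 */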
4201 static int
4202 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4203 {
4204 	struct hn_softc *sc = arg1;
4205 	int chim_size, error;
4206 
4207 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4208 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4209 	if (error || req->newptr == NULL)
4210 		return error;
4211 
4212 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4213 		return EINVAL;
4214 
4215 	HN_LOCK(sc);
4216 	hn_set_chim_size(sc, chim_size);
4217 	HN_UNLOCK(sc);
4218 	return 0;
4219 }
4220 
4221 #if __FreeBSD_version < 1100095
4222 static int
4223 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4224 {
4225 	struct hn_softc *sc = arg1;
4226 	int ofs = arg2, i, error;
4227 	struct hn_rx_ring *rxr;
4228 	uint64_t stat;
4229 
4230 	stat = 0;
4231 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4232 		rxr = &sc->hn_rx_ring[i];
4233 		stat += *((int *)((uint8_t *)rxr + ofs));
4234 	}
4235 
4236 	error = sysctl_handle_64(oidp, &stat, 0, req);
4237 	if (error || req->newptr == NULL)
4238 		return error;
4239 
4240 	/* Zero out this stat. */
4241 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4242 		rxr = &sc->hn_rx_ring[i];
4243 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4244 	}
4245 	return 0;
4246 }
4247 #else
4248 static int
4249 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4250 {
4251 	struct hn_softc *sc = arg1;
4252 	int ofs = arg2, i, error;
4253 	struct hn_rx_ring *rxr;
4254 	uint64_t stat;
4255 
4256 	stat = 0;
4257 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4258 		rxr = &sc->hn_rx_ring[i];
4259 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4260 	}
4261 
4262 	error = sysctl_handle_64(oidp, &stat, 0, req);
4263 	if (error || req->newptr == NULL)
4264 		return error;
4265 
4266 	/* Zero out this stat. */
4267 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4268 		rxr = &sc->hn_rx_ring[i];
4269 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4270 	}
4271 	return 0;
4272 }
4273 
4274 #endif
4275 
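/*
 * Sysctl handler that sums a u_long RX ring statistic, selected by its
 * byte offset (arg2), across all RX rings; any write resets the
 * per-ring counters to zero.
 */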
4276 static int
4277 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4278 {
4279 	struct hn_softc *sc = arg1;
4280 	int ofs = arg2, i, error;
4281 	struct hn_rx_ring *rxr;
4282 	u_long stat;
4283 
4284 	stat = 0;
4285 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4286 		rxr = &sc->hn_rx_ring[i];
4287 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4288 	}
4289 
4290 	error = sysctl_handle_long(oidp, &stat, 0, req);
4291 	if (error || req->newptr == NULL)
4292 		return error;
4293 
4294 	/* Zero out this stat. */
4295 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4296 		rxr = &sc->hn_rx_ring[i];
4297 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4298 	}
4299 	return 0;
4300 }
4301 
4302 static int
4303 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4304 {
4305 	struct hn_softc *sc = arg1;
4306 	int ofs = arg2, i, error;
4307 	struct hn_tx_ring *txr;
4308 	u_long stat;
4309 
4310 	stat = 0;
4311 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4312 		txr = &sc->hn_tx_ring[i];
4313 		stat += *((u_long *)((uint8_t *)txr + ofs));
4314 	}
4315 
4316 	error = sysctl_handle_long(oidp, &stat, 0, req);
4317 	if (error || req->newptr == NULL)
4318 		return error;
4319 
4320 	/* Zero out this stat. */
4321 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4322 		txr = &sc->hn_tx_ring[i];
4323 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4324 	}
4325 	return 0;
4326 }
4327 
4328 static int
4329 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4330 {
4331 	struct hn_softc *sc = arg1;
4332 	int ofs = arg2, i, error, conf;
4333 	struct hn_tx_ring *txr;
4334 
4335 	txr = &sc->hn_tx_ring[0];
4336 	conf = *((int *)((uint8_t *)txr + ofs));
4337 
4338 	error = sysctl_handle_int(oidp, &conf, 0, req);
4339 	if (error || req->newptr == NULL)
4340 		return error;
4341 
4342 	HN_LOCK(sc);
4343 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4344 		txr = &sc->hn_tx_ring[i];
4345 		*((int *)((uint8_t *)txr + ofs)) = conf;
4346 	}
4347 	HN_UNLOCK(sc);
4348 
4349 	return 0;
4350 }
4351 
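/*
 * Sysctl handler for the requested TX aggregation size; the new value
 * is applied through hn_set_txagg().
 */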
4352 static int
4353 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4354 {
4355 	struct hn_softc *sc = arg1;
4356 	int error, size;
4357 
4358 	size = sc->hn_agg_size;
4359 	error = sysctl_handle_int(oidp, &size, 0, req);
4360 	if (error || req->newptr == NULL)
4361 		return (error);
4362 
4363 	HN_LOCK(sc);
4364 	sc->hn_agg_size = size;
4365 	hn_set_txagg(sc);
4366 	HN_UNLOCK(sc);
4367 
4368 	return (0);
4369 }
4370 
4371 static int
4372 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4373 {
4374 	struct hn_softc *sc = arg1;
4375 	int error, pkts;
4376 
4377 	pkts = sc->hn_agg_pkts;
4378 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4379 	if (error || req->newptr == NULL)
4380 		return (error);
4381 
4382 	HN_LOCK(sc);
4383 	sc->hn_agg_pkts = pkts;
4384 	hn_set_txagg(sc);
4385 	HN_UNLOCK(sc);
4386 
4387 	return (0);
4388 }
4389 
4390 static int
4391 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4392 {
4393 	struct hn_softc *sc = arg1;
4394 	int pkts;
4395 
4396 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4397 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4398 }
4399 
4400 static int
4401 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4402 {
4403 	struct hn_softc *sc = arg1;
4404 	int align;
4405 
4406 	align = sc->hn_tx_ring[0].hn_agg_align;
4407 	return (sysctl_handle_int(oidp, &align, 0, req));
4408 }
4409 
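/*
 * Set the polling rate of a single VMBus channel; pollhz == 0 disables
 * polling on that channel.
 */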
4410 static void
4411 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4412 {
4413 	if (pollhz == 0)
4414 		vmbus_chan_poll_disable(chan);
4415 	else
4416 		vmbus_chan_poll_enable(chan, pollhz);
4417 }
4418 
4419 static void
4420 hn_polling(struct hn_softc *sc, u_int pollhz)
4421 {
4422 	int nsubch = sc->hn_rx_ring_inuse - 1;
4423 
4424 	HN_LOCK_ASSERT(sc);
4425 
4426 	if (nsubch > 0) {
4427 		struct vmbus_channel **subch;
4428 		int i;
4429 
4430 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4431 		for (i = 0; i < nsubch; ++i)
4432 			hn_chan_polling(subch[i], pollhz);
4433 		vmbus_subchan_rel(subch, nsubch);
4434 	}
4435 	hn_chan_polling(sc->hn_prichan, pollhz);
4436 }
4437 
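/*
 * Sysctl handler for the per-device polling rate (in Hz): 0 disables
 * polling, otherwise the value must be within
 * [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX].
 */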
4438 static int
4439 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4440 {
4441 	struct hn_softc *sc = arg1;
4442 	int pollhz, error;
4443 
4444 	pollhz = sc->hn_pollhz;
4445 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4446 	if (error || req->newptr == NULL)
4447 		return (error);
4448 
4449 	if (pollhz != 0 &&
4450 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4451 		return (EINVAL);
4452 
4453 	HN_LOCK(sc);
4454 	if (sc->hn_pollhz != pollhz) {
4455 		sc->hn_pollhz = pollhz;
4456 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4457 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4458 			hn_polling(sc, sc->hn_pollhz);
4459 	}
4460 	HN_UNLOCK(sc);
4461 
4462 	return (0);
4463 }
4464 
4465 static int
4466 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4467 {
4468 	struct hn_softc *sc = arg1;
4469 	char verstr[16];
4470 
4471 	snprintf(verstr, sizeof(verstr), "%u.%u",
4472 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4473 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4474 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4475 }
4476 
4477 static int
4478 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4479 {
4480 	struct hn_softc *sc = arg1;
4481 	char caps_str[128];
4482 	uint32_t caps;
4483 
4484 	HN_LOCK(sc);
4485 	caps = sc->hn_caps;
4486 	HN_UNLOCK(sc);
4487 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4488 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4489 }
4490 
4491 static int
4492 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4493 {
4494 	struct hn_softc *sc = arg1;
4495 	char assist_str[128];
4496 	uint32_t hwassist;
4497 
4498 	HN_LOCK(sc);
4499 	hwassist = sc->hn_ifp->if_hwassist;
4500 	HN_UNLOCK(sc);
4501 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4502 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4503 }
4504 
4505 static int
4506 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4507 {
4508 	struct hn_softc *sc = arg1;
4509 	char filter_str[128];
4510 	uint32_t filter;
4511 
4512 	HN_LOCK(sc);
4513 	filter = sc->hn_rx_filter;
4514 	HN_UNLOCK(sc);
4515 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4516 	    NDIS_PACKET_TYPES);
4517 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4518 }
4519 
4520 #ifndef RSS
4521 
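/*
 * Sysctl handler for the RSS key: reads return the current key; writes
 * install a new key and reconfigure RSS, unless a VF is attached, in
 * which case the key is kept in sync with the VF and may not be changed.
 */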
4522 static int
4523 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4524 {
4525 	struct hn_softc *sc = arg1;
4526 	int error;
4527 
4528 	HN_LOCK(sc);
4529 
4530 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4531 	if (error || req->newptr == NULL)
4532 		goto back;
4533 
4534 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4535 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4536 		/*
4537 		 * RSS key is synchronized w/ the VF's; don't allow users
4538 		 * to change it.
4539 		 */
4540 		error = EBUSY;
4541 		goto back;
4542 	}
4543 
4544 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4545 	if (error)
4546 		goto back;
4547 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4548 
4549 	if (sc->hn_rx_ring_inuse > 1) {
4550 		error = hn_rss_reconfig(sc);
4551 	} else {
4552 		/* Not RSS capable, at least for now; just save the RSS key. */
4553 		error = 0;
4554 	}
4555 back:
4556 	HN_UNLOCK(sc);
4557 	return (error);
4558 }
4559 
4560 static int
4561 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4562 {
4563 	struct hn_softc *sc = arg1;
4564 	int error;
4565 
4566 	HN_LOCK(sc);
4567 
4568 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4569 	if (error || req->newptr == NULL)
4570 		goto back;
4571 
4572 	/*
4573 	 * Don't allow RSS indirect table changes if this interface is not
4574 	 * currently RSS capable.
4575 	 */
4576 	if (sc->hn_rx_ring_inuse == 1) {
4577 		error = EOPNOTSUPP;
4578 		goto back;
4579 	}
4580 
4581 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4582 	if (error)
4583 		goto back;
4584 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4585 
4586 	hn_rss_ind_fixup(sc);
4587 	error = hn_rss_reconfig(sc);
4588 back:
4589 	HN_UNLOCK(sc);
4590 	return (error);
4591 }
4592 
4593 #endif	/* !RSS */
4594 
4595 static int
4596 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4597 {
4598 	struct hn_softc *sc = arg1;
4599 	char hash_str[128];
4600 	uint32_t hash;
4601 
4602 	HN_LOCK(sc);
4603 	hash = sc->hn_rss_hash;
4604 	HN_UNLOCK(sc);
4605 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4606 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4607 }
4608 
4609 static int
4610 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4611 {
4612 	struct hn_softc *sc = arg1;
4613 	char hash_str[128];
4614 	uint32_t hash;
4615 
4616 	HN_LOCK(sc);
4617 	hash = sc->hn_rss_hcap;
4618 	HN_UNLOCK(sc);
4619 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4620 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4621 }
4622 
4623 static int
4624 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4625 {
4626 	struct hn_softc *sc = arg1;
4627 	char hash_str[128];
4628 	uint32_t hash;
4629 
4630 	HN_LOCK(sc);
4631 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4632 	HN_UNLOCK(sc);
4633 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4634 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4635 }
4636 
4637 static int
4638 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4639 {
4640 	struct hn_softc *sc = arg1;
4641 	char vf_name[IFNAMSIZ + 1];
4642 	struct ifnet *vf_ifp;
4643 
4644 	HN_LOCK(sc);
4645 	vf_name[0] = '\0';
4646 	vf_ifp = sc->hn_vf_ifp;
4647 	if (vf_ifp != NULL)
4648 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4649 	HN_UNLOCK(sc);
4650 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4651 }
4652 
4653 static int
4654 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4655 {
4656 	struct hn_softc *sc = arg1;
4657 	char vf_name[IFNAMSIZ + 1];
4658 	struct ifnet *vf_ifp;
4659 
4660 	HN_LOCK(sc);
4661 	vf_name[0] = '\0';
4662 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4663 	if (vf_ifp != NULL)
4664 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4665 	HN_UNLOCK(sc);
4666 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4667 }
4668 
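/*
 * Sysctl handler that lists the interface names of all VFs currently
 * recorded in the global hn_vfmap, separated by spaces.
 */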
4669 static int
4670 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4671 {
4672 	struct rm_priotracker pt;
4673 	struct sbuf *sb;
4674 	int error, i;
4675 	bool first;
4676 
4677 	error = sysctl_wire_old_buffer(req, 0);
4678 	if (error != 0)
4679 		return (error);
4680 
4681 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4682 	if (sb == NULL)
4683 		return (ENOMEM);
4684 
4685 	rm_rlock(&hn_vfmap_lock, &pt);
4686 
4687 	first = true;
4688 	for (i = 0; i < hn_vfmap_size; ++i) {
4689 		struct ifnet *ifp;
4690 
4691 		if (hn_vfmap[i] == NULL)
4692 			continue;
4693 
4694 		ifp = ifnet_byindex(i);
4695 		if (ifp != NULL) {
4696 			if (first)
4697 				sbuf_printf(sb, "%s", ifp->if_xname);
4698 			else
4699 				sbuf_printf(sb, " %s", ifp->if_xname);
4700 			first = false;
4701 		}
4702 	}
4703 
4704 	rm_runlock(&hn_vfmap_lock, &pt);
4705 
4706 	error = sbuf_finish(sb);
4707 	sbuf_delete(sb);
4708 	return (error);
4709 }
4710 
4711 static int
4712 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4713 {
4714 	struct rm_priotracker pt;
4715 	struct sbuf *sb;
4716 	int error, i;
4717 	bool first;
4718 
4719 	error = sysctl_wire_old_buffer(req, 0);
4720 	if (error != 0)
4721 		return (error);
4722 
4723 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4724 	if (sb == NULL)
4725 		return (ENOMEM);
4726 
4727 	rm_rlock(&hn_vfmap_lock, &pt);
4728 
4729 	first = true;
4730 	for (i = 0; i < hn_vfmap_size; ++i) {
4731 		struct ifnet *ifp, *hn_ifp;
4732 
4733 		hn_ifp = hn_vfmap[i];
4734 		if (hn_ifp == NULL)
4735 			continue;
4736 
4737 		ifp = ifnet_byindex(i);
4738 		if (ifp != NULL) {
4739 			if (first) {
4740 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4741 				    hn_ifp->if_xname);
4742 			} else {
4743 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4744 				    hn_ifp->if_xname);
4745 			}
4746 			first = false;
4747 		}
4748 	}
4749 
4750 	rm_runlock(&hn_vfmap_lock, &pt);
4751 
4752 	error = sbuf_finish(sb);
4753 	sbuf_delete(sb);
4754 	return (error);
4755 }
4756 
4757 static int
4758 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4759 {
4760 	struct hn_softc *sc = arg1;
4761 	int error, onoff = 0;
4762 
4763 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4764 		onoff = 1;
4765 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4766 	if (error || req->newptr == NULL)
4767 		return (error);
4768 
4769 	HN_LOCK(sc);
4770 	/* NOTE: hn_vf_lock for hn_transmit() */
4771 	rm_wlock(&sc->hn_vf_lock);
4772 	if (onoff)
4773 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4774 	else
4775 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4776 	rm_wunlock(&sc->hn_vf_lock);
4777 	HN_UNLOCK(sc);
4778 
4779 	return (0);
4780 }
4781 
4782 static int
4783 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4784 {
4785 	struct hn_softc *sc = arg1;
4786 	int enabled = 0;
4787 
4788 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4789 		enabled = 1;
4790 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4791 }
4792 
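/*
 * Validate an IPv4 packet starting at mbuf offset 'hoff': the IP header
 * (and the TCP/UDP header, if applicable) must be fully contained in the
 * first mbuf and the advertised lengths must be consistent.  Returns the
 * IP protocol, or IPPROTO_DONE if the packet should not be parsed
 * further (e.g. fragments or truncated headers).
 */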
4793 static int
4794 hn_check_iplen(const struct mbuf *m, int hoff)
4795 {
4796 	const struct ip *ip;
4797 	int len, iphlen, iplen;
4798 	const struct tcphdr *th;
4799 	int thoff;				/* TCP data offset */
4800 
4801 	len = hoff + sizeof(struct ip);
4802 
4803 	/* The packet must be at least the size of an IP header. */
4804 	if (m->m_pkthdr.len < len)
4805 		return IPPROTO_DONE;
4806 
4807 	/* The fixed IP header must reside completely in the first mbuf. */
4808 	if (m->m_len < len)
4809 		return IPPROTO_DONE;
4810 
4811 	ip = mtodo(m, hoff);
4812 
4813 	/* Bound check the packet's stated IP header length. */
4814 	iphlen = ip->ip_hl << 2;
4815 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4816 		return IPPROTO_DONE;
4817 
4818 	/* The full IP header must reside completely in the one mbuf. */
4819 	if (m->m_len < hoff + iphlen)
4820 		return IPPROTO_DONE;
4821 
4822 	iplen = ntohs(ip->ip_len);
4823 
4824 	/*
4825 	 * Check that the amount of data in the buffers is at
4826 	 * least as much as the IP header would have us expect.
4827 	 */
4828 	if (m->m_pkthdr.len < hoff + iplen)
4829 		return IPPROTO_DONE;
4830 
4831 	/*
4832 	 * Ignore IP fragments.
4833 	 */
4834 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4835 		return IPPROTO_DONE;
4836 
4837 	/*
4838 	 * The TCP/IP or UDP/IP header must be entirely contained within
4839 	 * the first fragment of a packet.
4840 	 */
4841 	switch (ip->ip_p) {
4842 	case IPPROTO_TCP:
4843 		if (iplen < iphlen + sizeof(struct tcphdr))
4844 			return IPPROTO_DONE;
4845 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4846 			return IPPROTO_DONE;
4847 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4848 		thoff = th->th_off << 2;
4849 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4850 			return IPPROTO_DONE;
4851 		if (m->m_len < hoff + iphlen + thoff)
4852 			return IPPROTO_DONE;
4853 		break;
4854 	case IPPROTO_UDP:
4855 		if (iplen < iphlen + sizeof(struct udphdr))
4856 			return IPPROTO_DONE;
4857 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4858 			return IPPROTO_DONE;
4859 		break;
4860 	default:
4861 		if (iplen < iphlen)
4862 			return IPPROTO_DONE;
4863 		break;
4864 	}
4865 	return ip->ip_p;
4866 }
4867 
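/*
 * Determine the L3 ethertype and L4 IP protocol of a received frame,
 * skipping an optional VLAN header; *l4proto is set to IPPROTO_DONE for
 * non-IPv4 frames and for frames rejected by hn_check_iplen().
 */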
4868 static void
4869 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4870 {
4871 	const struct ether_header *eh;
4872 	uint16_t etype;
4873 	int hoff;
4874 
4875 	hoff = sizeof(*eh);
4876 	/* The caller has already checked the Ethernet header length. */
4877 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4878 
4879 	eh = mtod(m_new, const struct ether_header *);
4880 	etype = ntohs(eh->ether_type);
4881 	if (etype == ETHERTYPE_VLAN) {
4882 		const struct ether_vlan_header *evl;
4883 
4884 		hoff = sizeof(*evl);
4885 		if (m_new->m_len < hoff)
4886 			return;
4887 		evl = mtod(m_new, const struct ether_vlan_header *);
4888 		etype = ntohs(evl->evl_proto);
4889 	}
4890 	*l3proto = etype;
4891 
4892 	if (etype == ETHERTYPE_IP)
4893 		*l4proto = hn_check_iplen(m_new, hoff);
4894 	else
4895 		*l4proto = IPPROTO_DONE;
4896 }
4897 
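/*
 * Allocate the RXBUF shared by all channels, set up the per-ring
 * bufrings, packet buffers and LRO state, and create the
 * dev.hn.UNIT.rx sysctl tree along with the RX statistics sysctls.
 */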
4898 static int
4899 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4900 {
4901 	struct sysctl_oid_list *child;
4902 	struct sysctl_ctx_list *ctx;
4903 	device_t dev = sc->hn_dev;
4904 #if defined(INET) || defined(INET6)
4905 #if __FreeBSD_version >= 1100095
4906 	int lroent_cnt;
4907 #endif
4908 #endif
4909 	int i;
4910 
4911 	/*
4912 	 * Create RXBUF for reception.
4913 	 *
4914 	 * NOTE:
4915 	 * - It is shared by all channels.
4916 	 * - A large enough buffer is allocated; certain versions of NVS
4917 	 *   may further limit the usable space.
4918 	 */
4919 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4920 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4921 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4922 	if (sc->hn_rxbuf == NULL) {
4923 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4924 		return (ENOMEM);
4925 	}
4926 
4927 	sc->hn_rx_ring_cnt = ring_cnt;
4928 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4929 
4930 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4931 	    M_DEVBUF, M_WAITOK | M_ZERO);
4932 
4933 #if defined(INET) || defined(INET6)
4934 #if __FreeBSD_version >= 1100095
4935 	lroent_cnt = hn_lro_entry_count;
4936 	if (lroent_cnt < TCP_LRO_ENTRIES)
4937 		lroent_cnt = TCP_LRO_ENTRIES;
4938 	if (bootverbose)
4939 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4940 #endif
4941 #endif	/* INET || INET6 */
4942 
4943 	ctx = device_get_sysctl_ctx(dev);
4944 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4945 
4946 	/* Create dev.hn.UNIT.rx sysctl tree */
4947 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4948 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4949 
4950 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4951 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4952 
4953 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4954 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4955 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4956 		if (rxr->hn_br == NULL) {
4957 			device_printf(dev, "allocate bufring failed\n");
4958 			return (ENOMEM);
4959 		}
4960 
4961 		if (hn_trust_hosttcp)
4962 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4963 		if (hn_trust_hostudp)
4964 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4965 		if (hn_trust_hostip)
4966 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4967 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4968 		rxr->hn_ifp = sc->hn_ifp;
4969 		if (i < sc->hn_tx_ring_cnt)
4970 			rxr->hn_txr = &sc->hn_tx_ring[i];
4971 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4972 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4973 		rxr->hn_rx_idx = i;
4974 		rxr->hn_rxbuf = sc->hn_rxbuf;
4975 
4976 		/*
4977 		 * Initialize LRO.
4978 		 */
4979 #if defined(INET) || defined(INET6)
4980 #if __FreeBSD_version >= 1100095
4981 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4982 		    hn_lro_mbufq_depth);
4983 #else
4984 		tcp_lro_init(&rxr->hn_lro);
4985 		rxr->hn_lro.ifp = sc->hn_ifp;
4986 #endif
4987 #if __FreeBSD_version >= 1100099
4988 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4989 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4990 #endif
4991 #endif	/* INET || INET6 */
4992 
4993 		if (sc->hn_rx_sysctl_tree != NULL) {
4994 			char name[16];
4995 
4996 			/*
4997 			 * Create per RX ring sysctl tree:
4998 			 * dev.hn.UNIT.rx.RINGID
4999 			 */
5000 			snprintf(name, sizeof(name), "%d", i);
5001 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5002 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5003 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5004 
5005 			if (rxr->hn_rx_sysctl_tree != NULL) {
5006 				SYSCTL_ADD_ULONG(ctx,
5007 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5008 				    OID_AUTO, "packets", CTLFLAG_RW,
5009 				    &rxr->hn_pkts, "# of packets received");
5010 				SYSCTL_ADD_ULONG(ctx,
5011 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5012 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5013 				    &rxr->hn_rss_pkts,
5014 				    "# of packets w/ RSS info received");
5015 				SYSCTL_ADD_INT(ctx,
5016 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5017 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5018 				    &rxr->hn_pktbuf_len, 0,
5019 				    "Temporary channel packet buffer length");
5020 			}
5021 		}
5022 	}
5023 
5024 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5025 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5026 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5027 #if __FreeBSD_version < 1100095
5028 	    hn_rx_stat_int_sysctl,
5029 #else
5030 	    hn_rx_stat_u64_sysctl,
5031 #endif
5032 	    "LU", "LRO queued");
5033 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5034 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5035 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5036 #if __FreeBSD_version < 1100095
5037 	    hn_rx_stat_int_sysctl,
5038 #else
5039 	    hn_rx_stat_u64_sysctl,
5040 #endif
5041 	    "LU", "LRO flushed");
5042 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5043 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5044 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5045 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5046 #if __FreeBSD_version >= 1100099
5047 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5048 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5049 	    hn_lro_lenlim_sysctl, "IU",
5050 	    "Max # of data bytes to be aggregated by LRO");
5051 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5052 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5053 	    hn_lro_ackcnt_sysctl, "I",
5054 	    "Max # of ACKs to be aggregated by LRO");
5055 #endif
5056 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5057 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5058 	    hn_trust_hcsum_sysctl, "I",
5059 	    "Trust tcp segment verification on host side, "
5060 	    "when csum info is missing");
5061 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5062 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5063 	    hn_trust_hcsum_sysctl, "I",
5064 	    "Trust udp datagram verification on host side, "
5065 	    "when csum info is missing");
5066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5067 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5068 	    hn_trust_hcsum_sysctl, "I",
5069 	    "Trust ip packet verification on host side, "
5070 	    "when csum info is missing");
5071 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5072 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5073 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5074 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5075 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5076 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5077 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5078 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5079 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5080 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5081 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5082 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5083 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5084 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5085 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5086 	    hn_rx_stat_ulong_sysctl, "LU",
5087 	    "# of packets for which the host's csum verification is trusted");
5088 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5089 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5090 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5091 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5092 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5093 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5094 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5095 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5096 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5097 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5098 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5099 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5100 
5101 	return (0);
5102 }
5103 
5104 static void
5105 hn_destroy_rx_data(struct hn_softc *sc)
5106 {
5107 	int i;
5108 
5109 	if (sc->hn_rxbuf != NULL) {
5110 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5111 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5112 		else
5113 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5114 		sc->hn_rxbuf = NULL;
5115 	}
5116 
5117 	if (sc->hn_rx_ring_cnt == 0)
5118 		return;
5119 
5120 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5121 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5122 
5123 		if (rxr->hn_br == NULL)
5124 			continue;
5125 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5126 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5127 		} else {
5128 			device_printf(sc->hn_dev,
5129 			    "%dth channel bufring is referenced\n", i);
5130 		}
5131 		rxr->hn_br = NULL;
5132 
5133 #if defined(INET) || defined(INET6)
5134 		tcp_lro_free(&rxr->hn_lro);
5135 #endif
5136 		free(rxr->hn_pktbuf, M_DEVBUF);
5137 	}
5138 	free(sc->hn_rx_ring, M_DEVBUF);
5139 	sc->hn_rx_ring = NULL;
5140 
5141 	sc->hn_rx_ring_cnt = 0;
5142 	sc->hn_rx_ring_inuse = 0;
5143 }
5144 
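/*
 * Set up one TX ring: allocate its txdesc array and buf_rings, select
 * the TX taskqueue, create the busdma tags and maps for RNDIS packet
 * messages and packet data, and add the per-ring sysctl tree.
 */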
5145 static int
5146 hn_tx_ring_create(struct hn_softc *sc, int id)
5147 {
5148 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5149 	device_t dev = sc->hn_dev;
5150 	bus_dma_tag_t parent_dtag;
5151 	int error, i;
5152 
5153 	txr->hn_sc = sc;
5154 	txr->hn_tx_idx = id;
5155 
5156 #ifndef HN_USE_TXDESC_BUFRING
5157 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5158 #endif
5159 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5160 
5161 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5162 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5163 	    M_DEVBUF, M_WAITOK | M_ZERO);
5164 #ifndef HN_USE_TXDESC_BUFRING
5165 	SLIST_INIT(&txr->hn_txlist);
5166 #else
5167 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5168 	    M_WAITOK, &txr->hn_tx_lock);
5169 #endif
5170 
5171 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5172 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5173 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5174 	} else {
5175 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5176 	}
5177 
5178 #ifdef HN_IFSTART_SUPPORT
5179 	if (hn_use_if_start) {
5180 		txr->hn_txeof = hn_start_txeof;
5181 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5182 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5183 	} else
5184 #endif
5185 	{
5186 		int br_depth;
5187 
5188 		txr->hn_txeof = hn_xmit_txeof;
5189 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5190 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5191 
5192 		br_depth = hn_get_txswq_depth(txr);
5193 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5194 		    M_WAITOK, &txr->hn_tx_lock);
5195 	}
5196 
5197 	txr->hn_direct_tx_size = hn_direct_tx_size;
5198 
5199 	/*
5200 	 * Always schedule transmission instead of trying to do direct
5201 	 * transmission.  This one gives the best performance so far.
5202 	 */
5203 	txr->hn_sched_tx = 1;
5204 
5205 	parent_dtag = bus_get_dma_tag(dev);
5206 
5207 	/* DMA tag for RNDIS packet messages. */
5208 	error = bus_dma_tag_create(parent_dtag, /* parent */
5209 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5210 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5211 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5212 	    BUS_SPACE_MAXADDR,		/* highaddr */
5213 	    NULL, NULL,			/* filter, filterarg */
5214 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5215 	    1,				/* nsegments */
5216 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5217 	    0,				/* flags */
5218 	    NULL,			/* lockfunc */
5219 	    NULL,			/* lockfuncarg */
5220 	    &txr->hn_tx_rndis_dtag);
5221 	if (error) {
5222 		device_printf(dev, "failed to create rndis dmatag\n");
5223 		return error;
5224 	}
5225 
5226 	/* DMA tag for data. */
5227 	error = bus_dma_tag_create(parent_dtag, /* parent */
5228 	    1,				/* alignment */
5229 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5230 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5231 	    BUS_SPACE_MAXADDR,		/* highaddr */
5232 	    NULL, NULL,			/* filter, filterarg */
5233 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5234 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5235 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5236 	    0,				/* flags */
5237 	    NULL,			/* lockfunc */
5238 	    NULL,			/* lockfuncarg */
5239 	    &txr->hn_tx_data_dtag);
5240 	if (error) {
5241 		device_printf(dev, "failed to create data dmatag\n");
5242 		return error;
5243 	}
5244 
5245 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5246 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5247 
5248 		txd->txr = txr;
5249 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5250 		STAILQ_INIT(&txd->agg_list);
5251 
5252 		/*
5253 		 * Allocate and load RNDIS packet message.
5254 		 */
5255         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5256 		    (void **)&txd->rndis_pkt,
5257 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5258 		    &txd->rndis_pkt_dmap);
5259 		if (error) {
5260 			device_printf(dev,
5261 			    "failed to allocate rndis_packet_msg, %d\n", i);
5262 			return error;
5263 		}
5264 
5265 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5266 		    txd->rndis_pkt_dmap,
5267 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5268 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5269 		    BUS_DMA_NOWAIT);
5270 		if (error) {
5271 			device_printf(dev,
5272 			    "failed to load rndis_packet_msg, %d\n", i);
5273 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5274 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5275 			return error;
5276 		}
5277 
5278 		/* DMA map for TX data. */
5279 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5280 		    &txd->data_dmap);
5281 		if (error) {
5282 			device_printf(dev,
5283 			    "failed to allocate tx data dmamap\n");
5284 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5285 			    txd->rndis_pkt_dmap);
5286 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5287 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5288 			return error;
5289 		}
5290 
5291 		/* All set, put it to list */
5292 		txd->flags |= HN_TXD_FLAG_ONLIST;
5293 #ifndef HN_USE_TXDESC_BUFRING
5294 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5295 #else
5296 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5297 #endif
5298 	}
5299 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5300 
5301 	if (sc->hn_tx_sysctl_tree != NULL) {
5302 		struct sysctl_oid_list *child;
5303 		struct sysctl_ctx_list *ctx;
5304 		char name[16];
5305 
5306 		/*
5307 		 * Create per TX ring sysctl tree:
5308 		 * dev.hn.UNIT.tx.RINGID
5309 		 */
5310 		ctx = device_get_sysctl_ctx(dev);
5311 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5312 
5313 		snprintf(name, sizeof(name), "%d", id);
5314 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5315 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5316 
5317 		if (txr->hn_tx_sysctl_tree != NULL) {
5318 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5319 
5320 #ifdef HN_DEBUG
5321 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5322 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5323 			    "# of available TX descs");
5324 #endif
5325 #ifdef HN_IFSTART_SUPPORT
5326 			if (!hn_use_if_start)
5327 #endif
5328 			{
5329 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5330 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5331 				    "over active");
5332 			}
5333 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5334 			    CTLFLAG_RW, &txr->hn_pkts,
5335 			    "# of packets transmitted");
5336 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5337 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5338 		}
5339 	}
5340 
5341 	return 0;
5342 }
5343 
5344 static void
5345 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5346 {
5347 	struct hn_tx_ring *txr = txd->txr;
5348 
5349 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5350 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5351 
5352 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5353 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5354 	    txd->rndis_pkt_dmap);
5355 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5356 }
5357 
5358 static void
5359 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5360 {
5361 
5362 	KASSERT(txd->refs == 0 || txd->refs == 1,
5363 	    ("invalid txd refs %d", txd->refs));
5364 
5365 	/* Aggregated txds will be freed by their aggregating txd. */
5366 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5367 		int freed;
5368 
5369 		freed = hn_txdesc_put(txr, txd);
5370 		KASSERT(freed, ("can't free txdesc"));
5371 	}
5372 }
5373 
5374 static void
5375 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5376 {
5377 	int i;
5378 
5379 	if (txr->hn_txdesc == NULL)
5380 		return;
5381 
5382 	/*
5383 	 * NOTE:
5384 	 * Because the freeing of aggregated txds will be deferred
5385 	 * to the aggregating txd, two passes are used here:
5386 	 * - The first pass GCes any pending txds.  This GC is necessary,
5387 	 *   since if the channels are revoked, the hypervisor will not
5388 	 *   deliver send-done for all pending txds.
5389 	 * - The second pass frees the busdma resources, i.e. after all
5390 	 *   txds have been freed.
5391 	 */
5392 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5393 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5394 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5395 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5396 
5397 	if (txr->hn_tx_data_dtag != NULL)
5398 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5399 	if (txr->hn_tx_rndis_dtag != NULL)
5400 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5401 
5402 #ifdef HN_USE_TXDESC_BUFRING
5403 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5404 #endif
5405 
5406 	free(txr->hn_txdesc, M_DEVBUF);
5407 	txr->hn_txdesc = NULL;
5408 
5409 	if (txr->hn_mbuf_br != NULL)
5410 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5411 
5412 #ifndef HN_USE_TXDESC_BUFRING
5413 	mtx_destroy(&txr->hn_txlist_spin);
5414 #endif
5415 	mtx_destroy(&txr->hn_tx_lock);
5416 }
5417 
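/*
 * Allocate the TXBUF used for chimney sending, create all TX rings and
 * the dev.hn.UNIT.tx sysctl tree, then attach the TX statistics and
 * tuning sysctls.
 */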
5418 static int
5419 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5420 {
5421 	struct sysctl_oid_list *child;
5422 	struct sysctl_ctx_list *ctx;
5423 	int i;
5424 
5425 	/*
5426 	 * Create TXBUF for chimney sending.
5427 	 *
5428 	 * NOTE: It is shared by all channels.
5429 	 */
5430 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5431 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5432 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5433 	if (sc->hn_chim == NULL) {
5434 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5435 		return (ENOMEM);
5436 	}
5437 
5438 	sc->hn_tx_ring_cnt = ring_cnt;
5439 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5440 
5441 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5442 	    M_DEVBUF, M_WAITOK | M_ZERO);
5443 
5444 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5445 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5446 
5447 	/* Create dev.hn.UNIT.tx sysctl tree */
5448 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5449 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5450 
5451 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5452 		int error;
5453 
5454 		error = hn_tx_ring_create(sc, i);
5455 		if (error)
5456 			return error;
5457 	}
5458 
5459 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5460 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5461 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5462 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5463 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5464 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5465 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5466 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5467 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5468 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5469 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5470 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5471 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5472 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5473 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5474 	    hn_tx_stat_ulong_sysctl, "LU",
5475 	    "# of packet transmission aggregation flush failures");
5476 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5477 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5478 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5479 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5480 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5481 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5482 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5483 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5484 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5485 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5486 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5487 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5488 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5489 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5490 	    "# of total TX descs");
5491 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5492 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5493 	    "Chimney send packet size upper boundary");
5494 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5495 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5496 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5497 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5498 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5499 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5500 	    hn_tx_conf_int_sysctl, "I",
5501 	    "Size of the packet for direct transmission");
5502 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5503 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5504 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5505 	    hn_tx_conf_int_sysctl, "I",
5506 	    "Always schedule transmission "
5507 	    "instead of doing direct transmission");
5508 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5509 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5510 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5511 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5512 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5513 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5514 	    "Applied packet transmission aggregation size");
5515 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5516 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5517 	    hn_txagg_pktmax_sysctl, "I",
5518 	    "Applied packet transmission aggregation packets");
5519 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5520 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5521 	    hn_txagg_align_sysctl, "I",
5522 	    "Applied packet transmission aggregation alignment");
5523 
5524 	return 0;
5525 }
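
/*
 * Illustrative sketch (hedged, not taken from the driver itself): the
 * sysctl knobs created above hang off the device's sysctl tree, so,
 * assuming a hypothetical unit hn0, the chimney send size limit could
 * be inspected or tuned from userland roughly like:
 *
 *	sysctl dev.hn.0.tx_chimney_size
 *	sysctl dev.hn.0.tx_chimney_size=4096
 *
 * The read-only nodes (e.g. txdesc_cnt, agg_szmax) report the values
 * applied to the first TX ring.
 */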
5526 
5527 static void
5528 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5529 {
5530 	int i;
5531 
5532 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5533 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5534 }
5535 
5536 static void
5537 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5538 {
5539 	struct ifnet *ifp = sc->hn_ifp;
5540 	u_int hw_tsomax;
5541 	int tso_minlen;
5542 
5543 	HN_LOCK_ASSERT(sc);
5544 
5545 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5546 		return;
5547 
5548 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5549 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5550 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5551 
5552 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5553 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5554 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5555 
5556 	if (tso_maxlen < tso_minlen)
5557 		tso_maxlen = tso_minlen;
5558 	else if (tso_maxlen > IP_MAXPACKET)
5559 		tso_maxlen = IP_MAXPACKET;
5560 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5561 		tso_maxlen = sc->hn_ndis_tso_szmax;
5562 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5563 
5564 	if (hn_xpnt_vf_isready(sc)) {
5565 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5566 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5567 	}
5568 	ifp->if_hw_tsomax = hw_tsomax;
5569 	if (bootverbose)
5570 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5571 }
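
/*
 * Illustrative sketch of the clamping above (hypothetical numbers):
 * with hn_ndis_tso_sgmin == 2 and mtu == 1500, tso_minlen is 3000, so
 * tso_maxlen is clamped into [3000, IP_MAXPACKET] and further capped
 * by hn_ndis_tso_szmax; the advertised value is then
 *
 *	if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)
 *	             = tso_maxlen - 18
 *
 * i.e. the Ethernet and VLAN header bytes are excluded, and when the
 * transparent VF is ready the VF's own if_hw_tsomax caps it as well.
 */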
5572 
5573 static void
5574 hn_fixup_tx_data(struct hn_softc *sc)
5575 {
5576 	uint64_t csum_assist;
5577 	int i;
5578 
5579 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5580 	if (hn_tx_chimney_size > 0 &&
5581 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5582 		hn_set_chim_size(sc, hn_tx_chimney_size);
5583 
5584 	csum_assist = 0;
5585 	if (sc->hn_caps & HN_CAP_IPCS)
5586 		csum_assist |= CSUM_IP;
5587 	if (sc->hn_caps & HN_CAP_TCP4CS)
5588 		csum_assist |= CSUM_IP_TCP;
5589 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5590 		csum_assist |= CSUM_IP_UDP;
5591 	if (sc->hn_caps & HN_CAP_TCP6CS)
5592 		csum_assist |= CSUM_IP6_TCP;
5593 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5594 		csum_assist |= CSUM_IP6_UDP;
5595 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5596 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5597 
5598 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5599 		/*
5600 		 * Support HASHVAL pktinfo on TX path.
5601 		 */
5602 		if (bootverbose)
5603 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5604 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5605 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5606 	}
5607 }
5608 
5609 static void
5610 hn_fixup_rx_data(struct hn_softc *sc)
5611 {
5612 
5613 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5614 		int i;
5615 
5616 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5617 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5618 	}
5619 }
5620 
5621 static void
5622 hn_destroy_tx_data(struct hn_softc *sc)
5623 {
5624 	int i;
5625 
5626 	if (sc->hn_chim != NULL) {
5627 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5628 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5629 		} else {
5630 			device_printf(sc->hn_dev,
5631 			    "chimney sending buffer is referenced");
5632 		}
5633 		sc->hn_chim = NULL;
5634 	}
5635 
5636 	if (sc->hn_tx_ring_cnt == 0)
5637 		return;
5638 
5639 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5640 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5641 
5642 	free(sc->hn_tx_ring, M_DEVBUF);
5643 	sc->hn_tx_ring = NULL;
5644 
5645 	sc->hn_tx_ring_cnt = 0;
5646 	sc->hn_tx_ring_inuse = 0;
5647 }
5648 
5649 #ifdef HN_IFSTART_SUPPORT
5650 
5651 static void
5652 hn_start_taskfunc(void *xtxr, int pending __unused)
5653 {
5654 	struct hn_tx_ring *txr = xtxr;
5655 
5656 	mtx_lock(&txr->hn_tx_lock);
5657 	hn_start_locked(txr, 0);
5658 	mtx_unlock(&txr->hn_tx_lock);
5659 }
5660 
5661 static int
5662 hn_start_locked(struct hn_tx_ring *txr, int len)
5663 {
5664 	struct hn_softc *sc = txr->hn_sc;
5665 	struct ifnet *ifp = sc->hn_ifp;
5666 	int sched = 0;
5667 
5668 	KASSERT(hn_use_if_start,
5669 	    ("hn_start_locked is called, when if_start is disabled"));
5670 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5671 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5672 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5673 
5674 	if (__predict_false(txr->hn_suspended))
5675 		return (0);
5676 
5677 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5678 	    IFF_DRV_RUNNING)
5679 		return (0);
5680 
5681 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5682 		struct hn_txdesc *txd;
5683 		struct mbuf *m_head;
5684 		int error;
5685 
5686 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5687 		if (m_head == NULL)
5688 			break;
5689 
5690 		if (len > 0 && m_head->m_pkthdr.len > len) {
5691 			/*
5692 			 * This sending could be time-consuming; let callers
5693 			 * dispatch this packet sending (and the sending of any
5694 			 * follow-up packets) to the TX taskqueue.
5695 			 */
5696 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5697 			sched = 1;
5698 			break;
5699 		}
5700 
5701 #if defined(INET6) || defined(INET)
5702 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5703 			m_head = hn_tso_fixup(m_head);
5704 			if (__predict_false(m_head == NULL)) {
5705 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5706 				continue;
5707 			}
5708 		} else if (m_head->m_pkthdr.csum_flags &
5709 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5710 			m_head = hn_set_hlen(m_head);
5711 			if (__predict_false(m_head == NULL)) {
5712 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5713 				continue;
5714 			}
5715 		}
5716 #endif
5717 
5718 		txd = hn_txdesc_get(txr);
5719 		if (txd == NULL) {
5720 			txr->hn_no_txdescs++;
5721 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5722 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5723 			break;
5724 		}
5725 
5726 		error = hn_encap(ifp, txr, txd, &m_head);
5727 		if (error) {
5728 			/* Both txd and m_head are freed */
5729 			KASSERT(txr->hn_agg_txd == NULL,
5730 			    ("encap failed w/ pending aggregating txdesc"));
5731 			continue;
5732 		}
5733 
5734 		if (txr->hn_agg_pktleft == 0) {
5735 			if (txr->hn_agg_txd != NULL) {
5736 				KASSERT(m_head == NULL,
5737 				    ("pending mbuf for aggregating txdesc"));
5738 				error = hn_flush_txagg(ifp, txr);
5739 				if (__predict_false(error)) {
5740 					atomic_set_int(&ifp->if_drv_flags,
5741 					    IFF_DRV_OACTIVE);
5742 					break;
5743 				}
5744 			} else {
5745 				KASSERT(m_head != NULL, ("mbuf was freed"));
5746 				error = hn_txpkt(ifp, txr, txd);
5747 				if (__predict_false(error)) {
5748 					/* txd is freed, but m_head is not */
5749 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5750 					atomic_set_int(&ifp->if_drv_flags,
5751 					    IFF_DRV_OACTIVE);
5752 					break;
5753 				}
5754 			}
5755 		}
5756 #ifdef INVARIANTS
5757 		else {
5758 			KASSERT(txr->hn_agg_txd != NULL,
5759 			    ("no aggregating txdesc"));
5760 			KASSERT(m_head == NULL,
5761 			    ("pending mbuf for aggregating txdesc"));
5762 		}
5763 #endif
5764 	}
5765 
5766 	/* Flush pending aggregated transmission. */
5767 	if (txr->hn_agg_txd != NULL)
5768 		hn_flush_txagg(ifp, txr);
5769 	return (sched);
5770 }
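
/*
 * A descriptive note on the return value above: hn_start_locked()
 * returns non-zero only when it stopped early on a packet larger than
 * the direct-transmit threshold, which tells the caller to push the
 * remaining work to the per-ring TX taskqueue instead of sending from
 * the current context.
 */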
5771 
5772 static void
5773 hn_start(struct ifnet *ifp)
5774 {
5775 	struct hn_softc *sc = ifp->if_softc;
5776 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5777 
5778 	if (txr->hn_sched_tx)
5779 		goto do_sched;
5780 
5781 	if (mtx_trylock(&txr->hn_tx_lock)) {
5782 		int sched;
5783 
5784 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5785 		mtx_unlock(&txr->hn_tx_lock);
5786 		if (!sched)
5787 			return;
5788 	}
5789 do_sched:
5790 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5791 }
5792 
5793 static void
5794 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5795 {
5796 	struct hn_tx_ring *txr = xtxr;
5797 
5798 	mtx_lock(&txr->hn_tx_lock);
5799 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5800 	hn_start_locked(txr, 0);
5801 	mtx_unlock(&txr->hn_tx_lock);
5802 }
5803 
5804 static void
5805 hn_start_txeof(struct hn_tx_ring *txr)
5806 {
5807 	struct hn_softc *sc = txr->hn_sc;
5808 	struct ifnet *ifp = sc->hn_ifp;
5809 
5810 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5811 
5812 	if (txr->hn_sched_tx)
5813 		goto do_sched;
5814 
5815 	if (mtx_trylock(&txr->hn_tx_lock)) {
5816 		int sched;
5817 
5818 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5819 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5820 		mtx_unlock(&txr->hn_tx_lock);
5821 		if (sched) {
5822 			taskqueue_enqueue(txr->hn_tx_taskq,
5823 			    &txr->hn_tx_task);
5824 		}
5825 	} else {
5826 do_sched:
5827 		/*
5828 		 * Release the OACTIVE earlier, with the hope that
5829 		 * others could catch up.  The task will clear the
5830 		 * flag again with the hn_tx_lock to avoid possible
5831 		 * races.
5832 		 */
5833 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5834 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5835 	}
5836 }
5837 
5838 #endif	/* HN_IFSTART_SUPPORT */
5839 
5840 static int
5841 hn_xmit(struct hn_tx_ring *txr, int len)
5842 {
5843 	struct hn_softc *sc = txr->hn_sc;
5844 	struct ifnet *ifp = sc->hn_ifp;
5845 	struct mbuf *m_head;
5846 	int sched = 0;
5847 
5848 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5849 #ifdef HN_IFSTART_SUPPORT
5850 	KASSERT(hn_use_if_start == 0,
5851 	    ("hn_xmit is called, when if_start is enabled"));
5852 #endif
5853 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5854 
5855 	if (__predict_false(txr->hn_suspended))
5856 		return (0);
5857 
5858 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5859 		return (0);
5860 
5861 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5862 		struct hn_txdesc *txd;
5863 		int error;
5864 
5865 		if (len > 0 && m_head->m_pkthdr.len > len) {
5866 			/*
5867 			 * This sending could be time-consuming; let callers
5868 			 * dispatch this packet sending (and the sending of any
5869 			 * follow-up packets) to the TX taskqueue.
5870 			 */
5871 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5872 			sched = 1;
5873 			break;
5874 		}
5875 
5876 		txd = hn_txdesc_get(txr);
5877 		if (txd == NULL) {
5878 			txr->hn_no_txdescs++;
5879 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5880 			txr->hn_oactive = 1;
5881 			break;
5882 		}
5883 
5884 		error = hn_encap(ifp, txr, txd, &m_head);
5885 		if (error) {
5886 			/* Both txd and m_head are freed; discard */
5887 			KASSERT(txr->hn_agg_txd == NULL,
5888 			    ("encap failed w/ pending aggregating txdesc"));
5889 			drbr_advance(ifp, txr->hn_mbuf_br);
5890 			continue;
5891 		}
5892 
5893 		if (txr->hn_agg_pktleft == 0) {
5894 			if (txr->hn_agg_txd != NULL) {
5895 				KASSERT(m_head == NULL,
5896 				    ("pending mbuf for aggregating txdesc"));
5897 				error = hn_flush_txagg(ifp, txr);
5898 				if (__predict_false(error)) {
5899 					txr->hn_oactive = 1;
5900 					break;
5901 				}
5902 			} else {
5903 				KASSERT(m_head != NULL, ("mbuf was freed"));
5904 				error = hn_txpkt(ifp, txr, txd);
5905 				if (__predict_false(error)) {
5906 					/* txd is freed, but m_head is not */
5907 					drbr_putback(ifp, txr->hn_mbuf_br,
5908 					    m_head);
5909 					txr->hn_oactive = 1;
5910 					break;
5911 				}
5912 			}
5913 		}
5914 #ifdef INVARIANTS
5915 		else {
5916 			KASSERT(txr->hn_agg_txd != NULL,
5917 			    ("no aggregating txdesc"));
5918 			KASSERT(m_head == NULL,
5919 			    ("pending mbuf for aggregating txdesc"));
5920 		}
5921 #endif
5922 
5923 		/* Sent */
5924 		drbr_advance(ifp, txr->hn_mbuf_br);
5925 	}
5926 
5927 	/* Flush pending aggregated transmission. */
5928 	if (txr->hn_agg_txd != NULL)
5929 		hn_flush_txagg(ifp, txr);
5930 	return (sched);
5931 }
5932 
5933 static int
5934 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5935 {
5936 	struct hn_softc *sc = ifp->if_softc;
5937 	struct hn_tx_ring *txr;
5938 	int error, idx = 0;
5939 
5940 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5941 		struct rm_priotracker pt;
5942 
5943 		rm_rlock(&sc->hn_vf_lock, &pt);
5944 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5945 			struct mbuf *m_bpf = NULL;
5946 			int obytes, omcast;
5947 
5948 			obytes = m->m_pkthdr.len;
5949 			omcast = (m->m_flags & M_MCAST) != 0;
5950 
5951 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5952 				if (bpf_peers_present(ifp->if_bpf)) {
5953 					m_bpf = m_copypacket(m, M_NOWAIT);
5954 					if (m_bpf == NULL) {
5955 						/*
5956 						 * Failed to grab a shallow
5957 						 * copy; tap now.
5958 						 */
5959 						ETHER_BPF_MTAP(ifp, m);
5960 					}
5961 				}
5962 			} else {
5963 				ETHER_BPF_MTAP(ifp, m);
5964 			}
5965 
5966 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5967 			rm_runlock(&sc->hn_vf_lock, &pt);
5968 
5969 			if (m_bpf != NULL) {
5970 				if (!error)
5971 					ETHER_BPF_MTAP(ifp, m_bpf);
5972 				m_freem(m_bpf);
5973 			}
5974 
5975 			if (error == ENOBUFS) {
5976 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5977 			} else if (error) {
5978 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5979 			} else {
5980 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5981 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5982 				if (omcast) {
5983 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5984 					    omcast);
5985 				}
5986 			}
5987 			return (error);
5988 		}
5989 		rm_runlock(&sc->hn_vf_lock, &pt);
5990 	}
5991 
5992 #if defined(INET6) || defined(INET)
5993 	/*
5994 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5995 	 * since packet headers should be cache-hot.
5996 	 */
5997 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5998 		m = hn_tso_fixup(m);
5999 		if (__predict_false(m == NULL)) {
6000 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6001 			return EIO;
6002 		}
6003 	} else if (m->m_pkthdr.csum_flags &
6004 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6005 		m = hn_set_hlen(m);
6006 		if (__predict_false(m == NULL)) {
6007 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6008 			return EIO;
6009 		}
6010 	}
6011 #endif
6012 
6013 	/*
6014 	 * Select the TX ring based on flowid
6015 	 */
6016 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6017 #ifdef RSS
6018 		uint32_t bid;
6019 
6020 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6021 		    &bid) == 0)
6022 			idx = bid % sc->hn_tx_ring_inuse;
6023 		else
6024 #endif
6025 		{
6026 #if defined(INET6) || defined(INET)
6027 			int tcpsyn = 0;
6028 
6029 			if (m->m_pkthdr.len < 128 &&
6030 			    (m->m_pkthdr.csum_flags &
6031 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6032 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6033 				m = hn_check_tcpsyn(m, &tcpsyn);
6034 				if (__predict_false(m == NULL)) {
6035 					if_inc_counter(ifp,
6036 					    IFCOUNTER_OERRORS, 1);
6037 					return (EIO);
6038 				}
6039 			}
6040 #else
6041 			const int tcpsyn = 0;
6042 #endif
6043 			if (tcpsyn)
6044 				idx = 0;
6045 			else
6046 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6047 		}
6048 	}
6049 	txr = &sc->hn_tx_ring[idx];
6050 
6051 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6052 	if (error) {
6053 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6054 		return error;
6055 	}
6056 
6057 	if (txr->hn_oactive)
6058 		return 0;
6059 
6060 	if (txr->hn_sched_tx)
6061 		goto do_sched;
6062 
6063 	if (mtx_trylock(&txr->hn_tx_lock)) {
6064 		int sched;
6065 
6066 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6067 		mtx_unlock(&txr->hn_tx_lock);
6068 		if (!sched)
6069 			return 0;
6070 	}
6071 do_sched:
6072 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6073 	return 0;
6074 }
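
/*
 * Illustrative sketch of the ring selection above (hypothetical
 * numbers): with a flow-hashed mbuf and hn_tx_ring_inuse == 4, the
 * non-RSS path picks
 *
 *	idx = m->m_pkthdr.flowid % 4
 *
 * while small (< 128 byte), non-TSO TCP packets are additionally
 * checked for the SYN flag and, if they are SYNs, steered to ring 0.
 * With the RSS option compiled in, the RSS bucket of the flowid is
 * used instead of the raw modulo whenever the hash type maps to a
 * bucket.
 */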
6075 
6076 static void
6077 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6078 {
6079 	struct mbuf *m;
6080 
6081 	mtx_lock(&txr->hn_tx_lock);
6082 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6083 		m_freem(m);
6084 	mtx_unlock(&txr->hn_tx_lock);
6085 }
6086 
6087 static void
6088 hn_xmit_qflush(struct ifnet *ifp)
6089 {
6090 	struct hn_softc *sc = ifp->if_softc;
6091 	struct rm_priotracker pt;
6092 	int i;
6093 
6094 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6095 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6096 	if_qflush(ifp);
6097 
6098 	rm_rlock(&sc->hn_vf_lock, &pt);
6099 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6100 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6101 	rm_runlock(&sc->hn_vf_lock, &pt);
6102 }
6103 
6104 static void
6105 hn_xmit_txeof(struct hn_tx_ring *txr)
6106 {
6107 
6108 	if (txr->hn_sched_tx)
6109 		goto do_sched;
6110 
6111 	if (mtx_trylock(&txr->hn_tx_lock)) {
6112 		int sched;
6113 
6114 		txr->hn_oactive = 0;
6115 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6116 		mtx_unlock(&txr->hn_tx_lock);
6117 		if (sched) {
6118 			taskqueue_enqueue(txr->hn_tx_taskq,
6119 			    &txr->hn_tx_task);
6120 		}
6121 	} else {
6122 do_sched:
6123 		/*
6124 		 * Release the oactive earlier, with the hope that
6125 		 * others could catch up.  The task will clear the
6126 		 * oactive again with the hn_tx_lock to avoid possible
6127 		 * races.
6128 		 */
6129 		txr->hn_oactive = 0;
6130 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6131 	}
6132 }
6133 
6134 static void
6135 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6136 {
6137 	struct hn_tx_ring *txr = xtxr;
6138 
6139 	mtx_lock(&txr->hn_tx_lock);
6140 	hn_xmit(txr, 0);
6141 	mtx_unlock(&txr->hn_tx_lock);
6142 }
6143 
6144 static void
6145 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6146 {
6147 	struct hn_tx_ring *txr = xtxr;
6148 
6149 	mtx_lock(&txr->hn_tx_lock);
6150 	txr->hn_oactive = 0;
6151 	hn_xmit(txr, 0);
6152 	mtx_unlock(&txr->hn_tx_lock);
6153 }
6154 
6155 static int
6156 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6157 {
6158 	struct vmbus_chan_br cbr;
6159 	struct hn_rx_ring *rxr;
6160 	struct hn_tx_ring *txr = NULL;
6161 	int idx, error;
6162 
6163 	idx = vmbus_chan_subidx(chan);
6164 
6165 	/*
6166 	 * Link this channel to RX/TX ring.
6167 	 */
6168 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6169 	    ("invalid channel index %d, should > 0 && < %d",
6170 	     idx, sc->hn_rx_ring_inuse));
6171 	rxr = &sc->hn_rx_ring[idx];
6172 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6173 	    ("RX ring %d already attached", idx));
6174 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6175 	rxr->hn_chan = chan;
6176 
6177 	if (bootverbose) {
6178 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6179 		    idx, vmbus_chan_id(chan));
6180 	}
6181 
6182 	if (idx < sc->hn_tx_ring_inuse) {
6183 		txr = &sc->hn_tx_ring[idx];
6184 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6185 		    ("TX ring %d already attached", idx));
6186 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6187 
6188 		txr->hn_chan = chan;
6189 		if (bootverbose) {
6190 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6191 			    idx, vmbus_chan_id(chan));
6192 		}
6193 	}
6194 
6195 	/* Bind this channel to a proper CPU. */
6196 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6197 
6198 	/*
6199 	 * Open this channel
6200 	 */
6201 	cbr.cbr = rxr->hn_br;
6202 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6203 	cbr.cbr_txsz = HN_TXBR_SIZE;
6204 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6205 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6206 	if (error) {
6207 		if (error == EISCONN) {
6208 			if_printf(sc->hn_ifp, "bufring is connected after "
6209 			    "chan%u open failure\n", vmbus_chan_id(chan));
6210 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6211 		} else {
6212 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6213 			    vmbus_chan_id(chan), error);
6214 		}
6215 	}
6216 	return (error);
6217 }
6218 
6219 static void
6220 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6221 {
6222 	struct hn_rx_ring *rxr;
6223 	int idx, error;
6224 
6225 	idx = vmbus_chan_subidx(chan);
6226 
6227 	/*
6228 	 * Link this channel to RX/TX ring.
6229 	 */
6230 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6231 	    ("invalid channel index %d, should > 0 && < %d",
6232 	     idx, sc->hn_rx_ring_inuse));
6233 	rxr = &sc->hn_rx_ring[idx];
6234 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6235 	    ("RX ring %d is not attached", idx));
6236 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6237 
6238 	if (idx < sc->hn_tx_ring_inuse) {
6239 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6240 
6241 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6242 		    ("TX ring %d is not attached attached", idx));
6243 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6244 	}
6245 
6246 	/*
6247 	 * Close this channel.
6248 	 *
6249 	 * NOTE:
6250 	 * Channel closing does _not_ destroy the target channel.
6251 	 */
6252 	error = vmbus_chan_close_direct(chan);
6253 	if (error == EISCONN) {
6254 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6255 		    "after being closed\n", vmbus_chan_id(chan));
6256 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6257 	} else if (error) {
6258 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6259 		    vmbus_chan_id(chan), error);
6260 	}
6261 }
6262 
6263 static int
6264 hn_attach_subchans(struct hn_softc *sc)
6265 {
6266 	struct vmbus_channel **subchans;
6267 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6268 	int i, error = 0;
6269 
6270 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6271 
6272 	/* Attach the sub-channels. */
6273 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6274 	for (i = 0; i < subchan_cnt; ++i) {
6275 		int error1;
6276 
6277 		error1 = hn_chan_attach(sc, subchans[i]);
6278 		if (error1) {
6279 			error = error1;
6280 			/* Move on; all channels will be detached later. */
6281 		}
6282 	}
6283 	vmbus_subchan_rel(subchans, subchan_cnt);
6284 
6285 	if (error) {
6286 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6287 	} else {
6288 		if (bootverbose) {
6289 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6290 			    subchan_cnt);
6291 		}
6292 	}
6293 	return (error);
6294 }
6295 
6296 static void
6297 hn_detach_allchans(struct hn_softc *sc)
6298 {
6299 	struct vmbus_channel **subchans;
6300 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6301 	int i;
6302 
6303 	if (subchan_cnt == 0)
6304 		goto back;
6305 
6306 	/* Detach the sub-channels. */
6307 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6308 	for (i = 0; i < subchan_cnt; ++i)
6309 		hn_chan_detach(sc, subchans[i]);
6310 	vmbus_subchan_rel(subchans, subchan_cnt);
6311 
6312 back:
6313 	/*
6314 	 * Detach the primary channel, _after_ all sub-channels
6315 	 * are detached.
6316 	 */
6317 	hn_chan_detach(sc, sc->hn_prichan);
6318 
6319 	/* Wait for sub-channels to be destroyed, if any. */
6320 	vmbus_subchan_drain(sc->hn_prichan);
6321 
6322 #ifdef INVARIANTS
6323 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6324 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6325 		    HN_RX_FLAG_ATTACHED) == 0,
6326 		    ("%dth RX ring is still attached", i));
6327 	}
6328 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6329 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6330 		    HN_TX_FLAG_ATTACHED) == 0,
6331 		    ("%dth TX ring is still attached", i));
6332 	}
6333 #endif
6334 }
6335 
6336 static int
6337 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6338 {
6339 	struct vmbus_channel **subchans;
6340 	int nchan, rxr_cnt, error;
6341 
6342 	nchan = *nsubch + 1;
6343 	if (nchan == 1) {
6344 		/*
6345 		 * Multiple RX/TX rings are not requested.
6346 		 */
6347 		*nsubch = 0;
6348 		return (0);
6349 	}
6350 
6351 	/*
6352 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6353 	 * table entries.
6354 	 */
6355 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6356 	if (error) {
6357 		/* No RSS; this is benign. */
6358 		*nsubch = 0;
6359 		return (0);
6360 	}
6361 	if (bootverbose) {
6362 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6363 		    rxr_cnt, nchan);
6364 	}
6365 
6366 	if (nchan > rxr_cnt)
6367 		nchan = rxr_cnt;
6368 	if (nchan == 1) {
6369 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6370 		*nsubch = 0;
6371 		return (0);
6372 	}
6373 
6374 	/*
6375 	 * Allocate sub-channels from NVS.
6376 	 */
6377 	*nsubch = nchan - 1;
6378 	error = hn_nvs_alloc_subchans(sc, nsubch);
6379 	if (error || *nsubch == 0) {
6380 		/* Failed to allocate sub-channels. */
6381 		*nsubch = 0;
6382 		return (0);
6383 	}
6384 
6385 	/*
6386 	 * Wait for all sub-channels to become ready before moving on.
6387 	 */
6388 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6389 	vmbus_subchan_rel(subchans, *nsubch);
6390 	return (0);
6391 }
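
/*
 * Illustrative sketch of the negotiation above (hypothetical numbers):
 * if the caller asks for 8 rings (*nsubch == 7 on entry) but the RNDIS
 * RSS capability query only offers 4 RX rings, nchan is trimmed to 4
 * and 3 sub-channels are requested from NVS.  NVS may grant fewer
 * still; *nsubch is updated to whatever was actually allocated, and 0
 * simply means single-channel operation.
 */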
6392 
6393 static bool
6394 hn_synth_attachable(const struct hn_softc *sc)
6395 {
6396 	int i;
6397 
6398 	if (sc->hn_flags & HN_FLAG_ERRORS)
6399 		return (false);
6400 
6401 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6402 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6403 
6404 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6405 			return (false);
6406 	}
6407 	return (true);
6408 }
6409 
6410 /*
6411  * Make sure that the RX filter is zero after the successful
6412  * RNDIS initialization.
6413  *
6414  * NOTE:
6415  * Under certain conditions on certain versions of Hyper-V,
6416  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6417  * after the successful RNDIS initialization, which breaks
6418  * the assumption of any following code (well, it breaks the
6419  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6420  * explicitly, drain packets sneaking through, and drain the
6421  * interrupt taskqueues scheduled due to the stealth packets.
6422  */
6423 static void
6424 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6425 {
6426 
6427 	hn_disable_rx(sc);
6428 	hn_drain_rxtx(sc, nchan);
6429 }
6430 
6431 static int
6432 hn_synth_attach(struct hn_softc *sc, int mtu)
6433 {
6434 #define ATTACHED_NVS		0x0002
6435 #define ATTACHED_RNDIS		0x0004
6436 
6437 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6438 	int error, nsubch, nchan = 1, i, rndis_inited;
6439 	uint32_t old_caps, attached = 0;
6440 
6441 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6442 	    ("synthetic parts were attached"));
6443 
6444 	if (!hn_synth_attachable(sc))
6445 		return (ENXIO);
6446 
6447 	/* Save capabilities for later verification. */
6448 	old_caps = sc->hn_caps;
6449 	sc->hn_caps = 0;
6450 
6451 	/* Clear RSS state. */
6452 	sc->hn_rss_ind_size = 0;
6453 	sc->hn_rss_hash = 0;
6454 	sc->hn_rss_hcap = 0;
6455 
6456 	/*
6457 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6458 	 */
6459 	error = hn_chan_attach(sc, sc->hn_prichan);
6460 	if (error)
6461 		goto failed;
6462 
6463 	/*
6464 	 * Attach NVS.
6465 	 */
6466 	error = hn_nvs_attach(sc, mtu);
6467 	if (error)
6468 		goto failed;
6469 	attached |= ATTACHED_NVS;
6470 
6471 	/*
6472 	 * Attach RNDIS _after_ NVS is attached.
6473 	 */
6474 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6475 	if (rndis_inited)
6476 		attached |= ATTACHED_RNDIS;
6477 	if (error)
6478 		goto failed;
6479 
6480 	/*
6481 	 * Make sure capabilities are not changed.
6482 	 */
6483 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6484 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6485 		    old_caps, sc->hn_caps);
6486 		error = ENXIO;
6487 		goto failed;
6488 	}
6489 
6490 	/*
6491 	 * Allocate sub-channels for multi-TX/RX rings.
6492 	 *
6493 	 * NOTE:
6494 	 * The # of RX rings that can be used is equivalent to the # of
6495 	 * channels to be requested.
6496 	 */
6497 	nsubch = sc->hn_rx_ring_cnt - 1;
6498 	error = hn_synth_alloc_subchans(sc, &nsubch);
6499 	if (error)
6500 		goto failed;
6501 	/* NOTE: _Full_ synthetic parts detach is required now. */
6502 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6503 
6504 	/*
6505 	 * Set the # of TX/RX rings that could be used according to
6506 	 * the # of channels that NVS offered.
6507 	 */
6508 	nchan = nsubch + 1;
6509 	hn_set_ring_inuse(sc, nchan);
6510 	if (nchan == 1) {
6511 		/* Only the primary channel can be used; done */
6512 		goto back;
6513 	}
6514 
6515 	/*
6516 	 * Attach the sub-channels.
6517 	 *
6518 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6519 	 */
6520 	error = hn_attach_subchans(sc);
6521 	if (error)
6522 		goto failed;
6523 
6524 	/*
6525 	 * Configure RSS key and indirect table _after_ all sub-channels
6526 	 * are attached.
6527 	 */
6528 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6529 		/*
6530 		 * RSS key is not set yet; set it to the default RSS key.
6531 		 */
6532 		if (bootverbose)
6533 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6534 #ifdef RSS
6535 		rss_getkey(rss->rss_key);
6536 #else
6537 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6538 #endif
6539 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6540 	}
6541 
6542 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6543 		/*
6544 		 * RSS indirect table is not set yet; set it up in round-
6545 		 * robin fashion.
6546 		 */
6547 		if (bootverbose) {
6548 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6549 			    "table\n");
6550 		}
6551 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6552 			uint32_t subidx;
6553 
6554 #ifdef RSS
6555 			subidx = rss_get_indirection_to_bucket(i);
6556 #else
6557 			subidx = i;
6558 #endif
6559 			rss->rss_ind[i] = subidx % nchan;
6560 		}
6561 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6562 	} else {
6563 		/*
6564 		 * # of usable channels may be changed, so we have to
6565 		 * make sure that all entries in RSS indirect table
6566 		 * are valid.
6567 		 *
6568 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6569 		 */
6570 		hn_rss_ind_fixup(sc);
6571 	}
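
	/*
	 * Illustrative sketch of the default indirect table above
	 * (hypothetical channel count): in the non-RSS build with 4
	 * usable channels, rss_ind[] becomes 0,1,2,3,0,1,2,3,... for
	 * all NDIS_HASH_INDCNT entries, spreading RX flows evenly
	 * across the channels unless a custom table has already been
	 * installed (HN_FLAG_HAS_RSSIND set).
	 */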
6572 
6573 	sc->hn_rss_hash = sc->hn_rss_hcap;
6574 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6575 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6576 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6577 		hn_vf_rss_fixup(sc, false);
6578 	}
6579 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6580 	if (error)
6581 		goto failed;
6582 back:
6583 	/*
6584 	 * Fixup transmission aggregation setup.
6585 	 */
6586 	hn_set_txagg(sc);
6587 	hn_rndis_init_fixat(sc, nchan);
6588 	return (0);
6589 
6590 failed:
6591 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6592 		hn_rndis_init_fixat(sc, nchan);
6593 		hn_synth_detach(sc);
6594 	} else {
6595 		if (attached & ATTACHED_RNDIS) {
6596 			hn_rndis_init_fixat(sc, nchan);
6597 			hn_rndis_detach(sc);
6598 		}
6599 		if (attached & ATTACHED_NVS)
6600 			hn_nvs_detach(sc);
6601 		hn_chan_detach(sc, sc->hn_prichan);
6602 		/* Restore old capabilities. */
6603 		sc->hn_caps = old_caps;
6604 	}
6605 	return (error);
6606 
6607 #undef ATTACHED_RNDIS
6608 #undef ATTACHED_NVS
6609 }
6610 
6611 /*
6612  * NOTE:
6613  * The interface must have been suspended through hn_suspend() before
6614  * this function gets called.
6615  */
6616 static void
6617 hn_synth_detach(struct hn_softc *sc)
6618 {
6619 
6620 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6621 	    ("synthetic parts were not attached"));
6622 
6623 	/* Detach the RNDIS first. */
6624 	hn_rndis_detach(sc);
6625 
6626 	/* Detach NVS. */
6627 	hn_nvs_detach(sc);
6628 
6629 	/* Detach all of the channels. */
6630 	hn_detach_allchans(sc);
6631 
6632 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6633 }
6634 
6635 static void
6636 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6637 {
6638 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6639 	    ("invalid ring count %d", ring_cnt));
6640 
6641 	if (sc->hn_tx_ring_cnt > ring_cnt)
6642 		sc->hn_tx_ring_inuse = ring_cnt;
6643 	else
6644 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6645 	sc->hn_rx_ring_inuse = ring_cnt;
6646 
6647 #ifdef RSS
6648 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6649 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6650 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6651 		    rss_getnumbuckets());
6652 	}
6653 #endif
6654 
6655 	if (bootverbose) {
6656 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6657 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6658 	}
6659 }
6660 
6661 static void
6662 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6663 {
6664 
6665 	/*
6666 	 * NOTE:
6667 	 * The TX bufring will not be drained by the hypervisor,
6668 	 * if the primary channel is revoked.
6669 	 */
6670 	while (!vmbus_chan_rx_empty(chan) ||
6671 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6672 	     !vmbus_chan_tx_empty(chan)))
6673 		pause("waitch", 1);
6674 	vmbus_chan_intr_drain(chan);
6675 }
6676 
6677 static void
6678 hn_disable_rx(struct hn_softc *sc)
6679 {
6680 
6681 	/*
6682 	 * Disable RX by clearing RX filter forcefully.
6683 	 */
6684 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6685 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6686 
6687 	/*
6688 	 * Give RNDIS enough time to flush all pending data packets.
6689 	 */
6690 	pause("waitrx", (200 * hz) / 1000);
6691 }
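
/*
 * A descriptive note on the arithmetic above: (200 * hz) / 1000 ticks
 * is 200 milliseconds regardless of the kernel HZ setting, e.g. 20
 * ticks at hz == 100 or 200 ticks at hz == 1000.
 */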
6692 
6693 /*
6694  * NOTE:
6695  * RX/TX _must_ have been suspended/disabled, before this function
6696  * is called.
6697  */
6698 static void
6699 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6700 {
6701 	struct vmbus_channel **subch = NULL;
6702 	int nsubch;
6703 
6704 	/*
6705 	 * Drain RX/TX bufrings and interrupts.
6706 	 */
6707 	nsubch = nchan - 1;
6708 	if (nsubch > 0)
6709 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6710 
6711 	if (subch != NULL) {
6712 		int i;
6713 
6714 		for (i = 0; i < nsubch; ++i)
6715 			hn_chan_drain(sc, subch[i]);
6716 	}
6717 	hn_chan_drain(sc, sc->hn_prichan);
6718 
6719 	if (subch != NULL)
6720 		vmbus_subchan_rel(subch, nsubch);
6721 }
6722 
6723 static void
6724 hn_suspend_data(struct hn_softc *sc)
6725 {
6726 	struct hn_tx_ring *txr;
6727 	int i;
6728 
6729 	HN_LOCK_ASSERT(sc);
6730 
6731 	/*
6732 	 * Suspend TX.
6733 	 */
6734 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6735 		txr = &sc->hn_tx_ring[i];
6736 
6737 		mtx_lock(&txr->hn_tx_lock);
6738 		txr->hn_suspended = 1;
6739 		mtx_unlock(&txr->hn_tx_lock);
6740 		/* No one is able to send more packets now. */
6741 
6742 		/*
6743 		 * Wait for all pending sends to finish.
6744 		 *
6745 		 * NOTE:
6746 		 * We will _not_ receive all pending send-done, if the
6747 		 * primary channel is revoked.
6748 		 */
6749 		while (hn_tx_ring_pending(txr) &&
6750 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6751 			pause("hnwtx", 1 /* 1 tick */);
6752 	}
6753 
6754 	/*
6755 	 * Disable RX.
6756 	 */
6757 	hn_disable_rx(sc);
6758 
6759 	/*
6760 	 * Drain RX/TX.
6761 	 */
6762 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6763 
6764 	/*
6765 	 * Drain any pending TX tasks.
6766 	 *
6767 	 * NOTE:
6768 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6769 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6770 	 */
6771 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6772 		txr = &sc->hn_tx_ring[i];
6773 
6774 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6775 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6776 	}
6777 }
6778 
6779 static void
6780 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6781 {
6782 
6783 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6784 }
6785 
6786 static void
6787 hn_suspend_mgmt(struct hn_softc *sc)
6788 {
6789 	struct task task;
6790 
6791 	HN_LOCK_ASSERT(sc);
6792 
6793 	/*
6794 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6795 	 * through hn_mgmt_taskq.
6796 	 */
6797 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6798 	vmbus_chan_run_task(sc->hn_prichan, &task);
6799 
6800 	/*
6801 	 * Make sure that all pending management tasks are completed.
6802 	 */
6803 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6804 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6805 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6806 }
6807 
6808 static void
6809 hn_suspend(struct hn_softc *sc)
6810 {
6811 
6812 	/* Disable polling. */
6813 	hn_polling(sc, 0);
6814 
6815 	/*
6816 	 * If the non-transparent mode VF is activated, the synthetic
6817 	 * device is receiving packets, so the data path of the
6818 	 * synthetic device must be suspended.
6819 	 */
6820 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6821 	    (sc->hn_flags & HN_FLAG_RXVF))
6822 		hn_suspend_data(sc);
6823 	hn_suspend_mgmt(sc);
6824 }
6825 
6826 static void
6827 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6828 {
6829 	int i;
6830 
6831 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6832 	    ("invalid TX ring count %d", tx_ring_cnt));
6833 
6834 	for (i = 0; i < tx_ring_cnt; ++i) {
6835 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6836 
6837 		mtx_lock(&txr->hn_tx_lock);
6838 		txr->hn_suspended = 0;
6839 		mtx_unlock(&txr->hn_tx_lock);
6840 	}
6841 }
6842 
6843 static void
6844 hn_resume_data(struct hn_softc *sc)
6845 {
6846 	int i;
6847 
6848 	HN_LOCK_ASSERT(sc);
6849 
6850 	/*
6851 	 * Re-enable RX.
6852 	 */
6853 	hn_rxfilter_config(sc);
6854 
6855 	/*
6856 	 * Make sure to clear suspend status on "all" TX rings,
6857 	 * since hn_tx_ring_inuse can be changed after
6858 	 * hn_suspend_data().
6859 	 */
6860 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6861 
6862 #ifdef HN_IFSTART_SUPPORT
6863 	if (!hn_use_if_start)
6864 #endif
6865 	{
6866 		/*
6867 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6868 		 * reduced.
6869 		 */
6870 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6871 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6872 	}
6873 
6874 	/*
6875 	 * Kick start TX.
6876 	 */
6877 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6878 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6879 
6880 		/*
6881 		 * Use txeof task, so that any pending oactive can be
6882 		 * cleared properly.
6883 		 */
6884 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6885 	}
6886 }
6887 
6888 static void
6889 hn_resume_mgmt(struct hn_softc *sc)
6890 {
6891 
6892 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6893 
6894 	/*
6895 	 * Kick off network change detection, if it was pending.
6896 	 * If no network change was pending, start link status
6897 	 * checks, which is more lightweight than network change
6898 	 * detection.
6899 	 */
6900 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6901 		hn_change_network(sc);
6902 	else
6903 		hn_update_link_status(sc);
6904 }
6905 
6906 static void
6907 hn_resume(struct hn_softc *sc)
6908 {
6909 
6910 	/*
6911 	 * If the non-transparent mode VF is activated, the synthetic
6912 	 * device has to receive packets, so the data path of the
6913 	 * synthetic device must be resumed.
6914 	 */
6915 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6916 	    (sc->hn_flags & HN_FLAG_RXVF))
6917 		hn_resume_data(sc);
6918 
6919 	/*
6920 	 * Don't resume link status change if VF is attached/activated.
6921 	 * - In the non-transparent VF mode, the synthetic device marks
6922 	 *   link down until the VF is deactivated; i.e. VF is down.
6923 	 * - In transparent VF mode, VF's media status is used until
6924 	 *   the VF is detached.
6925 	 */
6926 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6927 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6928 		hn_resume_mgmt(sc);
6929 
6930 	/*
6931 	 * Re-enable polling if this interface is running and
6932 	 * the polling is requested.
6933 	 */
6934 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6935 		hn_polling(sc, sc->hn_pollhz);
6936 }
6937 
6938 static void
6939 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6940 {
6941 	const struct rndis_status_msg *msg;
6942 	int ofs;
6943 
6944 	if (dlen < sizeof(*msg)) {
6945 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6946 		return;
6947 	}
6948 	msg = data;
6949 
6950 	switch (msg->rm_status) {
6951 	case RNDIS_STATUS_MEDIA_CONNECT:
6952 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6953 		hn_update_link_status(sc);
6954 		break;
6955 
6956 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6957 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6958 		/* Not really useful; ignore. */
6959 		break;
6960 
6961 	case RNDIS_STATUS_NETWORK_CHANGE:
6962 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6963 		if (dlen < ofs + msg->rm_stbuflen ||
6964 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6965 			if_printf(sc->hn_ifp, "network changed\n");
6966 		} else {
6967 			uint32_t change;
6968 
6969 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6970 			    sizeof(change));
6971 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6972 			    change);
6973 		}
6974 		hn_change_network(sc);
6975 		break;
6976 
6977 	default:
6978 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6979 		    msg->rm_status);
6980 		break;
6981 	}
6982 }
6983 
6984 static int
6985 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6986 {
6987 	const struct rndis_pktinfo *pi = info_data;
6988 	uint32_t mask = 0;
6989 
6990 	while (info_dlen != 0) {
6991 		const void *data;
6992 		uint32_t dlen;
6993 
6994 		if (__predict_false(info_dlen < sizeof(*pi)))
6995 			return (EINVAL);
6996 		if (__predict_false(info_dlen < pi->rm_size))
6997 			return (EINVAL);
6998 		info_dlen -= pi->rm_size;
6999 
7000 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7001 			return (EINVAL);
7002 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7003 			return (EINVAL);
7004 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7005 		data = pi->rm_data;
7006 
7007 		switch (pi->rm_type) {
7008 		case NDIS_PKTINFO_TYPE_VLAN:
7009 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7010 				return (EINVAL);
7011 			info->vlan_info = *((const uint32_t *)data);
7012 			mask |= HN_RXINFO_VLAN;
7013 			break;
7014 
7015 		case NDIS_PKTINFO_TYPE_CSUM:
7016 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7017 				return (EINVAL);
7018 			info->csum_info = *((const uint32_t *)data);
7019 			mask |= HN_RXINFO_CSUM;
7020 			break;
7021 
7022 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7023 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7024 				return (EINVAL);
7025 			info->hash_value = *((const uint32_t *)data);
7026 			mask |= HN_RXINFO_HASHVAL;
7027 			break;
7028 
7029 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7030 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7031 				return (EINVAL);
7032 			info->hash_info = *((const uint32_t *)data);
7033 			mask |= HN_RXINFO_HASHINF;
7034 			break;
7035 
7036 		default:
7037 			goto next;
7038 		}
7039 
7040 		if (mask == HN_RXINFO_ALL) {
7041 			/* All found; done */
7042 			break;
7043 		}
7044 next:
7045 		pi = (const struct rndis_pktinfo *)
7046 		    ((const uint8_t *)pi + pi->rm_size);
7047 	}
7048 
7049 	/*
7050 	 * Final fixup.
7051 	 * - If there is no hash value, invalidate the hash info.
7052 	 */
7053 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7054 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7055 	return (0);
7056 }
7057 
7058 static __inline bool
7059 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7060 {
7061 
7062 	if (off < check_off) {
7063 		if (__predict_true(off + len <= check_off))
7064 			return (false);
7065 	} else if (off > check_off) {
7066 		if (__predict_true(check_off + check_len <= off))
7067 			return (false);
7068 	}
7069 	return (true);
7070 }
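
/*
 * A descriptive note with a hypothetical example: the ranges above are
 * treated as half-open intervals [off, off + len).  For instance,
 * off 0/len 16 against check_off 16/check_len 32 does not overlap
 * (0 + 16 <= 16), whereas off 8/len 16 against the same range does,
 * since 8 + 16 > 16.
 */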
7071 
7072 static void
7073 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7074 {
7075 	const struct rndis_packet_msg *pkt;
7076 	struct hn_rxinfo info;
7077 	int data_off, pktinfo_off, data_len, pktinfo_len;
7078 
7079 	/*
7080 	 * Check length.
7081 	 */
7082 	if (__predict_false(dlen < sizeof(*pkt))) {
7083 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7084 		return;
7085 	}
7086 	pkt = data;
7087 
7088 	if (__predict_false(dlen < pkt->rm_len)) {
7089 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7090 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7091 		return;
7092 	}
7093 	if (__predict_false(pkt->rm_len <
7094 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7095 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7096 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7097 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7098 		    pkt->rm_pktinfolen);
7099 		return;
7100 	}
7101 	if (__predict_false(pkt->rm_datalen == 0)) {
7102 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7103 		return;
7104 	}
7105 
7106 	/*
7107 	 * Check offsets.
7108 	 */
7109 #define IS_OFFSET_INVALID(ofs)			\
7110 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7111 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7112 
7113 	/* XXX Hyper-V does not meet data offset alignment requirement */
7114 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7115 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7116 		    "data offset %u\n", pkt->rm_dataoffset);
7117 		return;
7118 	}
7119 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7120 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7121 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7122 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7123 		return;
7124 	}
7125 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7126 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7127 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7128 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7129 		return;
7130 	}
7131 
7132 #undef IS_OFFSET_INVALID
7133 
7134 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7135 	data_len = pkt->rm_datalen;
7136 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7137 	pktinfo_len = pkt->rm_pktinfolen;
7138 
7139 	/*
7140 	 * Check OOB coverage.
7141 	 */
7142 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7143 		int oob_off, oob_len;
7144 
7145 		if_printf(rxr->hn_ifp, "got oobdata\n");
7146 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7147 		oob_len = pkt->rm_oobdatalen;
7148 
7149 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7150 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7151 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7152 			    pkt->rm_len, oob_off, oob_len);
7153 			return;
7154 		}
7155 
7156 		/*
7157 		 * Check against data.
7158 		 */
7159 		if (hn_rndis_check_overlap(oob_off, oob_len,
7160 		    data_off, data_len)) {
7161 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7162 			    "oob overlaps data, oob abs %d len %d, "
7163 			    "data abs %d len %d\n",
7164 			    oob_off, oob_len, data_off, data_len);
7165 			return;
7166 		}
7167 
7168 		/*
7169 		 * Check against pktinfo.
7170 		 */
7171 		if (pktinfo_len != 0 &&
7172 		    hn_rndis_check_overlap(oob_off, oob_len,
7173 		    pktinfo_off, pktinfo_len)) {
7174 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7175 			    "oob overlaps pktinfo, oob abs %d len %d, "
7176 			    "pktinfo abs %d len %d\n",
7177 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7178 			return;
7179 		}
7180 	}
7181 
7182 	/*
7183 	 * Check per-packet-info coverage and find useful per-packet-info.
7184 	 */
7185 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7186 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7187 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7188 	if (__predict_true(pktinfo_len != 0)) {
7189 		bool overlap;
7190 		int error;
7191 
7192 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7193 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7194 			    "pktinfo overflow, msglen %u, "
7195 			    "pktinfo abs %d len %d\n",
7196 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7197 			return;
7198 		}
7199 
7200 		/*
7201 		 * Check packet info coverage.
7202 		 */
7203 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7204 		    data_off, data_len);
7205 		if (__predict_false(overlap)) {
7206 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7207 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7208 			    "data abs %d len %d\n",
7209 			    pktinfo_off, pktinfo_len, data_off, data_len);
7210 			return;
7211 		}
7212 
7213 		/*
7214 		 * Find useful per-packet-info.
7215 		 */
7216 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7217 		    pktinfo_len, &info);
7218 		if (__predict_false(error)) {
7219 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7220 			    "pktinfo\n");
7221 			return;
7222 		}
7223 	}
7224 
7225 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7226 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7227 		    "data overflow, msglen %u, data abs %d len %d\n",
7228 		    pkt->rm_len, data_off, data_len);
7229 		return;
7230 	}
7231 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7232 }
7233 
7234 static __inline void
7235 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7236 {
7237 	const struct rndis_msghdr *hdr;
7238 
7239 	if (__predict_false(dlen < sizeof(*hdr))) {
7240 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7241 		return;
7242 	}
7243 	hdr = data;
7244 
7245 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7246 		/* Hot data path. */
7247 		hn_rndis_rx_data(rxr, data, dlen);
7248 		/* Done! */
7249 		return;
7250 	}
7251 
7252 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7253 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7254 	else
7255 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7256 }
7257 
7258 static void
7259 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7260 {
7261 	const struct hn_nvs_hdr *hdr;
7262 
7263 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7264 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7265 		return;
7266 	}
7267 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7268 
7269 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7270 		/* Useless; ignore */
7271 		return;
7272 	}
7273 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7274 }
7275 
7276 static void
7277 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7278     const struct vmbus_chanpkt_hdr *pkt)
7279 {
7280 	struct hn_nvs_sendctx *sndc;
7281 
7282 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7283 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7284 	    VMBUS_CHANPKT_DATALEN(pkt));
7285 	/*
7286 	 * NOTE:
7287 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7288 	 * its callback.
7289 	 */
7290 }
7291 
7292 static void
7293 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7294     const struct vmbus_chanpkt_hdr *pkthdr)
7295 {
7296 	const struct vmbus_chanpkt_rxbuf *pkt;
7297 	const struct hn_nvs_hdr *nvs_hdr;
7298 	int count, i, hlen;
7299 
7300 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7301 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7302 		return;
7303 	}
7304 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7305 
7306 	/* Make sure that this is a RNDIS message. */
7307 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7308 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7309 		    nvs_hdr->nvs_type);
7310 		return;
7311 	}
7312 
7313 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7314 	if (__predict_false(hlen < sizeof(*pkt))) {
7315 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7316 		return;
7317 	}
7318 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7319 
7320 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7321 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7322 		    pkt->cp_rxbuf_id);
7323 		return;
7324 	}
7325 
7326 	count = pkt->cp_rxbuf_cnt;
7327 	if (__predict_false(hlen <
7328 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7329 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7330 		return;
7331 	}
7332 
7333 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7334 	for (i = 0; i < count; ++i) {
7335 		int ofs, len;
7336 
7337 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7338 		len = pkt->cp_rxbuf[i].rb_len;
7339 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7340 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7341 			    "ofs %d, len %d\n", i, ofs, len);
7342 			continue;
7343 		}
7344 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7345 	}
7346 
7347 	/*
7348 	 * Ack the consumed RXBUF associated w/ this channel packet,
7349 	 * so that this RXBUF can be recycled by the hypervisor.
7350 	 */
7351 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7352 }
7353 
7354 static void
7355 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7356     uint64_t tid)
7357 {
7358 	struct hn_nvs_rndis_ack ack;
7359 	int retries, error;
7360 
7361 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7362 	ack.nvs_status = HN_NVS_STATUS_OK;
7363 
7364 	retries = 0;
7365 again:
7366 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7367 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7368 	if (__predict_false(error == EAGAIN)) {
7369 		/*
7370 		 * NOTE:
7371 		 * This should _not_ happen in real world, since the
7372 		 * consumption of the TX bufring from the TX path is
7373 		 * controlled.
7374 		 */
7375 		if (rxr->hn_ack_failed == 0)
7376 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7377 		rxr->hn_ack_failed++;
7378 		retries++;
7379 		if (retries < 10) {
7380 			DELAY(100);
7381 			goto again;
7382 		}
7383 		/* RXBUF leaks! */
7384 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7385 	}
7386 }
7387 
7388 static void
7389 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7390 {
7391 	struct hn_rx_ring *rxr = xrxr;
7392 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7393 
7394 	for (;;) {
7395 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7396 		int error, pktlen;
7397 
7398 		pktlen = rxr->hn_pktbuf_len;
7399 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7400 		if (__predict_false(error == ENOBUFS)) {
7401 			void *nbuf;
7402 			int nlen;
7403 
7404 			/*
7405 			 * Expand channel packet buffer.
7406 			 *
7407 			 * XXX
7408 			 * Use M_WAITOK here, since allocation failure
7409 			 * is fatal.
7410 			 */
7411 			nlen = rxr->hn_pktbuf_len * 2;
7412 			while (nlen < pktlen)
7413 				nlen *= 2;
7414 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7415 
7416 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7417 			    rxr->hn_pktbuf_len, nlen);
7418 
7419 			free(rxr->hn_pktbuf, M_DEVBUF);
7420 			rxr->hn_pktbuf = nbuf;
7421 			rxr->hn_pktbuf_len = nlen;
7422 			/* Retry! */
7423 			continue;
7424 		} else if (__predict_false(error == EAGAIN)) {
7425 			/* No more channel packets; done! */
7426 			break;
7427 		}
7428 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7429 
7430 		switch (pkt->cph_type) {
7431 		case VMBUS_CHANPKT_TYPE_COMP:
7432 			hn_nvs_handle_comp(sc, chan, pkt);
7433 			break;
7434 
7435 		case VMBUS_CHANPKT_TYPE_RXBUF:
7436 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7437 			break;
7438 
7439 		case VMBUS_CHANPKT_TYPE_INBAND:
7440 			hn_nvs_handle_notify(sc, pkt);
7441 			break;
7442 
7443 		default:
7444 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7445 			    pkt->cph_type);
7446 			break;
7447 		}
7448 	}
7449 	hn_chan_rollup(rxr, rxr->hn_txr);
7450 }
7451 
7452 static void
7453 hn_sysinit(void *arg __unused)
7454 {
7455 	int i;
7456 
7457 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7458 
7459 #ifdef HN_IFSTART_SUPPORT
7460 	/*
7461 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7462 	 * mainly due to the IFF_DRV_OACTIVE flag.
7463 	 */
7464 	if (hn_xpnt_vf && hn_use_if_start) {
7465 		hn_use_if_start = 0;
7466 		printf("hn: tranparent VF mode, if_transmit will be used, "
7467 		    "instead of if_start\n");
7468 	}
7469 #endif
7470 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7471 		printf("hn: invalid transparent VF attach routing "
7472 		    "wait timeout %d, reset to %d\n",
7473 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7474 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7475 	}
7476 
7477 	/*
7478 	 * Initialize VF map.
7479 	 */
7480 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7481 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7482 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7483 	    M_WAITOK | M_ZERO);
7484 
7485 	/*
7486 	 * Fix the # of TX taskqueues.
7487 	 */
7488 	if (hn_tx_taskq_cnt <= 0)
7489 		hn_tx_taskq_cnt = 1;
7490 	else if (hn_tx_taskq_cnt > mp_ncpus)
7491 		hn_tx_taskq_cnt = mp_ncpus;
7492 
7493 	/*
7494 	 * Fix the TX taskqueue mode.
7495 	 */
7496 	switch (hn_tx_taskq_mode) {
7497 	case HN_TX_TASKQ_M_INDEP:
7498 	case HN_TX_TASKQ_M_GLOBAL:
7499 	case HN_TX_TASKQ_M_EVTTQ:
7500 		break;
7501 	default:
7502 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7503 		break;
7504 	}
7505 
7506 	if (vm_guest != VM_GUEST_HV)
7507 		return;
7508 
7509 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7510 		return;
7511 
7512 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7513 	    M_DEVBUF, M_WAITOK);
7514 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7515 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7516 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7517 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7518 		    "hn tx%d", i);
7519 	}
7520 }
7521 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7522 
7523 static void
7524 hn_sysuninit(void *arg __unused)
7525 {
7526 
7527 	if (hn_tx_taskque != NULL) {
7528 		int i;
7529 
7530 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7531 			taskqueue_free(hn_tx_taskque[i]);
7532 		free(hn_tx_taskque, M_DEVBUF);
7533 	}
7534 
7535 	if (hn_vfmap != NULL)
7536 		free(hn_vfmap, M_DEVBUF);
7537 	rm_destroy(&hn_vfmap_lock);
7538 
7539 	counter_u64_free(hn_udpcs_fixup);
7540 }
7541 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7542