xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 68b2efbd3b74f0d45bbbf07cef5408e455eefbd1)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
389 static int			hn_rxfilter_config(struct hn_softc *);
390 #ifndef RSS
391 static int			hn_rss_reconfig(struct hn_softc *);
392 #endif
393 static void			hn_rss_ind_fixup(struct hn_softc *);
394 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
395 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
396 				    int, const struct hn_rxinfo *);
397 static uint32_t			hn_rss_type_fromndis(uint32_t);
398 static uint32_t			hn_rss_type_tondis(uint32_t);
399 
400 static int			hn_tx_ring_create(struct hn_softc *, int);
401 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
402 static int			hn_create_tx_data(struct hn_softc *, int);
403 static void			hn_fixup_tx_data(struct hn_softc *);
404 static void			hn_destroy_tx_data(struct hn_softc *);
405 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void			hn_txdesc_gc(struct hn_tx_ring *,
407 				    struct hn_txdesc *);
408 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *, struct mbuf **);
410 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 				    struct hn_txdesc *);
412 static void			hn_set_chim_size(struct hn_softc *, int);
413 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
415 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void			hn_resume_tx(struct hn_softc *, int);
417 static void			hn_set_txagg(struct hn_softc *);
418 static void			*hn_try_txagg(struct ifnet *,
419 				    struct hn_tx_ring *, struct hn_txdesc *,
420 				    int);
421 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
423 				    struct hn_softc *, struct vmbus_channel *,
424 				    const void *, int);
425 static int			hn_txpkt_sglist(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_txpkt_chim(struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static int			hn_xmit(struct hn_tx_ring *, int);
430 static void			hn_xmit_taskfunc(void *, int);
431 static void			hn_xmit_txeof(struct hn_tx_ring *);
432 static void			hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int			hn_start_locked(struct hn_tx_ring *, int);
435 static void			hn_start_taskfunc(void *, int);
436 static void			hn_start_txeof(struct hn_tx_ring *);
437 static void			hn_start_txeof_taskfunc(void *, int);
438 #endif
439 
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441     "Hyper-V network interface");
442 
443 /* Trust tcp segements verification on host side. */
444 static int			hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446     &hn_trust_hosttcp, 0,
447     "Trust tcp segement verification on host side, "
448     "when csum info is missing (global setting)");
449 
450 /* Trust udp datagrams verification on host side. */
451 static int			hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453     &hn_trust_hostudp, 0,
454     "Trust udp datagram verification on host side, "
455     "when csum info is missing (global setting)");
456 
457 /* Trust ip packets verification on host side. */
458 static int			hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460     &hn_trust_hostip, 0,
461     "Trust ip packet verification on host side, "
462     "when csum info is missing (global setting)");
463 
464 /*
465  * Offload UDP/IPv4 checksum.
466  */
467 static int			hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 
471 /*
472  * Offload UDP/IPv6 checksum.
473  */
474 static int			hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 
478 /* Stats. */
479 static counter_u64_t		hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481     &hn_udpcs_fixup, "# of UDP checksum fixup");
482 
483 /*
484  * See hn_set_hlen().
485  *
486  * This value is for Azure.  For Hyper-V, set this above
487  * 65536 to disable UDP datagram checksum fixup.
488  */
489 static int			hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
492 
493 /* Limit TSO burst size */
494 static int			hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496     &hn_tso_maxlen, 0, "TSO burst limit");
497 
498 /* Limit chimney send size */
499 static int			hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502 
503 /* Limit the size of packet for direct transmission */
504 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507 
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513     &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516 
517 static int			hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520 
521 #define HN_TX_TASKQ_M_INDEP	0
522 #define HN_TX_TASKQ_M_GLOBAL	1
523 #define HN_TX_TASKQ_M_EVTTQ	2
524 
525 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529 
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int			hn_use_txdesc_bufring = 0;
532 #else
533 static int			hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537 
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int			hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542     &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544 
545 /* # of channels to use */
546 static int			hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548     &hn_chan_cnt, 0,
549     "# of channels to use; each channel has one RX ring and one TX ring");
550 
551 /* # of transmit rings to use */
552 static int			hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554     &hn_tx_ring_cnt, 0, "# of TX rings to use");
555 
556 /* Software TX ring deptch */
557 static int			hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560 
561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int			hn_lro_mbufq_depth = 0;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567 
568 /* Packet transmission aggregation size limit */
569 static int			hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572 
573 /* Packet transmission aggregation count limit */
574 static int			hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 
578 /* VF list */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
580     0, 0, hn_vflist_sysctl, "A", "VF list");
581 
582 /* VF mapping */
583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
584     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585 
586 /* Transparent VF */
587 static int			hn_xpnt_vf = 0;
588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
589     &hn_xpnt_vf, 0, "Transparent VF mod");
590 
591 /* Accurate BPF support for Transparent VF */
592 static int			hn_xpnt_vf_accbpf = 0;
593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
594     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
595 
596 /* Extra wait for transparent VF attach routing; unit seconds. */
597 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
599     &hn_xpnt_vf_attwait, 0,
600     "Extra wait for transparent VF attach routing; unit: seconds");
601 
602 static u_int			hn_cpu_index;	/* next CPU for channel */
603 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
604 
605 static struct rmlock		hn_vfmap_lock;
606 static int			hn_vfmap_size;
607 static struct ifnet		**hn_vfmap;
608 
609 #ifndef RSS
610 static const uint8_t
611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
612 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
613 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
614 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
615 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
616 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
617 };
618 #endif	/* !RSS */
619 
620 static const struct hyperv_guid	hn_guid = {
621 	.hv_guid = {
622 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
623 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
624 };
625 
626 static device_method_t hn_methods[] = {
627 	/* Device interface */
628 	DEVMETHOD(device_probe,		hn_probe),
629 	DEVMETHOD(device_attach,	hn_attach),
630 	DEVMETHOD(device_detach,	hn_detach),
631 	DEVMETHOD(device_shutdown,	hn_shutdown),
632 	DEVMETHOD_END
633 };
634 
635 static driver_t hn_driver = {
636 	"hn",
637 	hn_methods,
638 	sizeof(struct hn_softc)
639 };
640 
641 static devclass_t hn_devclass;
642 
643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
647 #if __FreeBSD_version >= 1100099
648 static void
649 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
650 {
651 	int i;
652 
653 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
654 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
655 }
656 #endif
657 
658 static int
659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661 
662 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
664 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667 
668 static int
669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 	struct hn_nvs_rndis rndis;
672 
673 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size > 0, ("invalid rndis chim txd"));
675 
676 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 	rndis.nvs_chim_idx = txd->chim_index;
679 	rndis.nvs_chim_sz = txd->chim_size;
680 
681 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 	    &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684 
685 static __inline uint32_t
686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 	u_long *bmap = sc->hn_chim_bmap;
690 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691 
692 	for (i = 0; i < bmap_cnt; ++i) {
693 		int idx;
694 
695 		idx = ffsl(~bmap[i]);
696 		if (idx == 0)
697 			continue;
698 
699 		--idx; /* ffsl is 1-based */
700 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 		    ("invalid i %d and idx %d", i, idx));
702 
703 		if (atomic_testandset_long(&bmap[i], idx))
704 			continue;
705 
706 		ret = i * LONG_BIT + idx;
707 		break;
708 	}
709 	return (ret);
710 }
711 
712 static __inline void
713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 	u_long mask;
716 	uint32_t idx;
717 
718 	idx = chim_idx / LONG_BIT;
719 	KASSERT(idx < sc->hn_chim_bmap_cnt,
720 	    ("invalid chimney index 0x%x", chim_idx));
721 
722 	mask = 1UL << (chim_idx % LONG_BIT);
723 	KASSERT(sc->hn_chim_bmap[idx] & mask,
724 	    ("index bitmap 0x%lx, chimney index %u, "
725 	     "bitmap idx %d, bitmask 0x%lx",
726 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727 
728 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730 
731 #if defined(INET6) || defined(INET)
732 
733 #define PULLUP_HDR(m, len)				\
734 do {							\
735 	if (__predict_false((m)->m_len < (len))) {	\
736 		(m) = m_pullup((m), (len));		\
737 		if ((m) == NULL)			\
738 			return (NULL);			\
739 	}						\
740 } while (0)
741 
742 /*
743  * NOTE: If this function failed, the m_head would be freed.
744  */
745 static __inline struct mbuf *
746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 	struct ether_vlan_header *evl;
749 	struct tcphdr *th;
750 	int ehlen;
751 
752 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753 
754 	PULLUP_HDR(m_head, sizeof(*evl));
755 	evl = mtod(m_head, struct ether_vlan_header *);
756 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 	else
759 		ehlen = ETHER_HDR_LEN;
760 	m_head->m_pkthdr.l2hlen = ehlen;
761 
762 #ifdef INET
763 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 		struct ip *ip;
765 		int iphlen;
766 
767 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 		ip = mtodo(m_head, ehlen);
769 		iphlen = ip->ip_hl << 2;
770 		m_head->m_pkthdr.l3hlen = iphlen;
771 
772 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 		th = mtodo(m_head, ehlen + iphlen);
774 
775 		ip->ip_len = 0;
776 		ip->ip_sum = 0;
777 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 	}
780 #endif
781 #if defined(INET6) && defined(INET)
782 	else
783 #endif
784 #ifdef INET6
785 	{
786 		struct ip6_hdr *ip6;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 		ip6 = mtodo(m_head, ehlen);
790 		if (ip6->ip6_nxt != IPPROTO_TCP) {
791 			m_freem(m_head);
792 			return (NULL);
793 		}
794 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795 
796 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 		th = mtodo(m_head, ehlen + sizeof(*ip6));
798 
799 		ip6->ip6_plen = 0;
800 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 	}
802 #endif
803 	return (m_head);
804 }
805 
806 /*
807  * NOTE: If this function failed, the m_head would be freed.
808  */
809 static __inline struct mbuf *
810 hn_set_hlen(struct mbuf *m_head)
811 {
812 	const struct ether_vlan_header *evl;
813 	int ehlen;
814 
815 	PULLUP_HDR(m_head, sizeof(*evl));
816 	evl = mtod(m_head, const struct ether_vlan_header *);
817 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
818 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
819 	else
820 		ehlen = ETHER_HDR_LEN;
821 	m_head->m_pkthdr.l2hlen = ehlen;
822 
823 #ifdef INET
824 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
825 		const struct ip *ip;
826 		int iphlen;
827 
828 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
829 		ip = mtodo(m_head, ehlen);
830 		iphlen = ip->ip_hl << 2;
831 		m_head->m_pkthdr.l3hlen = iphlen;
832 
833 		/*
834 		 * UDP checksum offload does not work in Azure, if the
835 		 * following conditions meet:
836 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
837 		 * - IP_DF is not set in the IP hdr.
838 		 *
839 		 * Fallback to software checksum for these UDP datagrams.
840 		 */
841 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
842 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
843 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
844 			uint16_t off = ehlen + iphlen;
845 
846 			counter_u64_add(hn_udpcs_fixup, 1);
847 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
848 			*(uint16_t *)(m_head->m_data + off +
849                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
850 			    m_head, m_head->m_pkthdr.len, off);
851 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
852 		}
853 	}
854 #endif
855 #if defined(INET6) && defined(INET)
856 	else
857 #endif
858 #ifdef INET6
859 	{
860 		const struct ip6_hdr *ip6;
861 
862 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
863 		ip6 = mtodo(m_head, ehlen);
864 		if (ip6->ip6_nxt != IPPROTO_TCP) {
865 			m_freem(m_head);
866 			return (NULL);
867 		}
868 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
869 	}
870 #endif
871 	return (m_head);
872 }
873 
874 /*
875  * NOTE: If this function failed, the m_head would be freed.
876  */
877 static __inline struct mbuf *
878 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
879 {
880 	const struct tcphdr *th;
881 	int ehlen, iphlen;
882 
883 	*tcpsyn = 0;
884 	ehlen = m_head->m_pkthdr.l2hlen;
885 	iphlen = m_head->m_pkthdr.l3hlen;
886 
887 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
888 	th = mtodo(m_head, ehlen + iphlen);
889 	if (th->th_flags & TH_SYN)
890 		*tcpsyn = 1;
891 	return (m_head);
892 }
893 
894 #undef PULLUP_HDR
895 
896 #endif	/* INET6 || INET */
897 
898 static int
899 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
900 {
901 	int error = 0;
902 
903 	HN_LOCK_ASSERT(sc);
904 
905 	if (sc->hn_rx_filter != filter) {
906 		error = hn_rndis_set_rxfilter(sc, filter);
907 		if (!error)
908 			sc->hn_rx_filter = filter;
909 	}
910 	return (error);
911 }
912 
913 static int
914 hn_rxfilter_config(struct hn_softc *sc)
915 {
916 	struct ifnet *ifp = sc->hn_ifp;
917 	uint32_t filter;
918 
919 	HN_LOCK_ASSERT(sc);
920 
921 	/*
922 	 * If the non-transparent mode VF is activated, we don't know how
923 	 * its RX filter is configured, so stick the synthetic device in
924 	 * the promiscous mode.
925 	 */
926 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
927 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
928 	} else {
929 		filter = NDIS_PACKET_TYPE_DIRECTED;
930 		if (ifp->if_flags & IFF_BROADCAST)
931 			filter |= NDIS_PACKET_TYPE_BROADCAST;
932 		/* TODO: support multicast list */
933 		if ((ifp->if_flags & IFF_ALLMULTI) ||
934 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
935 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
936 	}
937 	return (hn_set_rxfilter(sc, filter));
938 }
939 
940 static void
941 hn_set_txagg(struct hn_softc *sc)
942 {
943 	uint32_t size, pkts;
944 	int i;
945 
946 	/*
947 	 * Setup aggregation size.
948 	 */
949 	if (sc->hn_agg_size < 0)
950 		size = UINT32_MAX;
951 	else
952 		size = sc->hn_agg_size;
953 
954 	if (sc->hn_rndis_agg_size < size)
955 		size = sc->hn_rndis_agg_size;
956 
957 	/* NOTE: We only aggregate packets using chimney sending buffers. */
958 	if (size > (uint32_t)sc->hn_chim_szmax)
959 		size = sc->hn_chim_szmax;
960 
961 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
962 		/* Disable */
963 		size = 0;
964 		pkts = 0;
965 		goto done;
966 	}
967 
968 	/* NOTE: Type of the per TX ring setting is 'int'. */
969 	if (size > INT_MAX)
970 		size = INT_MAX;
971 
972 	/*
973 	 * Setup aggregation packet count.
974 	 */
975 	if (sc->hn_agg_pkts < 0)
976 		pkts = UINT32_MAX;
977 	else
978 		pkts = sc->hn_agg_pkts;
979 
980 	if (sc->hn_rndis_agg_pkts < pkts)
981 		pkts = sc->hn_rndis_agg_pkts;
982 
983 	if (pkts <= 1) {
984 		/* Disable */
985 		size = 0;
986 		pkts = 0;
987 		goto done;
988 	}
989 
990 	/* NOTE: Type of the per TX ring setting is 'short'. */
991 	if (pkts > SHRT_MAX)
992 		pkts = SHRT_MAX;
993 
994 done:
995 	/* NOTE: Type of the per TX ring setting is 'short'. */
996 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
997 		/* Disable */
998 		size = 0;
999 		pkts = 0;
1000 	}
1001 
1002 	if (bootverbose) {
1003 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1004 		    size, pkts, sc->hn_rndis_agg_align);
1005 	}
1006 
1007 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1008 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1009 
1010 		mtx_lock(&txr->hn_tx_lock);
1011 		txr->hn_agg_szmax = size;
1012 		txr->hn_agg_pktmax = pkts;
1013 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1014 		mtx_unlock(&txr->hn_tx_lock);
1015 	}
1016 }
1017 
1018 static int
1019 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1020 {
1021 
1022 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1023 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1024 		return txr->hn_txdesc_cnt;
1025 	return hn_tx_swq_depth;
1026 }
1027 
1028 #ifndef RSS
1029 static int
1030 hn_rss_reconfig(struct hn_softc *sc)
1031 {
1032 	int error;
1033 
1034 	HN_LOCK_ASSERT(sc);
1035 
1036 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1037 		return (ENXIO);
1038 
1039 	/*
1040 	 * Disable RSS first.
1041 	 *
1042 	 * NOTE:
1043 	 * Direct reconfiguration by setting the UNCHG flags does
1044 	 * _not_ work properly.
1045 	 */
1046 	if (bootverbose)
1047 		if_printf(sc->hn_ifp, "disable RSS\n");
1048 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1049 	if (error) {
1050 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1051 		return (error);
1052 	}
1053 
1054 	/*
1055 	 * Reenable the RSS w/ the updated RSS key or indirect
1056 	 * table.
1057 	 */
1058 	if (bootverbose)
1059 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1060 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1061 	if (error) {
1062 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1063 		return (error);
1064 	}
1065 	return (0);
1066 }
1067 #endif	/* !RSS */
1068 
1069 static void
1070 hn_rss_ind_fixup(struct hn_softc *sc)
1071 {
1072 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1073 	int i, nchan;
1074 
1075 	nchan = sc->hn_rx_ring_inuse;
1076 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1077 
1078 	/*
1079 	 * Check indirect table to make sure that all channels in it
1080 	 * can be used.
1081 	 */
1082 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1083 		if (rss->rss_ind[i] >= nchan) {
1084 			if_printf(sc->hn_ifp,
1085 			    "RSS indirect table %d fixup: %u -> %d\n",
1086 			    i, rss->rss_ind[i], nchan - 1);
1087 			rss->rss_ind[i] = nchan - 1;
1088 		}
1089 	}
1090 }
1091 
1092 static int
1093 hn_ifmedia_upd(struct ifnet *ifp __unused)
1094 {
1095 
1096 	return EOPNOTSUPP;
1097 }
1098 
1099 static void
1100 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1101 {
1102 	struct hn_softc *sc = ifp->if_softc;
1103 
1104 	ifmr->ifm_status = IFM_AVALID;
1105 	ifmr->ifm_active = IFM_ETHER;
1106 
1107 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1108 		ifmr->ifm_active |= IFM_NONE;
1109 		return;
1110 	}
1111 	ifmr->ifm_status |= IFM_ACTIVE;
1112 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1113 }
1114 
1115 static void
1116 hn_rxvf_set_task(void *xarg, int pending __unused)
1117 {
1118 	struct hn_rxvf_setarg *arg = xarg;
1119 
1120 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1121 }
1122 
1123 static void
1124 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1125 {
1126 	struct hn_rx_ring *rxr;
1127 	struct hn_rxvf_setarg arg;
1128 	struct task task;
1129 	int i;
1130 
1131 	HN_LOCK_ASSERT(sc);
1132 
1133 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1134 
1135 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1136 		rxr = &sc->hn_rx_ring[i];
1137 
1138 		if (i < sc->hn_rx_ring_inuse) {
1139 			arg.rxr = rxr;
1140 			arg.vf_ifp = vf_ifp;
1141 			vmbus_chan_run_task(rxr->hn_chan, &task);
1142 		} else {
1143 			rxr->hn_rxvf_ifp = vf_ifp;
1144 		}
1145 	}
1146 }
1147 
1148 static bool
1149 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1150 {
1151 	const struct ifnet *hn_ifp;
1152 
1153 	hn_ifp = sc->hn_ifp;
1154 
1155 	if (ifp == hn_ifp)
1156 		return (false);
1157 
1158 	if (ifp->if_alloctype != IFT_ETHER)
1159 		return (false);
1160 
1161 	/* Ignore lagg/vlan interfaces */
1162 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1163 	    strcmp(ifp->if_dname, "vlan") == 0)
1164 		return (false);
1165 
1166 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1167 		return (false);
1168 
1169 	return (true);
1170 }
1171 
1172 static void
1173 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1174 {
1175 	struct ifnet *hn_ifp;
1176 
1177 	HN_LOCK(sc);
1178 
1179 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1180 		goto out;
1181 
1182 	if (!hn_ismyvf(sc, ifp))
1183 		goto out;
1184 	hn_ifp = sc->hn_ifp;
1185 
1186 	if (rxvf) {
1187 		if (sc->hn_flags & HN_FLAG_RXVF)
1188 			goto out;
1189 
1190 		sc->hn_flags |= HN_FLAG_RXVF;
1191 		hn_rxfilter_config(sc);
1192 	} else {
1193 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1194 			goto out;
1195 
1196 		sc->hn_flags &= ~HN_FLAG_RXVF;
1197 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1198 			hn_rxfilter_config(sc);
1199 		else
1200 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1201 	}
1202 
1203 	hn_nvs_set_datapath(sc,
1204 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1205 
1206 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1207 
1208 	if (rxvf) {
1209 		hn_vf_rss_fixup(sc, true);
1210 		hn_suspend_mgmt(sc);
1211 		sc->hn_link_flags &=
1212 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1213 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1214 	} else {
1215 		hn_vf_rss_restore(sc);
1216 		hn_resume_mgmt(sc);
1217 	}
1218 
1219 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1220 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1221 
1222 	if (bootverbose) {
1223 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1224 		    rxvf ? "to" : "from", ifp->if_xname);
1225 	}
1226 out:
1227 	HN_UNLOCK(sc);
1228 }
1229 
1230 static void
1231 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1232 {
1233 
1234 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1235 		return;
1236 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1237 }
1238 
1239 static void
1240 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1241 {
1242 
1243 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1244 }
1245 
1246 static int
1247 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1248 {
1249 	struct ifnet *ifp, *vf_ifp;
1250 	uint64_t tmp;
1251 	int error;
1252 
1253 	HN_LOCK_ASSERT(sc);
1254 	ifp = sc->hn_ifp;
1255 	vf_ifp = sc->hn_vf_ifp;
1256 
1257 	/*
1258 	 * Fix up requested capabilities w/ supported capabilities,
1259 	 * since the supported capabilities could have been changed.
1260 	 */
1261 	ifr->ifr_reqcap &= ifp->if_capabilities;
1262 	/* Pass SIOCSIFCAP to VF. */
1263 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1264 
1265 	/*
1266 	 * NOTE:
1267 	 * The error will be propagated to the callers, however, it
1268 	 * is _not_ useful here.
1269 	 */
1270 
1271 	/*
1272 	 * Merge VF's enabled capabilities.
1273 	 */
1274 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1275 
1276 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1277 	if (ifp->if_capenable & IFCAP_TXCSUM)
1278 		ifp->if_hwassist |= tmp;
1279 	else
1280 		ifp->if_hwassist &= ~tmp;
1281 
1282 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1283 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1284 		ifp->if_hwassist |= tmp;
1285 	else
1286 		ifp->if_hwassist &= ~tmp;
1287 
1288 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1289 	if (ifp->if_capenable & IFCAP_TSO4)
1290 		ifp->if_hwassist |= tmp;
1291 	else
1292 		ifp->if_hwassist &= ~tmp;
1293 
1294 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1295 	if (ifp->if_capenable & IFCAP_TSO6)
1296 		ifp->if_hwassist |= tmp;
1297 	else
1298 		ifp->if_hwassist &= ~tmp;
1299 
1300 	return (error);
1301 }
1302 
1303 static int
1304 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1305 {
1306 	struct ifnet *vf_ifp;
1307 	struct ifreq ifr;
1308 
1309 	HN_LOCK_ASSERT(sc);
1310 	vf_ifp = sc->hn_vf_ifp;
1311 
1312 	memset(&ifr, 0, sizeof(ifr));
1313 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1314 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1315 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1316 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1317 }
1318 
1319 static void
1320 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1321 {
1322 	struct ifnet *ifp = sc->hn_ifp;
1323 	int allmulti = 0;
1324 
1325 	HN_LOCK_ASSERT(sc);
1326 
1327 	/* XXX vlan(4) style mcast addr maintenance */
1328 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1329 		allmulti = IFF_ALLMULTI;
1330 
1331 	/* Always set the VF's if_flags */
1332 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1333 }
1334 
1335 static void
1336 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1337 {
1338 	struct rm_priotracker pt;
1339 	struct ifnet *hn_ifp = NULL;
1340 	struct mbuf *mn;
1341 
1342 	/*
1343 	 * XXX racy, if hn(4) ever detached.
1344 	 */
1345 	rm_rlock(&hn_vfmap_lock, &pt);
1346 	if (vf_ifp->if_index < hn_vfmap_size)
1347 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1348 	rm_runlock(&hn_vfmap_lock, &pt);
1349 
1350 	if (hn_ifp != NULL) {
1351 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1352 			/*
1353 			 * Allow tapping on the VF.
1354 			 */
1355 			ETHER_BPF_MTAP(vf_ifp, mn);
1356 
1357 			/*
1358 			 * Update VF stats.
1359 			 */
1360 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1361 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1362 				    mn->m_pkthdr.len);
1363 			}
1364 			/*
1365 			 * XXX IFCOUNTER_IMCAST
1366 			 * This stat updating is kinda invasive, since it
1367 			 * requires two checks on the mbuf: the length check
1368 			 * and the ethernet header check.  As of this write,
1369 			 * all multicast packets go directly to hn(4), which
1370 			 * makes imcast stat updating in the VF a try in vian.
1371 			 */
1372 
1373 			/*
1374 			 * Fix up rcvif and increase hn(4)'s ipackets.
1375 			 */
1376 			mn->m_pkthdr.rcvif = hn_ifp;
1377 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1378 		}
1379 		/*
1380 		 * Go through hn(4)'s if_input.
1381 		 */
1382 		hn_ifp->if_input(hn_ifp, m);
1383 	} else {
1384 		/*
1385 		 * In the middle of the transition; free this
1386 		 * mbuf chain.
1387 		 */
1388 		while (m != NULL) {
1389 			mn = m->m_nextpkt;
1390 			m->m_nextpkt = NULL;
1391 			m_freem(m);
1392 			m = mn;
1393 		}
1394 	}
1395 }
1396 
1397 static void
1398 hn_mtu_change_fixup(struct hn_softc *sc)
1399 {
1400 	struct ifnet *ifp;
1401 
1402 	HN_LOCK_ASSERT(sc);
1403 	ifp = sc->hn_ifp;
1404 
1405 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1406 #if __FreeBSD_version >= 1100099
1407 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1408 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1409 #endif
1410 }
1411 
1412 static uint32_t
1413 hn_rss_type_fromndis(uint32_t rss_hash)
1414 {
1415 	uint32_t types = 0;
1416 
1417 	if (rss_hash & NDIS_HASH_IPV4)
1418 		types |= RSS_TYPE_IPV4;
1419 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1420 		types |= RSS_TYPE_TCP_IPV4;
1421 	if (rss_hash & NDIS_HASH_IPV6)
1422 		types |= RSS_TYPE_IPV6;
1423 	if (rss_hash & NDIS_HASH_IPV6_EX)
1424 		types |= RSS_TYPE_IPV6_EX;
1425 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1426 		types |= RSS_TYPE_TCP_IPV6;
1427 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1428 		types |= RSS_TYPE_TCP_IPV6_EX;
1429 	return (types);
1430 }
1431 
1432 static uint32_t
1433 hn_rss_type_tondis(uint32_t types)
1434 {
1435 	uint32_t rss_hash = 0;
1436 
1437 	KASSERT((types &
1438 	(RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1439 	("UDP4, UDP6 and UDP6EX are not supported"));
1440 
1441 	if (types & RSS_TYPE_IPV4)
1442 		rss_hash |= NDIS_HASH_IPV4;
1443 	if (types & RSS_TYPE_TCP_IPV4)
1444 		rss_hash |= NDIS_HASH_TCP_IPV4;
1445 	if (types & RSS_TYPE_IPV6)
1446 		rss_hash |= NDIS_HASH_IPV6;
1447 	if (types & RSS_TYPE_IPV6_EX)
1448 		rss_hash |= NDIS_HASH_IPV6_EX;
1449 	if (types & RSS_TYPE_TCP_IPV6)
1450 		rss_hash |= NDIS_HASH_TCP_IPV6;
1451 	if (types & RSS_TYPE_TCP_IPV6_EX)
1452 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1453 	return (rss_hash);
1454 }
1455 
1456 static void
1457 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1458 {
1459 	int i;
1460 
1461 	HN_LOCK_ASSERT(sc);
1462 
1463 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1464 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1465 }
1466 
1467 static void
1468 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1469 {
1470 	struct ifnet *ifp, *vf_ifp;
1471 	struct ifrsshash ifrh;
1472 	struct ifrsskey ifrk;
1473 	int error;
1474 	uint32_t my_types, diff_types, mbuf_types = 0;
1475 
1476 	HN_LOCK_ASSERT(sc);
1477 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1478 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1479 
1480 	if (sc->hn_rx_ring_inuse == 1) {
1481 		/* No RSS on synthetic parts; done. */
1482 		return;
1483 	}
1484 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1485 		/* Synthetic parts do not support Toeplitz; done. */
1486 		return;
1487 	}
1488 
1489 	ifp = sc->hn_ifp;
1490 	vf_ifp = sc->hn_vf_ifp;
1491 
1492 	/*
1493 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1494 	 * supported.
1495 	 */
1496 	memset(&ifrk, 0, sizeof(ifrk));
1497 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1498 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1499 	if (error) {
1500 		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
1501 		    vf_ifp->if_xname, error);
1502 		goto done;
1503 	}
1504 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1505 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1506 		    vf_ifp->if_xname, ifrk.ifrk_func);
1507 		goto done;
1508 	}
1509 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1510 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1511 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1512 		goto done;
1513 	}
1514 
1515 	/*
1516 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1517 	 */
1518 	memset(&ifrh, 0, sizeof(ifrh));
1519 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1520 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1521 	if (error) {
1522 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1523 		    vf_ifp->if_xname, error);
1524 		goto done;
1525 	}
1526 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1527 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1528 		    vf_ifp->if_xname, ifrh.ifrh_func);
1529 		goto done;
1530 	}
1531 
1532 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1533 	if ((ifrh.ifrh_types & my_types) == 0) {
1534 		/* This disables RSS; ignore it then */
1535 		if_printf(ifp, "%s intersection of RSS types failed.  "
1536 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1537 		    ifrh.ifrh_types, my_types);
1538 		goto done;
1539 	}
1540 
1541 	diff_types = my_types ^ ifrh.ifrh_types;
1542 	my_types &= ifrh.ifrh_types;
1543 	mbuf_types = my_types;
1544 
1545 	/*
1546 	 * Detect RSS hash value/type confliction.
1547 	 *
1548 	 * NOTE:
1549 	 * We don't disable the hash type, but stop delivery the hash
1550 	 * value/type through mbufs on RX path.
1551 	 */
1552 	if ((my_types & RSS_TYPE_IPV4) &&
1553 	    (diff_types & ifrh.ifrh_types &
1554 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1555 		/* Conflict; disable IPV4 hash type/value delivery. */
1556 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1557 		mbuf_types &= ~RSS_TYPE_IPV4;
1558 	}
1559 	if ((my_types & RSS_TYPE_IPV6) &&
1560 	    (diff_types & ifrh.ifrh_types &
1561 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1562 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1563 	      RSS_TYPE_IPV6_EX))) {
1564 		/* Conflict; disable IPV6 hash type/value delivery. */
1565 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1566 		mbuf_types &= ~RSS_TYPE_IPV6;
1567 	}
1568 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1569 	    (diff_types & ifrh.ifrh_types &
1570 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1571 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1572 	      RSS_TYPE_IPV6))) {
1573 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1574 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1575 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1576 	}
1577 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1578 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1579 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1580 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1581 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1582 	}
1583 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1584 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1585 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1586 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1587 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1588 	}
1589 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1590 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1591 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1592 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1593 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1594 	}
1595 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1596 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1597 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1598 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1599 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1600 	}
1601 
1602 	/*
1603 	 * Indirect table does not matter.
1604 	 */
1605 
1606 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1607 	    hn_rss_type_tondis(my_types);
1608 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1609 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1610 
1611 	if (reconf) {
1612 		error = hn_rss_reconfig(sc);
1613 		if (error) {
1614 			/* XXX roll-back? */
1615 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1616 			/* XXX keep going. */
1617 		}
1618 	}
1619 done:
1620 	/* Hash deliverability for mbufs. */
1621 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1622 }
1623 
1624 static void
1625 hn_vf_rss_restore(struct hn_softc *sc)
1626 {
1627 
1628 	HN_LOCK_ASSERT(sc);
1629 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1630 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1631 
1632 	if (sc->hn_rx_ring_inuse == 1)
1633 		goto done;
1634 
1635 	/*
1636 	 * Restore hash types.  Key does _not_ matter.
1637 	 */
1638 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1639 		int error;
1640 
1641 		sc->hn_rss_hash = sc->hn_rss_hcap;
1642 		error = hn_rss_reconfig(sc);
1643 		if (error) {
1644 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1645 			    error);
1646 			/* XXX keep going. */
1647 		}
1648 	}
1649 done:
1650 	/* Hash deliverability for mbufs. */
1651 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1652 }
1653 
1654 static void
1655 hn_xpnt_vf_setready(struct hn_softc *sc)
1656 {
1657 	struct ifnet *ifp, *vf_ifp;
1658 	struct ifreq ifr;
1659 
1660 	HN_LOCK_ASSERT(sc);
1661 	ifp = sc->hn_ifp;
1662 	vf_ifp = sc->hn_vf_ifp;
1663 
1664 	/*
1665 	 * Mark the VF ready.
1666 	 */
1667 	sc->hn_vf_rdytick = 0;
1668 
1669 	/*
1670 	 * Save information for restoration.
1671 	 */
1672 	sc->hn_saved_caps = ifp->if_capabilities;
1673 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1674 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1675 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1676 
1677 	/*
1678 	 * Intersect supported/enabled capabilities.
1679 	 *
1680 	 * NOTE:
1681 	 * if_hwassist is not changed here.
1682 	 */
1683 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1684 	ifp->if_capenable &= ifp->if_capabilities;
1685 
1686 	/*
1687 	 * Fix TSO settings.
1688 	 */
1689 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1690 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1691 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1692 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1693 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1694 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1695 
1696 	/*
1697 	 * Change VF's enabled capabilities.
1698 	 */
1699 	memset(&ifr, 0, sizeof(ifr));
1700 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1701 	ifr.ifr_reqcap = ifp->if_capenable;
1702 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1703 
1704 	if (ifp->if_mtu != ETHERMTU) {
1705 		int error;
1706 
1707 		/*
1708 		 * Change VF's MTU.
1709 		 */
1710 		memset(&ifr, 0, sizeof(ifr));
1711 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1712 		ifr.ifr_mtu = ifp->if_mtu;
1713 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1714 		if (error) {
1715 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1716 			    vf_ifp->if_xname, ifp->if_mtu);
1717 			if (ifp->if_mtu > ETHERMTU) {
1718 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1719 
1720 				/*
1721 				 * XXX
1722 				 * No need to adjust the synthetic parts' MTU;
1723 				 * failure of the adjustment will cause us
1724 				 * infinite headache.
1725 				 */
1726 				ifp->if_mtu = ETHERMTU;
1727 				hn_mtu_change_fixup(sc);
1728 			}
1729 		}
1730 	}
1731 }
1732 
1733 static bool
1734 hn_xpnt_vf_isready(struct hn_softc *sc)
1735 {
1736 
1737 	HN_LOCK_ASSERT(sc);
1738 
1739 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1740 		return (false);
1741 
1742 	if (sc->hn_vf_rdytick == 0)
1743 		return (true);
1744 
1745 	if (sc->hn_vf_rdytick > ticks)
1746 		return (false);
1747 
1748 	/* Mark VF as ready. */
1749 	hn_xpnt_vf_setready(sc);
1750 	return (true);
1751 }
1752 
1753 static void
1754 hn_xpnt_vf_setenable(struct hn_softc *sc)
1755 {
1756 	int i;
1757 
1758 	HN_LOCK_ASSERT(sc);
1759 
1760 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1761 	rm_wlock(&sc->hn_vf_lock);
1762 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1763 	rm_wunlock(&sc->hn_vf_lock);
1764 
1765 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1766 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1767 }
1768 
1769 static void
1770 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1771 {
1772 	int i;
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1777 	rm_wlock(&sc->hn_vf_lock);
1778 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1779 	if (clear_vf)
1780 		sc->hn_vf_ifp = NULL;
1781 	rm_wunlock(&sc->hn_vf_lock);
1782 
1783 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1784 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1785 }
1786 
1787 static void
1788 hn_xpnt_vf_init(struct hn_softc *sc)
1789 {
1790 	int error;
1791 
1792 	HN_LOCK_ASSERT(sc);
1793 
1794 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1795 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1796 
1797 	if (bootverbose) {
1798 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1799 		    sc->hn_vf_ifp->if_xname);
1800 	}
1801 
1802 	/*
1803 	 * Bring the VF up.
1804 	 */
1805 	hn_xpnt_vf_saveifflags(sc);
1806 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1807 	error = hn_xpnt_vf_iocsetflags(sc);
1808 	if (error) {
1809 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1810 		    sc->hn_vf_ifp->if_xname, error);
1811 		return;
1812 	}
1813 
1814 	/*
1815 	 * NOTE:
1816 	 * Datapath setting must happen _after_ bringing the VF up.
1817 	 */
1818 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1819 
1820 	/*
1821 	 * NOTE:
1822 	 * Fixup RSS related bits _after_ the VF is brought up, since
1823 	 * many VFs generate RSS key during it's initialization.
1824 	 */
1825 	hn_vf_rss_fixup(sc, true);
1826 
1827 	/* Mark transparent mode VF as enabled. */
1828 	hn_xpnt_vf_setenable(sc);
1829 }
1830 
1831 static void
1832 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1833 {
1834 	struct hn_softc *sc = xsc;
1835 
1836 	HN_LOCK(sc);
1837 
1838 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1839 		goto done;
1840 	if (sc->hn_vf_ifp == NULL)
1841 		goto done;
1842 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1843 		goto done;
1844 
1845 	if (sc->hn_vf_rdytick != 0) {
1846 		/* Mark VF as ready. */
1847 		hn_xpnt_vf_setready(sc);
1848 	}
1849 
1850 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1851 		/*
1852 		 * Delayed VF initialization.
1853 		 */
1854 		if (bootverbose) {
1855 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1856 			    sc->hn_vf_ifp->if_xname);
1857 		}
1858 		hn_xpnt_vf_init(sc);
1859 	}
1860 done:
1861 	HN_UNLOCK(sc);
1862 }
1863 
1864 static void
1865 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1866 {
1867 	struct hn_softc *sc = xsc;
1868 
1869 	HN_LOCK(sc);
1870 
1871 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1872 		goto done;
1873 
1874 	if (!hn_ismyvf(sc, ifp))
1875 		goto done;
1876 
1877 	if (sc->hn_vf_ifp != NULL) {
1878 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1879 		    sc->hn_vf_ifp->if_xname);
1880 		goto done;
1881 	}
1882 
1883 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1884 		/*
1885 		 * ifnet.if_start is _not_ supported by transparent
1886 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1887 		 */
1888 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1889 		    "in transparent VF mode.\n", ifp->if_xname);
1890 		goto done;
1891 	}
1892 
1893 	rm_wlock(&hn_vfmap_lock);
1894 
1895 	if (ifp->if_index >= hn_vfmap_size) {
1896 		struct ifnet **newmap;
1897 		int newsize;
1898 
1899 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1900 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1901 		    M_WAITOK | M_ZERO);
1902 
1903 		memcpy(newmap, hn_vfmap,
1904 		    sizeof(struct ifnet *) * hn_vfmap_size);
1905 		free(hn_vfmap, M_DEVBUF);
1906 		hn_vfmap = newmap;
1907 		hn_vfmap_size = newsize;
1908 	}
1909 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1910 	    ("%s: ifindex %d was mapped to %s",
1911 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1912 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1913 
1914 	rm_wunlock(&hn_vfmap_lock);
1915 
1916 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1917 	rm_wlock(&sc->hn_vf_lock);
1918 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1919 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1920 	sc->hn_vf_ifp = ifp;
1921 	rm_wunlock(&sc->hn_vf_lock);
1922 
1923 	if (hn_xpnt_vf) {
1924 		int wait_ticks;
1925 
1926 		/*
1927 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1928 		 * Save vf_ifp's current if_input for later restoration.
1929 		 */
1930 		sc->hn_vf_input = ifp->if_input;
1931 		ifp->if_input = hn_xpnt_vf_input;
1932 
1933 		/*
1934 		 * Stop link status management; use the VF's.
1935 		 */
1936 		hn_suspend_mgmt(sc);
1937 
1938 		/*
1939 		 * Give VF sometime to complete its attach routing.
1940 		 */
1941 		wait_ticks = hn_xpnt_vf_attwait * hz;
1942 		sc->hn_vf_rdytick = ticks + wait_ticks;
1943 
1944 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1945 		    wait_ticks);
1946 	}
1947 done:
1948 	HN_UNLOCK(sc);
1949 }
1950 
1951 static void
1952 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1953 {
1954 	struct hn_softc *sc = xsc;
1955 
1956 	HN_LOCK(sc);
1957 
1958 	if (sc->hn_vf_ifp == NULL)
1959 		goto done;
1960 
1961 	if (!hn_ismyvf(sc, ifp))
1962 		goto done;
1963 
1964 	if (hn_xpnt_vf) {
1965 		/*
1966 		 * Make sure that the delayed initialization is not running.
1967 		 *
1968 		 * NOTE:
1969 		 * - This lock _must_ be released, since the hn_vf_init task
1970 		 *   will try holding this lock.
1971 		 * - It is safe to release this lock here, since the
1972 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1973 		 *
1974 		 * XXX racy, if hn(4) ever detached.
1975 		 */
1976 		HN_UNLOCK(sc);
1977 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1978 		HN_LOCK(sc);
1979 
1980 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1981 		    sc->hn_ifp->if_xname));
1982 		ifp->if_input = sc->hn_vf_input;
1983 		sc->hn_vf_input = NULL;
1984 
1985 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1986 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1987 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1988 
1989 		if (sc->hn_vf_rdytick == 0) {
1990 			/*
1991 			 * The VF was ready; restore some settings.
1992 			 */
1993 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
1994 			/*
1995 			 * NOTE:
1996 			 * There is _no_ need to fixup if_capenable and
1997 			 * if_hwassist, since the if_capabilities before
1998 			 * restoration was an intersection of the VF's
1999 			 * if_capabilites and the synthetic device's
2000 			 * if_capabilites.
2001 			 */
2002 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2003 			sc->hn_ifp->if_hw_tsomaxsegcount =
2004 			    sc->hn_saved_tsosegcnt;
2005 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2006 		}
2007 
2008 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2009 			/*
2010 			 * Restore RSS settings.
2011 			 */
2012 			hn_vf_rss_restore(sc);
2013 
2014 			/*
2015 			 * Resume link status management, which was suspended
2016 			 * by hn_ifnet_attevent().
2017 			 */
2018 			hn_resume_mgmt(sc);
2019 		}
2020 	}
2021 
2022 	/* Mark transparent mode VF as disabled. */
2023 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2024 
2025 	rm_wlock(&hn_vfmap_lock);
2026 
2027 	KASSERT(ifp->if_index < hn_vfmap_size,
2028 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2029 	if (hn_vfmap[ifp->if_index] != NULL) {
2030 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2031 		    ("%s: ifindex %d was mapped to %s",
2032 		     ifp->if_xname, ifp->if_index,
2033 		     hn_vfmap[ifp->if_index]->if_xname));
2034 		hn_vfmap[ifp->if_index] = NULL;
2035 	}
2036 
2037 	rm_wunlock(&hn_vfmap_lock);
2038 done:
2039 	HN_UNLOCK(sc);
2040 }
2041 
2042 static void
2043 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2044 {
2045 	struct hn_softc *sc = xsc;
2046 
2047 	if (sc->hn_vf_ifp == ifp)
2048 		if_link_state_change(sc->hn_ifp, link_state);
2049 }
2050 
2051 static int
2052 hn_probe(device_t dev)
2053 {
2054 
2055 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2056 		device_set_desc(dev, "Hyper-V Network Interface");
2057 		return BUS_PROBE_DEFAULT;
2058 	}
2059 	return ENXIO;
2060 }
2061 
2062 static int
2063 hn_attach(device_t dev)
2064 {
2065 	struct hn_softc *sc = device_get_softc(dev);
2066 	struct sysctl_oid_list *child;
2067 	struct sysctl_ctx_list *ctx;
2068 	uint8_t eaddr[ETHER_ADDR_LEN];
2069 	struct ifnet *ifp = NULL;
2070 	int error, ring_cnt, tx_ring_cnt;
2071 	uint32_t mtu;
2072 
2073 	sc->hn_dev = dev;
2074 	sc->hn_prichan = vmbus_get_channel(dev);
2075 	HN_LOCK_INIT(sc);
2076 	rm_init(&sc->hn_vf_lock, "hnvf");
2077 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2078 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2079 
2080 	/*
2081 	 * Initialize these tunables once.
2082 	 */
2083 	sc->hn_agg_size = hn_tx_agg_size;
2084 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2085 
2086 	/*
2087 	 * Setup taskqueue for transmission.
2088 	 */
2089 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2090 		int i;
2091 
2092 		sc->hn_tx_taskqs =
2093 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2094 		    M_DEVBUF, M_WAITOK);
2095 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2096 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2097 			    M_WAITOK, taskqueue_thread_enqueue,
2098 			    &sc->hn_tx_taskqs[i]);
2099 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2100 			    "%s tx%d", device_get_nameunit(dev), i);
2101 		}
2102 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2103 		sc->hn_tx_taskqs = hn_tx_taskque;
2104 	}
2105 
2106 	/*
2107 	 * Setup taskqueue for mangement tasks, e.g. link status.
2108 	 */
2109 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2110 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2111 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2112 	    device_get_nameunit(dev));
2113 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2114 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2115 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2116 	    hn_netchg_status_taskfunc, sc);
2117 
2118 	if (hn_xpnt_vf) {
2119 		/*
2120 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2121 		 */
2122 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2123 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2124 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2125 		    device_get_nameunit(dev));
2126 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2127 		    hn_xpnt_vf_init_taskfunc, sc);
2128 	}
2129 
2130 	/*
2131 	 * Allocate ifnet and setup its name earlier, so that if_printf
2132 	 * can be used by functions, which will be called after
2133 	 * ether_ifattach().
2134 	 */
2135 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2136 	ifp->if_softc = sc;
2137 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2138 
2139 	/*
2140 	 * Initialize ifmedia earlier so that it can be unconditionally
2141 	 * destroyed, if error happened later on.
2142 	 */
2143 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2144 
2145 	/*
2146 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2147 	 * to use (tx_ring_cnt).
2148 	 *
2149 	 * NOTE:
2150 	 * The # of RX rings to use is same as the # of channels to use.
2151 	 */
2152 	ring_cnt = hn_chan_cnt;
2153 	if (ring_cnt <= 0) {
2154 		/* Default */
2155 		ring_cnt = mp_ncpus;
2156 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2157 			ring_cnt = HN_RING_CNT_DEF_MAX;
2158 	} else if (ring_cnt > mp_ncpus) {
2159 		ring_cnt = mp_ncpus;
2160 	}
2161 #ifdef RSS
2162 	if (ring_cnt > rss_getnumbuckets())
2163 		ring_cnt = rss_getnumbuckets();
2164 #endif
2165 
2166 	tx_ring_cnt = hn_tx_ring_cnt;
2167 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2168 		tx_ring_cnt = ring_cnt;
2169 #ifdef HN_IFSTART_SUPPORT
2170 	if (hn_use_if_start) {
2171 		/* ifnet.if_start only needs one TX ring. */
2172 		tx_ring_cnt = 1;
2173 	}
2174 #endif
2175 
2176 	/*
2177 	 * Set the leader CPU for channels.
2178 	 */
2179 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2180 
2181 	/*
2182 	 * Create enough TX/RX rings, even if only limited number of
2183 	 * channels can be allocated.
2184 	 */
2185 	error = hn_create_tx_data(sc, tx_ring_cnt);
2186 	if (error)
2187 		goto failed;
2188 	error = hn_create_rx_data(sc, ring_cnt);
2189 	if (error)
2190 		goto failed;
2191 
2192 	/*
2193 	 * Create transaction context for NVS and RNDIS transactions.
2194 	 */
2195 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2196 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2197 	if (sc->hn_xact == NULL) {
2198 		error = ENXIO;
2199 		goto failed;
2200 	}
2201 
2202 	/*
2203 	 * Install orphan handler for the revocation of this device's
2204 	 * primary channel.
2205 	 *
2206 	 * NOTE:
2207 	 * The processing order is critical here:
2208 	 * Install the orphan handler, _before_ testing whether this
2209 	 * device's primary channel has been revoked or not.
2210 	 */
2211 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2212 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2213 		error = ENXIO;
2214 		goto failed;
2215 	}
2216 
2217 	/*
2218 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2219 	 */
2220 	error = hn_synth_attach(sc, ETHERMTU);
2221 	if (error)
2222 		goto failed;
2223 
2224 	error = hn_rndis_get_eaddr(sc, eaddr);
2225 	if (error)
2226 		goto failed;
2227 
2228 	error = hn_rndis_get_mtu(sc, &mtu);
2229 	if (error)
2230 		mtu = ETHERMTU;
2231 	else if (bootverbose)
2232 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2233 
2234 #if __FreeBSD_version >= 1100099
2235 	if (sc->hn_rx_ring_inuse > 1) {
2236 		/*
2237 		 * Reduce TCP segment aggregation limit for multiple
2238 		 * RX rings to increase ACK timeliness.
2239 		 */
2240 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2241 	}
2242 #endif
2243 
2244 	/*
2245 	 * Fixup TX stuffs after synthetic parts are attached.
2246 	 */
2247 	hn_fixup_tx_data(sc);
2248 
2249 	ctx = device_get_sysctl_ctx(dev);
2250 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2251 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2252 	    &sc->hn_nvs_ver, 0, "NVS version");
2253 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2254 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2255 	    hn_ndis_version_sysctl, "A", "NDIS version");
2256 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2257 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2258 	    hn_caps_sysctl, "A", "capabilities");
2259 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2260 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2261 	    hn_hwassist_sysctl, "A", "hwassist");
2262 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2263 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2264 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2265 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2266 	    "max # of TSO segments");
2267 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2268 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2269 	    "max size of TSO segment");
2270 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2271 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2272 	    hn_rxfilter_sysctl, "A", "rxfilter");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2274 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2275 	    hn_rss_hash_sysctl, "A", "RSS hash");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2277 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2278 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2279 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2280 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2282 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2283 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2284 #ifndef RSS
2285 	/*
2286 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2287 	 */
2288 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2289 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2290 	    hn_rss_key_sysctl, "IU", "RSS key");
2291 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2292 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2293 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2294 #endif
2295 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2296 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2297 	    "RNDIS offered packet transmission aggregation size limit");
2298 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2299 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2300 	    "RNDIS offered packet transmission aggregation count limit");
2301 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2302 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2303 	    "RNDIS packet transmission aggregation alignment");
2304 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2305 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2306 	    hn_txagg_size_sysctl, "I",
2307 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2308 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2309 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2310 	    hn_txagg_pkts_sysctl, "I",
2311 	    "Packet transmission aggregation packets, "
2312 	    "0 -- disable, -1 -- auto");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2314 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_polling_sysctl, "I",
2316 	    "Polling frequency: [100,1000000], 0 disable polling");
2317 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2318 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 	    hn_vf_sysctl, "A", "Virtual Function's name");
2320 	if (!hn_xpnt_vf) {
2321 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2322 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2323 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2324 	} else {
2325 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2326 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2327 		    hn_xpnt_vf_enabled_sysctl, "I",
2328 		    "Transparent VF enabled");
2329 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2330 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2331 		    hn_xpnt_vf_accbpf_sysctl, "I",
2332 		    "Accurate BPF for transparent VF");
2333 	}
2334 
2335 	/*
2336 	 * Setup the ifmedia, which has been initialized earlier.
2337 	 */
2338 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2339 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2340 	/* XXX ifmedia_set really should do this for us */
2341 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2342 
2343 	/*
2344 	 * Setup the ifnet for this interface.
2345 	 */
2346 
2347 	ifp->if_baudrate = IF_Gbps(10);
2348 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2349 	ifp->if_ioctl = hn_ioctl;
2350 	ifp->if_init = hn_init;
2351 #ifdef HN_IFSTART_SUPPORT
2352 	if (hn_use_if_start) {
2353 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2354 
2355 		ifp->if_start = hn_start;
2356 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2357 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2358 		IFQ_SET_READY(&ifp->if_snd);
2359 	} else
2360 #endif
2361 	{
2362 		ifp->if_transmit = hn_transmit;
2363 		ifp->if_qflush = hn_xmit_qflush;
2364 	}
2365 
2366 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2367 #ifdef foo
2368 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2369 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2370 #endif
2371 	if (sc->hn_caps & HN_CAP_VLAN) {
2372 		/* XXX not sure about VLAN_MTU. */
2373 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2374 	}
2375 
2376 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2377 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2378 		ifp->if_capabilities |= IFCAP_TXCSUM;
2379 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2380 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2381 	if (sc->hn_caps & HN_CAP_TSO4) {
2382 		ifp->if_capabilities |= IFCAP_TSO4;
2383 		ifp->if_hwassist |= CSUM_IP_TSO;
2384 	}
2385 	if (sc->hn_caps & HN_CAP_TSO6) {
2386 		ifp->if_capabilities |= IFCAP_TSO6;
2387 		ifp->if_hwassist |= CSUM_IP6_TSO;
2388 	}
2389 
2390 	/* Enable all available capabilities by default. */
2391 	ifp->if_capenable = ifp->if_capabilities;
2392 
2393 	/*
2394 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2395 	 * be enabled through SIOCSIFCAP.
2396 	 */
2397 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2398 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2399 
2400 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2401 		/*
2402 		 * Lock hn_set_tso_maxsize() to simplify its
2403 		 * internal logic.
2404 		 */
2405 		HN_LOCK(sc);
2406 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2407 		HN_UNLOCK(sc);
2408 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2409 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2410 	}
2411 
2412 	ether_ifattach(ifp, eaddr);
2413 
2414 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2415 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2416 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2417 	}
2418 	if (mtu < ETHERMTU) {
2419 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2420 		ifp->if_mtu = mtu;
2421 	}
2422 
2423 	/* Inform the upper layer about the long frame support. */
2424 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2425 
2426 	/*
2427 	 * Kick off link status check.
2428 	 */
2429 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2430 	hn_update_link_status(sc);
2431 
2432 	if (!hn_xpnt_vf) {
2433 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2434 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2435 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2436 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2437 	} else {
2438 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2439 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2440 	}
2441 
2442 	/*
2443 	 * NOTE:
2444 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2445 	 * since interface's LLADDR is needed; interface LLADDR is not
2446 	 * available when ifnet_arrival event is triggered.
2447 	 */
2448 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2449 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2450 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2451 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2452 
2453 	return (0);
2454 failed:
2455 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2456 		hn_synth_detach(sc);
2457 	hn_detach(dev);
2458 	return (error);
2459 }
2460 
2461 static int
2462 hn_detach(device_t dev)
2463 {
2464 	struct hn_softc *sc = device_get_softc(dev);
2465 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2466 
2467 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2468 		/*
2469 		 * In case that the vmbus missed the orphan handler
2470 		 * installation.
2471 		 */
2472 		vmbus_xact_ctx_orphan(sc->hn_xact);
2473 	}
2474 
2475 	if (sc->hn_ifaddr_evthand != NULL)
2476 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2477 	if (sc->hn_ifnet_evthand != NULL)
2478 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2479 	if (sc->hn_ifnet_atthand != NULL) {
2480 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2481 		    sc->hn_ifnet_atthand);
2482 	}
2483 	if (sc->hn_ifnet_dethand != NULL) {
2484 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2485 		    sc->hn_ifnet_dethand);
2486 	}
2487 	if (sc->hn_ifnet_lnkhand != NULL)
2488 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2489 
2490 	vf_ifp = sc->hn_vf_ifp;
2491 	__compiler_membar();
2492 	if (vf_ifp != NULL)
2493 		hn_ifnet_detevent(sc, vf_ifp);
2494 
2495 	if (device_is_attached(dev)) {
2496 		HN_LOCK(sc);
2497 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2498 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2499 				hn_stop(sc, true);
2500 			/*
2501 			 * NOTE:
2502 			 * hn_stop() only suspends data, so managment
2503 			 * stuffs have to be suspended manually here.
2504 			 */
2505 			hn_suspend_mgmt(sc);
2506 			hn_synth_detach(sc);
2507 		}
2508 		HN_UNLOCK(sc);
2509 		ether_ifdetach(ifp);
2510 	}
2511 
2512 	ifmedia_removeall(&sc->hn_media);
2513 	hn_destroy_rx_data(sc);
2514 	hn_destroy_tx_data(sc);
2515 
2516 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2517 		int i;
2518 
2519 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2520 			taskqueue_free(sc->hn_tx_taskqs[i]);
2521 		free(sc->hn_tx_taskqs, M_DEVBUF);
2522 	}
2523 	taskqueue_free(sc->hn_mgmt_taskq0);
2524 	if (sc->hn_vf_taskq != NULL)
2525 		taskqueue_free(sc->hn_vf_taskq);
2526 
2527 	if (sc->hn_xact != NULL) {
2528 		/*
2529 		 * Uninstall the orphan handler _before_ the xact is
2530 		 * destructed.
2531 		 */
2532 		vmbus_chan_unset_orphan(sc->hn_prichan);
2533 		vmbus_xact_ctx_destroy(sc->hn_xact);
2534 	}
2535 
2536 	if_free(ifp);
2537 
2538 	HN_LOCK_DESTROY(sc);
2539 	rm_destroy(&sc->hn_vf_lock);
2540 	return (0);
2541 }
2542 
2543 static int
2544 hn_shutdown(device_t dev)
2545 {
2546 
2547 	return (0);
2548 }
2549 
2550 static void
2551 hn_link_status(struct hn_softc *sc)
2552 {
2553 	uint32_t link_status;
2554 	int error;
2555 
2556 	error = hn_rndis_get_linkstatus(sc, &link_status);
2557 	if (error) {
2558 		/* XXX what to do? */
2559 		return;
2560 	}
2561 
2562 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2563 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2564 	else
2565 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2566 	if_link_state_change(sc->hn_ifp,
2567 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2568 	    LINK_STATE_UP : LINK_STATE_DOWN);
2569 }
2570 
2571 static void
2572 hn_link_taskfunc(void *xsc, int pending __unused)
2573 {
2574 	struct hn_softc *sc = xsc;
2575 
2576 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2577 		return;
2578 	hn_link_status(sc);
2579 }
2580 
2581 static void
2582 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2583 {
2584 	struct hn_softc *sc = xsc;
2585 
2586 	/* Prevent any link status checks from running. */
2587 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2588 
2589 	/*
2590 	 * Fake up a [link down --> link up] state change; 5 seconds
2591 	 * delay is used, which closely simulates miibus reaction
2592 	 * upon link down event.
2593 	 */
2594 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2595 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2596 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2597 	    &sc->hn_netchg_status, 5 * hz);
2598 }
2599 
2600 static void
2601 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2602 {
2603 	struct hn_softc *sc = xsc;
2604 
2605 	/* Re-allow link status checks. */
2606 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2607 	hn_link_status(sc);
2608 }
2609 
2610 static void
2611 hn_update_link_status(struct hn_softc *sc)
2612 {
2613 
2614 	if (sc->hn_mgmt_taskq != NULL)
2615 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2616 }
2617 
2618 static void
2619 hn_change_network(struct hn_softc *sc)
2620 {
2621 
2622 	if (sc->hn_mgmt_taskq != NULL)
2623 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2624 }
2625 
2626 static __inline int
2627 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2628     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2629 {
2630 	struct mbuf *m = *m_head;
2631 	int error;
2632 
2633 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2634 
2635 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2636 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2637 	if (error == EFBIG) {
2638 		struct mbuf *m_new;
2639 
2640 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2641 		if (m_new == NULL)
2642 			return ENOBUFS;
2643 		else
2644 			*m_head = m = m_new;
2645 		txr->hn_tx_collapsed++;
2646 
2647 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2648 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2649 	}
2650 	if (!error) {
2651 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2652 		    BUS_DMASYNC_PREWRITE);
2653 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2654 	}
2655 	return error;
2656 }
2657 
2658 static __inline int
2659 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2660 {
2661 
2662 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2663 	    ("put an onlist txd %#x", txd->flags));
2664 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2665 	    ("put an onagg txd %#x", txd->flags));
2666 
2667 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2668 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2669 		return 0;
2670 
2671 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2672 		struct hn_txdesc *tmp_txd;
2673 
2674 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2675 			int freed;
2676 
2677 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2678 			    ("resursive aggregation on aggregated txdesc"));
2679 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2680 			    ("not aggregated txdesc"));
2681 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2682 			    ("aggregated txdesc uses dmamap"));
2683 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2684 			    ("aggregated txdesc consumes "
2685 			     "chimney sending buffer"));
2686 			KASSERT(tmp_txd->chim_size == 0,
2687 			    ("aggregated txdesc has non-zero "
2688 			     "chimney sending size"));
2689 
2690 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2691 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2692 			freed = hn_txdesc_put(txr, tmp_txd);
2693 			KASSERT(freed, ("failed to free aggregated txdesc"));
2694 		}
2695 	}
2696 
2697 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2698 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2699 		    ("chim txd uses dmamap"));
2700 		hn_chim_free(txr->hn_sc, txd->chim_index);
2701 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2702 		txd->chim_size = 0;
2703 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2704 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2705 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2706 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2707 		    txd->data_dmap);
2708 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2709 	}
2710 
2711 	if (txd->m != NULL) {
2712 		m_freem(txd->m);
2713 		txd->m = NULL;
2714 	}
2715 
2716 	txd->flags |= HN_TXD_FLAG_ONLIST;
2717 #ifndef HN_USE_TXDESC_BUFRING
2718 	mtx_lock_spin(&txr->hn_txlist_spin);
2719 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2720 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2721 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2722 	txr->hn_txdesc_avail++;
2723 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2724 	mtx_unlock_spin(&txr->hn_txlist_spin);
2725 #else	/* HN_USE_TXDESC_BUFRING */
2726 #ifdef HN_DEBUG
2727 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2728 #endif
2729 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2730 #endif	/* !HN_USE_TXDESC_BUFRING */
2731 
2732 	return 1;
2733 }
2734 
2735 static __inline struct hn_txdesc *
2736 hn_txdesc_get(struct hn_tx_ring *txr)
2737 {
2738 	struct hn_txdesc *txd;
2739 
2740 #ifndef HN_USE_TXDESC_BUFRING
2741 	mtx_lock_spin(&txr->hn_txlist_spin);
2742 	txd = SLIST_FIRST(&txr->hn_txlist);
2743 	if (txd != NULL) {
2744 		KASSERT(txr->hn_txdesc_avail > 0,
2745 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2746 		txr->hn_txdesc_avail--;
2747 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2748 	}
2749 	mtx_unlock_spin(&txr->hn_txlist_spin);
2750 #else
2751 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2752 #endif
2753 
2754 	if (txd != NULL) {
2755 #ifdef HN_USE_TXDESC_BUFRING
2756 #ifdef HN_DEBUG
2757 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2758 #endif
2759 #endif	/* HN_USE_TXDESC_BUFRING */
2760 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2761 		    STAILQ_EMPTY(&txd->agg_list) &&
2762 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2763 		    txd->chim_size == 0 &&
2764 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2765 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2766 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2767 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2768 		txd->refs = 1;
2769 	}
2770 	return txd;
2771 }
2772 
2773 static __inline void
2774 hn_txdesc_hold(struct hn_txdesc *txd)
2775 {
2776 
2777 	/* 0->1 transition will never work */
2778 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2779 	atomic_add_int(&txd->refs, 1);
2780 }
2781 
2782 static __inline void
2783 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2784 {
2785 
2786 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2787 	    ("recursive aggregation on aggregating txdesc"));
2788 
2789 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2790 	    ("already aggregated"));
2791 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2792 	    ("recursive aggregation on to-be-aggregated txdesc"));
2793 
2794 	txd->flags |= HN_TXD_FLAG_ONAGG;
2795 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2796 }
2797 
2798 static bool
2799 hn_tx_ring_pending(struct hn_tx_ring *txr)
2800 {
2801 	bool pending = false;
2802 
2803 #ifndef HN_USE_TXDESC_BUFRING
2804 	mtx_lock_spin(&txr->hn_txlist_spin);
2805 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2806 		pending = true;
2807 	mtx_unlock_spin(&txr->hn_txlist_spin);
2808 #else
2809 	if (!buf_ring_full(txr->hn_txdesc_br))
2810 		pending = true;
2811 #endif
2812 	return (pending);
2813 }
2814 
2815 static __inline void
2816 hn_txeof(struct hn_tx_ring *txr)
2817 {
2818 	txr->hn_has_txeof = 0;
2819 	txr->hn_txeof(txr);
2820 }
2821 
2822 static void
2823 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2824     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2825 {
2826 	struct hn_txdesc *txd = sndc->hn_cbarg;
2827 	struct hn_tx_ring *txr;
2828 
2829 	txr = txd->txr;
2830 	KASSERT(txr->hn_chan == chan,
2831 	    ("channel mismatch, on chan%u, should be chan%u",
2832 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2833 
2834 	txr->hn_has_txeof = 1;
2835 	hn_txdesc_put(txr, txd);
2836 
2837 	++txr->hn_txdone_cnt;
2838 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2839 		txr->hn_txdone_cnt = 0;
2840 		if (txr->hn_oactive)
2841 			hn_txeof(txr);
2842 	}
2843 }
2844 
2845 static void
2846 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2847 {
2848 #if defined(INET) || defined(INET6)
2849 	tcp_lro_flush_all(&rxr->hn_lro);
2850 #endif
2851 
2852 	/*
2853 	 * NOTE:
2854 	 * 'txr' could be NULL, if multiple channels and
2855 	 * ifnet.if_start method are enabled.
2856 	 */
2857 	if (txr == NULL || !txr->hn_has_txeof)
2858 		return;
2859 
2860 	txr->hn_txdone_cnt = 0;
2861 	hn_txeof(txr);
2862 }
2863 
2864 static __inline uint32_t
2865 hn_rndis_pktmsg_offset(uint32_t ofs)
2866 {
2867 
2868 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2869 	    ("invalid RNDIS packet msg offset %u", ofs));
2870 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2871 }
2872 
2873 static __inline void *
2874 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2875     size_t pi_dlen, uint32_t pi_type)
2876 {
2877 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2878 	struct rndis_pktinfo *pi;
2879 
2880 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2881 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2882 
2883 	/*
2884 	 * Per-packet-info does not move; it only grows.
2885 	 *
2886 	 * NOTE:
2887 	 * rm_pktinfooffset in this phase counts from the beginning
2888 	 * of rndis_packet_msg.
2889 	 */
2890 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2891 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2892 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2893 	    pkt->rm_pktinfolen);
2894 	pkt->rm_pktinfolen += pi_size;
2895 
2896 	pi->rm_size = pi_size;
2897 	pi->rm_type = pi_type;
2898 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2899 
2900 	return (pi->rm_data);
2901 }
2902 
2903 static __inline int
2904 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2905 {
2906 	struct hn_txdesc *txd;
2907 	struct mbuf *m;
2908 	int error, pkts;
2909 
2910 	txd = txr->hn_agg_txd;
2911 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2912 
2913 	/*
2914 	 * Since hn_txpkt() will reset this temporary stat, save
2915 	 * it now, so that oerrors can be updated properly, if
2916 	 * hn_txpkt() ever fails.
2917 	 */
2918 	pkts = txr->hn_stat_pkts;
2919 
2920 	/*
2921 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2922 	 * failure, save it for later freeing, if hn_txpkt() ever
2923 	 * fails.
2924 	 */
2925 	m = txd->m;
2926 	error = hn_txpkt(ifp, txr, txd);
2927 	if (__predict_false(error)) {
2928 		/* txd is freed, but m is not. */
2929 		m_freem(m);
2930 
2931 		txr->hn_flush_failed++;
2932 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2933 	}
2934 
2935 	/* Reset all aggregation states. */
2936 	txr->hn_agg_txd = NULL;
2937 	txr->hn_agg_szleft = 0;
2938 	txr->hn_agg_pktleft = 0;
2939 	txr->hn_agg_prevpkt = NULL;
2940 
2941 	return (error);
2942 }
2943 
2944 static void *
2945 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2946     int pktsize)
2947 {
2948 	void *chim;
2949 
2950 	if (txr->hn_agg_txd != NULL) {
2951 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2952 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2953 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2954 			int olen;
2955 
2956 			/*
2957 			 * Update the previous RNDIS packet's total length,
2958 			 * it can be increased due to the mandatory alignment
2959 			 * padding for this RNDIS packet.  And update the
2960 			 * aggregating txdesc's chimney sending buffer size
2961 			 * accordingly.
2962 			 *
2963 			 * XXX
2964 			 * Zero-out the padding, as required by the RNDIS spec.
2965 			 */
2966 			olen = pkt->rm_len;
2967 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2968 			agg_txd->chim_size += pkt->rm_len - olen;
2969 
2970 			/* Link this txdesc to the parent. */
2971 			hn_txdesc_agg(agg_txd, txd);
2972 
2973 			chim = (uint8_t *)pkt + pkt->rm_len;
2974 			/* Save the current packet for later fixup. */
2975 			txr->hn_agg_prevpkt = chim;
2976 
2977 			txr->hn_agg_pktleft--;
2978 			txr->hn_agg_szleft -= pktsize;
2979 			if (txr->hn_agg_szleft <=
2980 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2981 				/*
2982 				 * Probably can't aggregate more packets,
2983 				 * flush this aggregating txdesc proactively.
2984 				 */
2985 				txr->hn_agg_pktleft = 0;
2986 			}
2987 			/* Done! */
2988 			return (chim);
2989 		}
2990 		hn_flush_txagg(ifp, txr);
2991 	}
2992 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2993 
2994 	txr->hn_tx_chimney_tried++;
2995 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2996 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2997 		return (NULL);
2998 	txr->hn_tx_chimney++;
2999 
3000 	chim = txr->hn_sc->hn_chim +
3001 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3002 
3003 	if (txr->hn_agg_pktmax > 1 &&
3004 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3005 		txr->hn_agg_txd = txd;
3006 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3007 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3008 		txr->hn_agg_prevpkt = chim;
3009 	}
3010 	return (chim);
3011 }
3012 
3013 /*
3014  * NOTE:
3015  * If this function fails, then both txd and m_head0 will be freed.
3016  */
3017 static int
3018 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3019     struct mbuf **m_head0)
3020 {
3021 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3022 	int error, nsegs, i;
3023 	struct mbuf *m_head = *m_head0;
3024 	struct rndis_packet_msg *pkt;
3025 	uint32_t *pi_data;
3026 	void *chim = NULL;
3027 	int pkt_hlen, pkt_size;
3028 
3029 	pkt = txd->rndis_pkt;
3030 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3031 	if (pkt_size < txr->hn_chim_size) {
3032 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3033 		if (chim != NULL)
3034 			pkt = chim;
3035 	} else {
3036 		if (txr->hn_agg_txd != NULL)
3037 			hn_flush_txagg(ifp, txr);
3038 	}
3039 
3040 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3041 	pkt->rm_len = m_head->m_pkthdr.len;
3042 	pkt->rm_dataoffset = 0;
3043 	pkt->rm_datalen = m_head->m_pkthdr.len;
3044 	pkt->rm_oobdataoffset = 0;
3045 	pkt->rm_oobdatalen = 0;
3046 	pkt->rm_oobdataelements = 0;
3047 	pkt->rm_pktinfooffset = sizeof(*pkt);
3048 	pkt->rm_pktinfolen = 0;
3049 	pkt->rm_vchandle = 0;
3050 	pkt->rm_reserved = 0;
3051 
3052 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3053 		/*
3054 		 * Set the hash value for this packet, so that the host could
3055 		 * dispatch the TX done event for this packet back to this TX
3056 		 * ring's channel.
3057 		 */
3058 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3059 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3060 		*pi_data = txr->hn_tx_idx;
3061 	}
3062 
3063 	if (m_head->m_flags & M_VLANTAG) {
3064 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3065 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3066 		*pi_data = NDIS_VLAN_INFO_MAKE(
3067 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3068 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3069 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3070 	}
3071 
3072 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3073 #if defined(INET6) || defined(INET)
3074 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3075 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3076 #ifdef INET
3077 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3078 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3079 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3080 			    m_head->m_pkthdr.tso_segsz);
3081 		}
3082 #endif
3083 #if defined(INET6) && defined(INET)
3084 		else
3085 #endif
3086 #ifdef INET6
3087 		{
3088 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3089 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3090 			    m_head->m_pkthdr.tso_segsz);
3091 		}
3092 #endif
3093 #endif	/* INET6 || INET */
3094 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3095 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3096 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3097 		if (m_head->m_pkthdr.csum_flags &
3098 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3099 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3100 		} else {
3101 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3102 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3103 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3104 		}
3105 
3106 		if (m_head->m_pkthdr.csum_flags &
3107 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3108 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3109 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3110 		} else if (m_head->m_pkthdr.csum_flags &
3111 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3112 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3113 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3114 		}
3115 	}
3116 
3117 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3118 	/* Fixup RNDIS packet message total length */
3119 	pkt->rm_len += pkt_hlen;
3120 	/* Convert RNDIS packet message offsets */
3121 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3122 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3123 
3124 	/*
3125 	 * Fast path: Chimney sending.
3126 	 */
3127 	if (chim != NULL) {
3128 		struct hn_txdesc *tgt_txd = txd;
3129 
3130 		if (txr->hn_agg_txd != NULL) {
3131 			tgt_txd = txr->hn_agg_txd;
3132 #ifdef INVARIANTS
3133 			*m_head0 = NULL;
3134 #endif
3135 		}
3136 
3137 		KASSERT(pkt == chim,
3138 		    ("RNDIS pkt not in chimney sending buffer"));
3139 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3140 		    ("chimney sending buffer is not used"));
3141 		tgt_txd->chim_size += pkt->rm_len;
3142 
3143 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3144 		    ((uint8_t *)chim) + pkt_hlen);
3145 
3146 		txr->hn_gpa_cnt = 0;
3147 		txr->hn_sendpkt = hn_txpkt_chim;
3148 		goto done;
3149 	}
3150 
3151 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3152 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3153 	    ("chimney buffer is used"));
3154 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3155 
3156 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3157 	if (__predict_false(error)) {
3158 		int freed;
3159 
3160 		/*
3161 		 * This mbuf is not linked w/ the txd yet, so free it now.
3162 		 */
3163 		m_freem(m_head);
3164 		*m_head0 = NULL;
3165 
3166 		freed = hn_txdesc_put(txr, txd);
3167 		KASSERT(freed != 0,
3168 		    ("fail to free txd upon txdma error"));
3169 
3170 		txr->hn_txdma_failed++;
3171 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3172 		return error;
3173 	}
3174 	*m_head0 = m_head;
3175 
3176 	/* +1 RNDIS packet message */
3177 	txr->hn_gpa_cnt = nsegs + 1;
3178 
3179 	/* send packet with page buffer */
3180 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3181 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3182 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3183 
3184 	/*
3185 	 * Fill the page buffers with mbuf info after the page
3186 	 * buffer for RNDIS packet message.
3187 	 */
3188 	for (i = 0; i < nsegs; ++i) {
3189 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3190 
3191 		gpa->gpa_page = atop(segs[i].ds_addr);
3192 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3193 		gpa->gpa_len = segs[i].ds_len;
3194 	}
3195 
3196 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3197 	txd->chim_size = 0;
3198 	txr->hn_sendpkt = hn_txpkt_sglist;
3199 done:
3200 	txd->m = m_head;
3201 
3202 	/* Set the completion routine */
3203 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3204 
3205 	/* Update temporary stats for later use. */
3206 	txr->hn_stat_pkts++;
3207 	txr->hn_stat_size += m_head->m_pkthdr.len;
3208 	if (m_head->m_flags & M_MCAST)
3209 		txr->hn_stat_mcasts++;
3210 
3211 	return 0;
3212 }
3213 
3214 /*
3215  * NOTE:
3216  * If this function fails, then txd will be freed, but the mbuf
3217  * associated w/ the txd will _not_ be freed.
3218  */
3219 static int
3220 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3221 {
3222 	int error, send_failed = 0, has_bpf;
3223 
3224 again:
3225 	has_bpf = bpf_peers_present(ifp->if_bpf);
3226 	if (has_bpf) {
3227 		/*
3228 		 * Make sure that this txd and any aggregated txds are not
3229 		 * freed before ETHER_BPF_MTAP.
3230 		 */
3231 		hn_txdesc_hold(txd);
3232 	}
3233 	error = txr->hn_sendpkt(txr, txd);
3234 	if (!error) {
3235 		if (has_bpf) {
3236 			const struct hn_txdesc *tmp_txd;
3237 
3238 			ETHER_BPF_MTAP(ifp, txd->m);
3239 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3240 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3241 		}
3242 
3243 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3244 #ifdef HN_IFSTART_SUPPORT
3245 		if (!hn_use_if_start)
3246 #endif
3247 		{
3248 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3249 			    txr->hn_stat_size);
3250 			if (txr->hn_stat_mcasts != 0) {
3251 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3252 				    txr->hn_stat_mcasts);
3253 			}
3254 		}
3255 		txr->hn_pkts += txr->hn_stat_pkts;
3256 		txr->hn_sends++;
3257 	}
3258 	if (has_bpf)
3259 		hn_txdesc_put(txr, txd);
3260 
3261 	if (__predict_false(error)) {
3262 		int freed;
3263 
3264 		/*
3265 		 * This should "really rarely" happen.
3266 		 *
3267 		 * XXX Too many RX to be acked or too many sideband
3268 		 * commands to run?  Ask netvsc_channel_rollup()
3269 		 * to kick start later.
3270 		 */
3271 		txr->hn_has_txeof = 1;
3272 		if (!send_failed) {
3273 			txr->hn_send_failed++;
3274 			send_failed = 1;
3275 			/*
3276 			 * Try sending again after set hn_has_txeof;
3277 			 * in case that we missed the last
3278 			 * netvsc_channel_rollup().
3279 			 */
3280 			goto again;
3281 		}
3282 		if_printf(ifp, "send failed\n");
3283 
3284 		/*
3285 		 * Caller will perform further processing on the
3286 		 * associated mbuf, so don't free it in hn_txdesc_put();
3287 		 * only unload it from the DMA map in hn_txdesc_put(),
3288 		 * if it was loaded.
3289 		 */
3290 		txd->m = NULL;
3291 		freed = hn_txdesc_put(txr, txd);
3292 		KASSERT(freed != 0,
3293 		    ("fail to free txd upon send error"));
3294 
3295 		txr->hn_send_failed++;
3296 	}
3297 
3298 	/* Reset temporary stats, after this sending is done. */
3299 	txr->hn_stat_size = 0;
3300 	txr->hn_stat_pkts = 0;
3301 	txr->hn_stat_mcasts = 0;
3302 
3303 	return (error);
3304 }
3305 
3306 /*
3307  * Append the specified data to the indicated mbuf chain,
3308  * Extend the mbuf chain if the new data does not fit in
3309  * existing space.
3310  *
3311  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3312  * There should be an equivalent in the kernel mbuf code,
3313  * but there does not appear to be one yet.
3314  *
3315  * Differs from m_append() in that additional mbufs are
3316  * allocated with cluster size MJUMPAGESIZE, and filled
3317  * accordingly.
3318  *
3319  * Return 1 if able to complete the job; otherwise 0.
3320  */
3321 static int
3322 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3323 {
3324 	struct mbuf *m, *n;
3325 	int remainder, space;
3326 
3327 	for (m = m0; m->m_next != NULL; m = m->m_next)
3328 		;
3329 	remainder = len;
3330 	space = M_TRAILINGSPACE(m);
3331 	if (space > 0) {
3332 		/*
3333 		 * Copy into available space.
3334 		 */
3335 		if (space > remainder)
3336 			space = remainder;
3337 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3338 		m->m_len += space;
3339 		cp += space;
3340 		remainder -= space;
3341 	}
3342 	while (remainder > 0) {
3343 		/*
3344 		 * Allocate a new mbuf; could check space
3345 		 * and allocate a cluster instead.
3346 		 */
3347 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3348 		if (n == NULL)
3349 			break;
3350 		n->m_len = min(MJUMPAGESIZE, remainder);
3351 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3352 		cp += n->m_len;
3353 		remainder -= n->m_len;
3354 		m->m_next = n;
3355 		m = n;
3356 	}
3357 	if (m0->m_flags & M_PKTHDR)
3358 		m0->m_pkthdr.len += len - remainder;
3359 
3360 	return (remainder == 0);
3361 }
3362 
3363 #if defined(INET) || defined(INET6)
3364 static __inline int
3365 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3366 {
3367 #if __FreeBSD_version >= 1100095
3368 	if (hn_lro_mbufq_depth) {
3369 		tcp_lro_queue_mbuf(lc, m);
3370 		return 0;
3371 	}
3372 #endif
3373 	return tcp_lro_rx(lc, m, 0);
3374 }
3375 #endif
3376 
3377 static int
3378 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3379     const struct hn_rxinfo *info)
3380 {
3381 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3382 	struct mbuf *m_new;
3383 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3384 	int hash_type = M_HASHTYPE_NONE;
3385 
3386 	ifp = hn_ifp;
3387 	if (rxr->hn_rxvf_ifp != NULL) {
3388 		/*
3389 		 * Non-transparent mode VF; pretend this packet is from
3390 		 * the VF.
3391 		 */
3392 		ifp = rxr->hn_rxvf_ifp;
3393 		is_vf = 1;
3394 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3395 		/* Transparent mode VF. */
3396 		is_vf = 1;
3397 	}
3398 
3399 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3400 		/*
3401 		 * NOTE:
3402 		 * See the NOTE of hn_rndis_init_fixat().  This
3403 		 * function can be reached, immediately after the
3404 		 * RNDIS is initialized but before the ifnet is
3405 		 * setup on the hn_attach() path; drop the unexpected
3406 		 * packets.
3407 		 */
3408 		return (0);
3409 	}
3410 
3411 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3412 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3413 		return (0);
3414 	}
3415 
3416 	if (dlen <= MHLEN) {
3417 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3418 		if (m_new == NULL) {
3419 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3420 			return (0);
3421 		}
3422 		memcpy(mtod(m_new, void *), data, dlen);
3423 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3424 		rxr->hn_small_pkts++;
3425 	} else {
3426 		/*
3427 		 * Get an mbuf with a cluster.  For packets 2K or less,
3428 		 * get a standard 2K cluster.  For anything larger, get a
3429 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3430 		 * if looped around to the Hyper-V TX channel, so avoid them.
3431 		 */
3432 		size = MCLBYTES;
3433 		if (dlen > MCLBYTES) {
3434 			/* 4096 */
3435 			size = MJUMPAGESIZE;
3436 		}
3437 
3438 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3439 		if (m_new == NULL) {
3440 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3441 			return (0);
3442 		}
3443 
3444 		hv_m_append(m_new, dlen, data);
3445 	}
3446 	m_new->m_pkthdr.rcvif = ifp;
3447 
3448 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3449 		do_csum = 0;
3450 
3451 	/* receive side checksum offload */
3452 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3453 		/* IP csum offload */
3454 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3455 			m_new->m_pkthdr.csum_flags |=
3456 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3457 			rxr->hn_csum_ip++;
3458 		}
3459 
3460 		/* TCP/UDP csum offload */
3461 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3462 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3463 			m_new->m_pkthdr.csum_flags |=
3464 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3465 			m_new->m_pkthdr.csum_data = 0xffff;
3466 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3467 				rxr->hn_csum_tcp++;
3468 			else
3469 				rxr->hn_csum_udp++;
3470 		}
3471 
3472 		/*
3473 		 * XXX
3474 		 * As of this write (Oct 28th, 2016), host side will turn
3475 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3476 		 * the do_lro setting here is actually _not_ accurate.  We
3477 		 * depend on the RSS hash type check to reset do_lro.
3478 		 */
3479 		if ((info->csum_info &
3480 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3481 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3482 			do_lro = 1;
3483 	} else {
3484 		const struct ether_header *eh;
3485 		uint16_t etype;
3486 		int hoff;
3487 
3488 		hoff = sizeof(*eh);
3489 		/* Checked at the beginning of this function. */
3490 		KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3491 
3492 		eh = mtod(m_new, struct ether_header *);
3493 		etype = ntohs(eh->ether_type);
3494 		if (etype == ETHERTYPE_VLAN) {
3495 			const struct ether_vlan_header *evl;
3496 
3497 			hoff = sizeof(*evl);
3498 			if (m_new->m_len < hoff)
3499 				goto skip;
3500 			evl = mtod(m_new, struct ether_vlan_header *);
3501 			etype = ntohs(evl->evl_proto);
3502 		}
3503 
3504 		if (etype == ETHERTYPE_IP) {
3505 			int pr;
3506 
3507 			pr = hn_check_iplen(m_new, hoff);
3508 			if (pr == IPPROTO_TCP) {
3509 				if (do_csum &&
3510 				    (rxr->hn_trust_hcsum &
3511 				     HN_TRUST_HCSUM_TCP)) {
3512 					rxr->hn_csum_trusted++;
3513 					m_new->m_pkthdr.csum_flags |=
3514 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3515 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3516 					m_new->m_pkthdr.csum_data = 0xffff;
3517 				}
3518 				do_lro = 1;
3519 			} else if (pr == IPPROTO_UDP) {
3520 				if (do_csum &&
3521 				    (rxr->hn_trust_hcsum &
3522 				     HN_TRUST_HCSUM_UDP)) {
3523 					rxr->hn_csum_trusted++;
3524 					m_new->m_pkthdr.csum_flags |=
3525 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3526 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3527 					m_new->m_pkthdr.csum_data = 0xffff;
3528 				}
3529 			} else if (pr != IPPROTO_DONE && do_csum &&
3530 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3531 				rxr->hn_csum_trusted++;
3532 				m_new->m_pkthdr.csum_flags |=
3533 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3534 			}
3535 		}
3536 	}
3537 skip:
3538 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3539 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3540 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3541 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3542 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3543 		m_new->m_flags |= M_VLANTAG;
3544 	}
3545 
3546 	/*
3547 	 * If VF is activated (tranparent/non-transparent mode does not
3548 	 * matter here).
3549 	 *
3550 	 * - Disable LRO
3551 	 *
3552 	 *   hn(4) will only receive broadcast packets, multicast packets,
3553 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3554 	 *   packet types.
3555 	 *
3556 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3557 	 *   all, since the LRO flush will use hn(4) as the receiving
3558 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3559 	 */
3560 	if (is_vf)
3561 		do_lro = 0;
3562 
3563 	/*
3564 	 * If VF is activated (tranparent/non-transparent mode does not
3565 	 * matter here), do _not_ mess with unsupported hash types or
3566 	 * functions.
3567 	 */
3568 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3569 		rxr->hn_rss_pkts++;
3570 		m_new->m_pkthdr.flowid = info->hash_value;
3571 		if (!is_vf)
3572 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3573 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3574 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3575 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3576 			    rxr->hn_mbuf_hash);
3577 
3578 			/*
3579 			 * NOTE:
3580 			 * do_lro is resetted, if the hash types are not TCP
3581 			 * related.  See the comment in the above csum_flags
3582 			 * setup section.
3583 			 */
3584 			switch (type) {
3585 			case NDIS_HASH_IPV4:
3586 				hash_type = M_HASHTYPE_RSS_IPV4;
3587 				do_lro = 0;
3588 				break;
3589 
3590 			case NDIS_HASH_TCP_IPV4:
3591 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3592 				break;
3593 
3594 			case NDIS_HASH_IPV6:
3595 				hash_type = M_HASHTYPE_RSS_IPV6;
3596 				do_lro = 0;
3597 				break;
3598 
3599 			case NDIS_HASH_IPV6_EX:
3600 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3601 				do_lro = 0;
3602 				break;
3603 
3604 			case NDIS_HASH_TCP_IPV6:
3605 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3606 				break;
3607 
3608 			case NDIS_HASH_TCP_IPV6_EX:
3609 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3610 				break;
3611 			}
3612 		}
3613 	} else if (!is_vf) {
3614 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3615 		hash_type = M_HASHTYPE_OPAQUE;
3616 	}
3617 	M_HASHTYPE_SET(m_new, hash_type);
3618 
3619 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3620 	if (hn_ifp != ifp) {
3621 		const struct ether_header *eh;
3622 
3623 		/*
3624 		 * Non-transparent mode VF is activated.
3625 		 */
3626 
3627 		/*
3628 		 * Allow tapping on hn(4).
3629 		 */
3630 		ETHER_BPF_MTAP(hn_ifp, m_new);
3631 
3632 		/*
3633 		 * Update hn(4)'s stats.
3634 		 */
3635 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3636 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3637 		/* Checked at the beginning of this function. */
3638 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3639 		eh = mtod(m_new, struct ether_header *);
3640 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3641 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3642 	}
3643 	rxr->hn_pkts++;
3644 
3645 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3646 #if defined(INET) || defined(INET6)
3647 		struct lro_ctrl *lro = &rxr->hn_lro;
3648 
3649 		if (lro->lro_cnt) {
3650 			rxr->hn_lro_tried++;
3651 			if (hn_lro_rx(lro, m_new) == 0) {
3652 				/* DONE! */
3653 				return 0;
3654 			}
3655 		}
3656 #endif
3657 	}
3658 	ifp->if_input(ifp, m_new);
3659 
3660 	return (0);
3661 }
3662 
3663 static int
3664 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3665 {
3666 	struct hn_softc *sc = ifp->if_softc;
3667 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3668 	struct ifnet *vf_ifp;
3669 	int mask, error = 0;
3670 	struct ifrsskey *ifrk;
3671 	struct ifrsshash *ifrh;
3672 	uint32_t mtu;
3673 
3674 	switch (cmd) {
3675 	case SIOCSIFMTU:
3676 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3677 			error = EINVAL;
3678 			break;
3679 		}
3680 
3681 		HN_LOCK(sc);
3682 
3683 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3684 			HN_UNLOCK(sc);
3685 			break;
3686 		}
3687 
3688 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3689 			/* Can't change MTU */
3690 			HN_UNLOCK(sc);
3691 			error = EOPNOTSUPP;
3692 			break;
3693 		}
3694 
3695 		if (ifp->if_mtu == ifr->ifr_mtu) {
3696 			HN_UNLOCK(sc);
3697 			break;
3698 		}
3699 
3700 		if (hn_xpnt_vf_isready(sc)) {
3701 			vf_ifp = sc->hn_vf_ifp;
3702 			ifr_vf = *ifr;
3703 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3704 			    sizeof(ifr_vf.ifr_name));
3705 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3706 			    (caddr_t)&ifr_vf);
3707 			if (error) {
3708 				HN_UNLOCK(sc);
3709 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3710 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3711 				break;
3712 			}
3713 		}
3714 
3715 		/*
3716 		 * Suspend this interface before the synthetic parts
3717 		 * are ripped.
3718 		 */
3719 		hn_suspend(sc);
3720 
3721 		/*
3722 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3723 		 */
3724 		hn_synth_detach(sc);
3725 
3726 		/*
3727 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3728 		 * with the new MTU setting.
3729 		 */
3730 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3731 		if (error) {
3732 			HN_UNLOCK(sc);
3733 			break;
3734 		}
3735 
3736 		error = hn_rndis_get_mtu(sc, &mtu);
3737 		if (error)
3738 			mtu = ifr->ifr_mtu;
3739 		else if (bootverbose)
3740 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3741 
3742 		/*
3743 		 * Commit the requested MTU, after the synthetic parts
3744 		 * have been successfully attached.
3745 		 */
3746 		if (mtu >= ifr->ifr_mtu) {
3747 			mtu = ifr->ifr_mtu;
3748 		} else {
3749 			if_printf(ifp, "fixup mtu %d -> %u\n",
3750 			    ifr->ifr_mtu, mtu);
3751 		}
3752 		ifp->if_mtu = mtu;
3753 
3754 		/*
3755 		 * Synthetic parts' reattach may change the chimney
3756 		 * sending size; update it.
3757 		 */
3758 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3759 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3760 
3761 		/*
3762 		 * Make sure that various parameters based on MTU are
3763 		 * still valid, after the MTU change.
3764 		 */
3765 		hn_mtu_change_fixup(sc);
3766 
3767 		/*
3768 		 * All done!  Resume the interface now.
3769 		 */
3770 		hn_resume(sc);
3771 
3772 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3773 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3774 			/*
3775 			 * Since we have reattached the NVS part,
3776 			 * change the datapath to VF again; in case
3777 			 * that it is lost, after the NVS was detached.
3778 			 */
3779 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3780 		}
3781 
3782 		HN_UNLOCK(sc);
3783 		break;
3784 
3785 	case SIOCSIFFLAGS:
3786 		HN_LOCK(sc);
3787 
3788 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3789 			HN_UNLOCK(sc);
3790 			break;
3791 		}
3792 
3793 		if (hn_xpnt_vf_isready(sc))
3794 			hn_xpnt_vf_saveifflags(sc);
3795 
3796 		if (ifp->if_flags & IFF_UP) {
3797 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3798 				/*
3799 				 * Caller meight hold mutex, e.g.
3800 				 * bpf; use busy-wait for the RNDIS
3801 				 * reply.
3802 				 */
3803 				HN_NO_SLEEPING(sc);
3804 				hn_rxfilter_config(sc);
3805 				HN_SLEEPING_OK(sc);
3806 
3807 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3808 					error = hn_xpnt_vf_iocsetflags(sc);
3809 			} else {
3810 				hn_init_locked(sc);
3811 			}
3812 		} else {
3813 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3814 				hn_stop(sc, false);
3815 		}
3816 		sc->hn_if_flags = ifp->if_flags;
3817 
3818 		HN_UNLOCK(sc);
3819 		break;
3820 
3821 	case SIOCSIFCAP:
3822 		HN_LOCK(sc);
3823 
3824 		if (hn_xpnt_vf_isready(sc)) {
3825 			ifr_vf = *ifr;
3826 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3827 			    sizeof(ifr_vf.ifr_name));
3828 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3829 			HN_UNLOCK(sc);
3830 			break;
3831 		}
3832 
3833 		/*
3834 		 * Fix up requested capabilities w/ supported capabilities,
3835 		 * since the supported capabilities could have been changed.
3836 		 */
3837 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3838 		    ifp->if_capenable;
3839 
3840 		if (mask & IFCAP_TXCSUM) {
3841 			ifp->if_capenable ^= IFCAP_TXCSUM;
3842 			if (ifp->if_capenable & IFCAP_TXCSUM)
3843 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3844 			else
3845 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3846 		}
3847 		if (mask & IFCAP_TXCSUM_IPV6) {
3848 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3849 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3850 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3851 			else
3852 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3853 		}
3854 
3855 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3856 		if (mask & IFCAP_RXCSUM)
3857 			ifp->if_capenable ^= IFCAP_RXCSUM;
3858 #ifdef foo
3859 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3860 		if (mask & IFCAP_RXCSUM_IPV6)
3861 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3862 #endif
3863 
3864 		if (mask & IFCAP_LRO)
3865 			ifp->if_capenable ^= IFCAP_LRO;
3866 
3867 		if (mask & IFCAP_TSO4) {
3868 			ifp->if_capenable ^= IFCAP_TSO4;
3869 			if (ifp->if_capenable & IFCAP_TSO4)
3870 				ifp->if_hwassist |= CSUM_IP_TSO;
3871 			else
3872 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3873 		}
3874 		if (mask & IFCAP_TSO6) {
3875 			ifp->if_capenable ^= IFCAP_TSO6;
3876 			if (ifp->if_capenable & IFCAP_TSO6)
3877 				ifp->if_hwassist |= CSUM_IP6_TSO;
3878 			else
3879 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3880 		}
3881 
3882 		HN_UNLOCK(sc);
3883 		break;
3884 
3885 	case SIOCADDMULTI:
3886 	case SIOCDELMULTI:
3887 		HN_LOCK(sc);
3888 
3889 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3890 			HN_UNLOCK(sc);
3891 			break;
3892 		}
3893 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3894 			/*
3895 			 * Multicast uses mutex; use busy-wait for
3896 			 * the RNDIS reply.
3897 			 */
3898 			HN_NO_SLEEPING(sc);
3899 			hn_rxfilter_config(sc);
3900 			HN_SLEEPING_OK(sc);
3901 		}
3902 
3903 		/* XXX vlan(4) style mcast addr maintenance */
3904 		if (hn_xpnt_vf_isready(sc)) {
3905 			int old_if_flags;
3906 
3907 			old_if_flags = sc->hn_vf_ifp->if_flags;
3908 			hn_xpnt_vf_saveifflags(sc);
3909 
3910 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3911 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3912 			     IFF_ALLMULTI))
3913 				error = hn_xpnt_vf_iocsetflags(sc);
3914 		}
3915 
3916 		HN_UNLOCK(sc);
3917 		break;
3918 
3919 	case SIOCSIFMEDIA:
3920 	case SIOCGIFMEDIA:
3921 		HN_LOCK(sc);
3922 		if (hn_xpnt_vf_isready(sc)) {
3923 			/*
3924 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3925 			 * create and pass ifr_vf to the VF here; just
3926 			 * replace the ifr_name.
3927 			 */
3928 			vf_ifp = sc->hn_vf_ifp;
3929 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3930 			    sizeof(ifr->ifr_name));
3931 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3932 			/* Restore the ifr_name. */
3933 			strlcpy(ifr->ifr_name, ifp->if_xname,
3934 			    sizeof(ifr->ifr_name));
3935 			HN_UNLOCK(sc);
3936 			break;
3937 		}
3938 		HN_UNLOCK(sc);
3939 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3940 		break;
3941 
3942 	case SIOCGIFRSSHASH:
3943 		ifrh = (struct ifrsshash *)data;
3944 		HN_LOCK(sc);
3945 		if (sc->hn_rx_ring_inuse == 1) {
3946 			HN_UNLOCK(sc);
3947 			ifrh->ifrh_func = RSS_FUNC_NONE;
3948 			ifrh->ifrh_types = 0;
3949 			break;
3950 		}
3951 
3952 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3953 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3954 		else
3955 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3956 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3957 		HN_UNLOCK(sc);
3958 		break;
3959 
3960 	case SIOCGIFRSSKEY:
3961 		ifrk = (struct ifrsskey *)data;
3962 		HN_LOCK(sc);
3963 		if (sc->hn_rx_ring_inuse == 1) {
3964 			HN_UNLOCK(sc);
3965 			ifrk->ifrk_func = RSS_FUNC_NONE;
3966 			ifrk->ifrk_keylen = 0;
3967 			break;
3968 		}
3969 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3970 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3971 		else
3972 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3973 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3974 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3975 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
3976 		HN_UNLOCK(sc);
3977 		break;
3978 
3979 	default:
3980 		error = ether_ioctl(ifp, cmd, data);
3981 		break;
3982 	}
3983 	return (error);
3984 }
3985 
3986 static void
3987 hn_stop(struct hn_softc *sc, bool detaching)
3988 {
3989 	struct ifnet *ifp = sc->hn_ifp;
3990 	int i;
3991 
3992 	HN_LOCK_ASSERT(sc);
3993 
3994 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3995 	    ("synthetic parts were not attached"));
3996 
3997 	/* Clear RUNNING bit ASAP. */
3998 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3999 
4000 	/* Disable polling. */
4001 	hn_polling(sc, 0);
4002 
4003 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4004 		KASSERT(sc->hn_vf_ifp != NULL,
4005 		    ("%s: VF is not attached", ifp->if_xname));
4006 
4007 		/* Mark transparent mode VF as disabled. */
4008 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4009 
4010 		/*
4011 		 * NOTE:
4012 		 * Datapath setting must happen _before_ bringing
4013 		 * the VF down.
4014 		 */
4015 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4016 
4017 		/*
4018 		 * Bring the VF down.
4019 		 */
4020 		hn_xpnt_vf_saveifflags(sc);
4021 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4022 		hn_xpnt_vf_iocsetflags(sc);
4023 	}
4024 
4025 	/* Suspend data transfers. */
4026 	hn_suspend_data(sc);
4027 
4028 	/* Clear OACTIVE bit. */
4029 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4030 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4031 		sc->hn_tx_ring[i].hn_oactive = 0;
4032 
4033 	/*
4034 	 * If the non-transparent mode VF is active, make sure
4035 	 * that the RX filter still allows packet reception.
4036 	 */
4037 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4038 		hn_rxfilter_config(sc);
4039 }
4040 
4041 static void
4042 hn_init_locked(struct hn_softc *sc)
4043 {
4044 	struct ifnet *ifp = sc->hn_ifp;
4045 	int i;
4046 
4047 	HN_LOCK_ASSERT(sc);
4048 
4049 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4050 		return;
4051 
4052 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4053 		return;
4054 
4055 	/* Configure RX filter */
4056 	hn_rxfilter_config(sc);
4057 
4058 	/* Clear OACTIVE bit. */
4059 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4060 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4061 		sc->hn_tx_ring[i].hn_oactive = 0;
4062 
4063 	/* Clear TX 'suspended' bit. */
4064 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4065 
4066 	if (hn_xpnt_vf_isready(sc)) {
4067 		/* Initialize transparent VF. */
4068 		hn_xpnt_vf_init(sc);
4069 	}
4070 
4071 	/* Everything is ready; unleash! */
4072 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4073 
4074 	/* Re-enable polling if requested. */
4075 	if (sc->hn_pollhz > 0)
4076 		hn_polling(sc, sc->hn_pollhz);
4077 }
4078 
4079 static void
4080 hn_init(void *xsc)
4081 {
4082 	struct hn_softc *sc = xsc;
4083 
4084 	HN_LOCK(sc);
4085 	hn_init_locked(sc);
4086 	HN_UNLOCK(sc);
4087 }
4088 
4089 #if __FreeBSD_version >= 1100099
4090 
4091 static int
4092 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4093 {
4094 	struct hn_softc *sc = arg1;
4095 	unsigned int lenlim;
4096 	int error;
4097 
4098 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4099 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4100 	if (error || req->newptr == NULL)
4101 		return error;
4102 
4103 	HN_LOCK(sc);
4104 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4105 	    lenlim > TCP_LRO_LENGTH_MAX) {
4106 		HN_UNLOCK(sc);
4107 		return EINVAL;
4108 	}
4109 	hn_set_lro_lenlim(sc, lenlim);
4110 	HN_UNLOCK(sc);
4111 
4112 	return 0;
4113 }
4114 
4115 static int
4116 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4117 {
4118 	struct hn_softc *sc = arg1;
4119 	int ackcnt, error, i;
4120 
4121 	/*
4122 	 * lro_ackcnt_lim is append count limit,
4123 	 * +1 to turn it into aggregation limit.
4124 	 */
4125 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4126 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4127 	if (error || req->newptr == NULL)
4128 		return error;
4129 
4130 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4131 		return EINVAL;
4132 
4133 	/*
4134 	 * Convert aggregation limit back to append
4135 	 * count limit.
4136 	 */
4137 	--ackcnt;
4138 	HN_LOCK(sc);
4139 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4140 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4141 	HN_UNLOCK(sc);
4142 	return 0;
4143 }
4144 
4145 #endif
4146 
4147 static int
4148 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4149 {
4150 	struct hn_softc *sc = arg1;
4151 	int hcsum = arg2;
4152 	int on, error, i;
4153 
4154 	on = 0;
4155 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4156 		on = 1;
4157 
4158 	error = sysctl_handle_int(oidp, &on, 0, req);
4159 	if (error || req->newptr == NULL)
4160 		return error;
4161 
4162 	HN_LOCK(sc);
4163 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4164 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4165 
4166 		if (on)
4167 			rxr->hn_trust_hcsum |= hcsum;
4168 		else
4169 			rxr->hn_trust_hcsum &= ~hcsum;
4170 	}
4171 	HN_UNLOCK(sc);
4172 	return 0;
4173 }
4174 
4175 static int
4176 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4177 {
4178 	struct hn_softc *sc = arg1;
4179 	int chim_size, error;
4180 
4181 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4182 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4183 	if (error || req->newptr == NULL)
4184 		return error;
4185 
4186 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4187 		return EINVAL;
4188 
4189 	HN_LOCK(sc);
4190 	hn_set_chim_size(sc, chim_size);
4191 	HN_UNLOCK(sc);
4192 	return 0;
4193 }
4194 
4195 #if __FreeBSD_version < 1100095
4196 static int
4197 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4198 {
4199 	struct hn_softc *sc = arg1;
4200 	int ofs = arg2, i, error;
4201 	struct hn_rx_ring *rxr;
4202 	uint64_t stat;
4203 
4204 	stat = 0;
4205 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4206 		rxr = &sc->hn_rx_ring[i];
4207 		stat += *((int *)((uint8_t *)rxr + ofs));
4208 	}
4209 
4210 	error = sysctl_handle_64(oidp, &stat, 0, req);
4211 	if (error || req->newptr == NULL)
4212 		return error;
4213 
4214 	/* Zero out this stat. */
4215 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4216 		rxr = &sc->hn_rx_ring[i];
4217 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4218 	}
4219 	return 0;
4220 }
4221 #else
4222 static int
4223 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4224 {
4225 	struct hn_softc *sc = arg1;
4226 	int ofs = arg2, i, error;
4227 	struct hn_rx_ring *rxr;
4228 	uint64_t stat;
4229 
4230 	stat = 0;
4231 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4232 		rxr = &sc->hn_rx_ring[i];
4233 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4234 	}
4235 
4236 	error = sysctl_handle_64(oidp, &stat, 0, req);
4237 	if (error || req->newptr == NULL)
4238 		return error;
4239 
4240 	/* Zero out this stat. */
4241 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4242 		rxr = &sc->hn_rx_ring[i];
4243 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4244 	}
4245 	return 0;
4246 }
4247 
4248 #endif
4249 
4250 static int
4251 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4252 {
4253 	struct hn_softc *sc = arg1;
4254 	int ofs = arg2, i, error;
4255 	struct hn_rx_ring *rxr;
4256 	u_long stat;
4257 
4258 	stat = 0;
4259 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4260 		rxr = &sc->hn_rx_ring[i];
4261 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4262 	}
4263 
4264 	error = sysctl_handle_long(oidp, &stat, 0, req);
4265 	if (error || req->newptr == NULL)
4266 		return error;
4267 
4268 	/* Zero out this stat. */
4269 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4270 		rxr = &sc->hn_rx_ring[i];
4271 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4272 	}
4273 	return 0;
4274 }
4275 
4276 static int
4277 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4278 {
4279 	struct hn_softc *sc = arg1;
4280 	int ofs = arg2, i, error;
4281 	struct hn_tx_ring *txr;
4282 	u_long stat;
4283 
4284 	stat = 0;
4285 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4286 		txr = &sc->hn_tx_ring[i];
4287 		stat += *((u_long *)((uint8_t *)txr + ofs));
4288 	}
4289 
4290 	error = sysctl_handle_long(oidp, &stat, 0, req);
4291 	if (error || req->newptr == NULL)
4292 		return error;
4293 
4294 	/* Zero out this stat. */
4295 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4296 		txr = &sc->hn_tx_ring[i];
4297 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4298 	}
4299 	return 0;
4300 }
4301 
4302 static int
4303 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4304 {
4305 	struct hn_softc *sc = arg1;
4306 	int ofs = arg2, i, error, conf;
4307 	struct hn_tx_ring *txr;
4308 
4309 	txr = &sc->hn_tx_ring[0];
4310 	conf = *((int *)((uint8_t *)txr + ofs));
4311 
4312 	error = sysctl_handle_int(oidp, &conf, 0, req);
4313 	if (error || req->newptr == NULL)
4314 		return error;
4315 
4316 	HN_LOCK(sc);
4317 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4318 		txr = &sc->hn_tx_ring[i];
4319 		*((int *)((uint8_t *)txr + ofs)) = conf;
4320 	}
4321 	HN_UNLOCK(sc);
4322 
4323 	return 0;
4324 }
4325 
4326 static int
4327 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4328 {
4329 	struct hn_softc *sc = arg1;
4330 	int error, size;
4331 
4332 	size = sc->hn_agg_size;
4333 	error = sysctl_handle_int(oidp, &size, 0, req);
4334 	if (error || req->newptr == NULL)
4335 		return (error);
4336 
4337 	HN_LOCK(sc);
4338 	sc->hn_agg_size = size;
4339 	hn_set_txagg(sc);
4340 	HN_UNLOCK(sc);
4341 
4342 	return (0);
4343 }
4344 
4345 static int
4346 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4347 {
4348 	struct hn_softc *sc = arg1;
4349 	int error, pkts;
4350 
4351 	pkts = sc->hn_agg_pkts;
4352 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4353 	if (error || req->newptr == NULL)
4354 		return (error);
4355 
4356 	HN_LOCK(sc);
4357 	sc->hn_agg_pkts = pkts;
4358 	hn_set_txagg(sc);
4359 	HN_UNLOCK(sc);
4360 
4361 	return (0);
4362 }
4363 
4364 static int
4365 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4366 {
4367 	struct hn_softc *sc = arg1;
4368 	int pkts;
4369 
4370 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4371 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4372 }
4373 
4374 static int
4375 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4376 {
4377 	struct hn_softc *sc = arg1;
4378 	int align;
4379 
4380 	align = sc->hn_tx_ring[0].hn_agg_align;
4381 	return (sysctl_handle_int(oidp, &align, 0, req));
4382 }
4383 
4384 static void
4385 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4386 {
4387 	if (pollhz == 0)
4388 		vmbus_chan_poll_disable(chan);
4389 	else
4390 		vmbus_chan_poll_enable(chan, pollhz);
4391 }
4392 
4393 static void
4394 hn_polling(struct hn_softc *sc, u_int pollhz)
4395 {
4396 	int nsubch = sc->hn_rx_ring_inuse - 1;
4397 
4398 	HN_LOCK_ASSERT(sc);
4399 
4400 	if (nsubch > 0) {
4401 		struct vmbus_channel **subch;
4402 		int i;
4403 
4404 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4405 		for (i = 0; i < nsubch; ++i)
4406 			hn_chan_polling(subch[i], pollhz);
4407 		vmbus_subchan_rel(subch, nsubch);
4408 	}
4409 	hn_chan_polling(sc->hn_prichan, pollhz);
4410 }
4411 
4412 static int
4413 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4414 {
4415 	struct hn_softc *sc = arg1;
4416 	int pollhz, error;
4417 
4418 	pollhz = sc->hn_pollhz;
4419 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4420 	if (error || req->newptr == NULL)
4421 		return (error);
4422 
4423 	if (pollhz != 0 &&
4424 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4425 		return (EINVAL);
4426 
4427 	HN_LOCK(sc);
4428 	if (sc->hn_pollhz != pollhz) {
4429 		sc->hn_pollhz = pollhz;
4430 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4431 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4432 			hn_polling(sc, sc->hn_pollhz);
4433 	}
4434 	HN_UNLOCK(sc);
4435 
4436 	return (0);
4437 }
4438 
4439 static int
4440 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4441 {
4442 	struct hn_softc *sc = arg1;
4443 	char verstr[16];
4444 
4445 	snprintf(verstr, sizeof(verstr), "%u.%u",
4446 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4447 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4448 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4449 }
4450 
4451 static int
4452 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4453 {
4454 	struct hn_softc *sc = arg1;
4455 	char caps_str[128];
4456 	uint32_t caps;
4457 
4458 	HN_LOCK(sc);
4459 	caps = sc->hn_caps;
4460 	HN_UNLOCK(sc);
4461 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4462 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4463 }
4464 
4465 static int
4466 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4467 {
4468 	struct hn_softc *sc = arg1;
4469 	char assist_str[128];
4470 	uint32_t hwassist;
4471 
4472 	HN_LOCK(sc);
4473 	hwassist = sc->hn_ifp->if_hwassist;
4474 	HN_UNLOCK(sc);
4475 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4476 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4477 }
4478 
4479 static int
4480 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4481 {
4482 	struct hn_softc *sc = arg1;
4483 	char filter_str[128];
4484 	uint32_t filter;
4485 
4486 	HN_LOCK(sc);
4487 	filter = sc->hn_rx_filter;
4488 	HN_UNLOCK(sc);
4489 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4490 	    NDIS_PACKET_TYPES);
4491 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4492 }
4493 
4494 #ifndef RSS
4495 
4496 static int
4497 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4498 {
4499 	struct hn_softc *sc = arg1;
4500 	int error;
4501 
4502 	HN_LOCK(sc);
4503 
4504 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4505 	if (error || req->newptr == NULL)
4506 		goto back;
4507 
4508 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4509 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4510 		/*
4511 		 * RSS key is synchronized w/ VF's, don't allow users
4512 		 * to change it.
4513 		 */
4514 		error = EBUSY;
4515 		goto back;
4516 	}
4517 
4518 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4519 	if (error)
4520 		goto back;
4521 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4522 
4523 	if (sc->hn_rx_ring_inuse > 1) {
4524 		error = hn_rss_reconfig(sc);
4525 	} else {
4526 		/* Not RSS capable, at least for now; just save the RSS key. */
4527 		error = 0;
4528 	}
4529 back:
4530 	HN_UNLOCK(sc);
4531 	return (error);
4532 }
4533 
4534 static int
4535 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4536 {
4537 	struct hn_softc *sc = arg1;
4538 	int error;
4539 
4540 	HN_LOCK(sc);
4541 
4542 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4543 	if (error || req->newptr == NULL)
4544 		goto back;
4545 
4546 	/*
4547 	 * Don't allow RSS indirect table change, if this interface is not
4548 	 * RSS capable currently.
4549 	 */
4550 	if (sc->hn_rx_ring_inuse == 1) {
4551 		error = EOPNOTSUPP;
4552 		goto back;
4553 	}
4554 
4555 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4556 	if (error)
4557 		goto back;
4558 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4559 
4560 	hn_rss_ind_fixup(sc);
4561 	error = hn_rss_reconfig(sc);
4562 back:
4563 	HN_UNLOCK(sc);
4564 	return (error);
4565 }
4566 
4567 #endif	/* !RSS */
4568 
4569 static int
4570 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4571 {
4572 	struct hn_softc *sc = arg1;
4573 	char hash_str[128];
4574 	uint32_t hash;
4575 
4576 	HN_LOCK(sc);
4577 	hash = sc->hn_rss_hash;
4578 	HN_UNLOCK(sc);
4579 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4580 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4581 }
4582 
4583 static int
4584 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4585 {
4586 	struct hn_softc *sc = arg1;
4587 	char hash_str[128];
4588 	uint32_t hash;
4589 
4590 	HN_LOCK(sc);
4591 	hash = sc->hn_rss_hcap;
4592 	HN_UNLOCK(sc);
4593 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4594 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4595 }
4596 
4597 static int
4598 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4599 {
4600 	struct hn_softc *sc = arg1;
4601 	char hash_str[128];
4602 	uint32_t hash;
4603 
4604 	HN_LOCK(sc);
4605 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4606 	HN_UNLOCK(sc);
4607 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4608 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4609 }
4610 
4611 static int
4612 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4613 {
4614 	struct hn_softc *sc = arg1;
4615 	char vf_name[IFNAMSIZ + 1];
4616 	struct ifnet *vf_ifp;
4617 
4618 	HN_LOCK(sc);
4619 	vf_name[0] = '\0';
4620 	vf_ifp = sc->hn_vf_ifp;
4621 	if (vf_ifp != NULL)
4622 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4623 	HN_UNLOCK(sc);
4624 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4625 }
4626 
4627 static int
4628 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4629 {
4630 	struct hn_softc *sc = arg1;
4631 	char vf_name[IFNAMSIZ + 1];
4632 	struct ifnet *vf_ifp;
4633 
4634 	HN_LOCK(sc);
4635 	vf_name[0] = '\0';
4636 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4637 	if (vf_ifp != NULL)
4638 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4639 	HN_UNLOCK(sc);
4640 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4641 }
4642 
4643 static int
4644 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4645 {
4646 	struct rm_priotracker pt;
4647 	struct sbuf *sb;
4648 	int error, i;
4649 	bool first;
4650 
4651 	error = sysctl_wire_old_buffer(req, 0);
4652 	if (error != 0)
4653 		return (error);
4654 
4655 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4656 	if (sb == NULL)
4657 		return (ENOMEM);
4658 
4659 	rm_rlock(&hn_vfmap_lock, &pt);
4660 
4661 	first = true;
4662 	for (i = 0; i < hn_vfmap_size; ++i) {
4663 		struct ifnet *ifp;
4664 
4665 		if (hn_vfmap[i] == NULL)
4666 			continue;
4667 
4668 		ifp = ifnet_byindex(i);
4669 		if (ifp != NULL) {
4670 			if (first)
4671 				sbuf_printf(sb, "%s", ifp->if_xname);
4672 			else
4673 				sbuf_printf(sb, " %s", ifp->if_xname);
4674 			first = false;
4675 		}
4676 	}
4677 
4678 	rm_runlock(&hn_vfmap_lock, &pt);
4679 
4680 	error = sbuf_finish(sb);
4681 	sbuf_delete(sb);
4682 	return (error);
4683 }
4684 
4685 static int
4686 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4687 {
4688 	struct rm_priotracker pt;
4689 	struct sbuf *sb;
4690 	int error, i;
4691 	bool first;
4692 
4693 	error = sysctl_wire_old_buffer(req, 0);
4694 	if (error != 0)
4695 		return (error);
4696 
4697 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4698 	if (sb == NULL)
4699 		return (ENOMEM);
4700 
4701 	rm_rlock(&hn_vfmap_lock, &pt);
4702 
4703 	first = true;
4704 	for (i = 0; i < hn_vfmap_size; ++i) {
4705 		struct ifnet *ifp, *hn_ifp;
4706 
4707 		hn_ifp = hn_vfmap[i];
4708 		if (hn_ifp == NULL)
4709 			continue;
4710 
4711 		ifp = ifnet_byindex(i);
4712 		if (ifp != NULL) {
4713 			if (first) {
4714 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4715 				    hn_ifp->if_xname);
4716 			} else {
4717 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4718 				    hn_ifp->if_xname);
4719 			}
4720 			first = false;
4721 		}
4722 	}
4723 
4724 	rm_runlock(&hn_vfmap_lock, &pt);
4725 
4726 	error = sbuf_finish(sb);
4727 	sbuf_delete(sb);
4728 	return (error);
4729 }
4730 
4731 static int
4732 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4733 {
4734 	struct hn_softc *sc = arg1;
4735 	int error, onoff = 0;
4736 
4737 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4738 		onoff = 1;
4739 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4740 	if (error || req->newptr == NULL)
4741 		return (error);
4742 
4743 	HN_LOCK(sc);
4744 	/* NOTE: hn_vf_lock for hn_transmit() */
4745 	rm_wlock(&sc->hn_vf_lock);
4746 	if (onoff)
4747 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4748 	else
4749 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4750 	rm_wunlock(&sc->hn_vf_lock);
4751 	HN_UNLOCK(sc);
4752 
4753 	return (0);
4754 }
4755 
4756 static int
4757 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4758 {
4759 	struct hn_softc *sc = arg1;
4760 	int enabled = 0;
4761 
4762 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4763 		enabled = 1;
4764 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4765 }
4766 
4767 static int
4768 hn_check_iplen(const struct mbuf *m, int hoff)
4769 {
4770 	const struct ip *ip;
4771 	int len, iphlen, iplen;
4772 	const struct tcphdr *th;
4773 	int thoff;				/* TCP data offset */
4774 
4775 	len = hoff + sizeof(struct ip);
4776 
4777 	/* The packet must be at least the size of an IP header. */
4778 	if (m->m_pkthdr.len < len)
4779 		return IPPROTO_DONE;
4780 
4781 	/* The fixed IP header must reside completely in the first mbuf. */
4782 	if (m->m_len < len)
4783 		return IPPROTO_DONE;
4784 
4785 	ip = mtodo(m, hoff);
4786 
4787 	/* Bound check the packet's stated IP header length. */
4788 	iphlen = ip->ip_hl << 2;
4789 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4790 		return IPPROTO_DONE;
4791 
4792 	/* The full IP header must reside completely in the one mbuf. */
4793 	if (m->m_len < hoff + iphlen)
4794 		return IPPROTO_DONE;
4795 
4796 	iplen = ntohs(ip->ip_len);
4797 
4798 	/*
4799 	 * Check that the amount of data in the buffers is as
4800 	 * at least much as the IP header would have us expect.
4801 	 */
4802 	if (m->m_pkthdr.len < hoff + iplen)
4803 		return IPPROTO_DONE;
4804 
4805 	/*
4806 	 * Ignore IP fragments.
4807 	 */
4808 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4809 		return IPPROTO_DONE;
4810 
4811 	/*
4812 	 * The TCP/IP or UDP/IP header must be entirely contained within
4813 	 * the first fragment of a packet.
4814 	 */
4815 	switch (ip->ip_p) {
4816 	case IPPROTO_TCP:
4817 		if (iplen < iphlen + sizeof(struct tcphdr))
4818 			return IPPROTO_DONE;
4819 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4820 			return IPPROTO_DONE;
4821 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4822 		thoff = th->th_off << 2;
4823 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4824 			return IPPROTO_DONE;
4825 		if (m->m_len < hoff + iphlen + thoff)
4826 			return IPPROTO_DONE;
4827 		break;
4828 	case IPPROTO_UDP:
4829 		if (iplen < iphlen + sizeof(struct udphdr))
4830 			return IPPROTO_DONE;
4831 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4832 			return IPPROTO_DONE;
4833 		break;
4834 	default:
4835 		if (iplen < iphlen)
4836 			return IPPROTO_DONE;
4837 		break;
4838 	}
4839 	return ip->ip_p;
4840 }
4841 
4842 static int
4843 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4844 {
4845 	struct sysctl_oid_list *child;
4846 	struct sysctl_ctx_list *ctx;
4847 	device_t dev = sc->hn_dev;
4848 #if defined(INET) || defined(INET6)
4849 #if __FreeBSD_version >= 1100095
4850 	int lroent_cnt;
4851 #endif
4852 #endif
4853 	int i;
4854 
4855 	/*
4856 	 * Create RXBUF for reception.
4857 	 *
4858 	 * NOTE:
4859 	 * - It is shared by all channels.
4860 	 * - A large enough buffer is allocated, certain version of NVSes
4861 	 *   may further limit the usable space.
4862 	 */
4863 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4864 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4865 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4866 	if (sc->hn_rxbuf == NULL) {
4867 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4868 		return (ENOMEM);
4869 	}
4870 
4871 	sc->hn_rx_ring_cnt = ring_cnt;
4872 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4873 
4874 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4875 	    M_DEVBUF, M_WAITOK | M_ZERO);
4876 
4877 #if defined(INET) || defined(INET6)
4878 #if __FreeBSD_version >= 1100095
4879 	lroent_cnt = hn_lro_entry_count;
4880 	if (lroent_cnt < TCP_LRO_ENTRIES)
4881 		lroent_cnt = TCP_LRO_ENTRIES;
4882 	if (bootverbose)
4883 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4884 #endif
4885 #endif	/* INET || INET6 */
4886 
4887 	ctx = device_get_sysctl_ctx(dev);
4888 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4889 
4890 	/* Create dev.hn.UNIT.rx sysctl tree */
4891 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4892 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4893 
4894 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4895 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4896 
4897 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4898 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4899 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4900 		if (rxr->hn_br == NULL) {
4901 			device_printf(dev, "allocate bufring failed\n");
4902 			return (ENOMEM);
4903 		}
4904 
4905 		if (hn_trust_hosttcp)
4906 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4907 		if (hn_trust_hostudp)
4908 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4909 		if (hn_trust_hostip)
4910 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4911 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4912 		rxr->hn_ifp = sc->hn_ifp;
4913 		if (i < sc->hn_tx_ring_cnt)
4914 			rxr->hn_txr = &sc->hn_tx_ring[i];
4915 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4916 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4917 		rxr->hn_rx_idx = i;
4918 		rxr->hn_rxbuf = sc->hn_rxbuf;
4919 
4920 		/*
4921 		 * Initialize LRO.
4922 		 */
4923 #if defined(INET) || defined(INET6)
4924 #if __FreeBSD_version >= 1100095
4925 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4926 		    hn_lro_mbufq_depth);
4927 #else
4928 		tcp_lro_init(&rxr->hn_lro);
4929 		rxr->hn_lro.ifp = sc->hn_ifp;
4930 #endif
4931 #if __FreeBSD_version >= 1100099
4932 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4933 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4934 #endif
4935 #endif	/* INET || INET6 */
4936 
4937 		if (sc->hn_rx_sysctl_tree != NULL) {
4938 			char name[16];
4939 
4940 			/*
4941 			 * Create per RX ring sysctl tree:
4942 			 * dev.hn.UNIT.rx.RINGID
4943 			 */
4944 			snprintf(name, sizeof(name), "%d", i);
4945 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4946 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4947 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4948 
4949 			if (rxr->hn_rx_sysctl_tree != NULL) {
4950 				SYSCTL_ADD_ULONG(ctx,
4951 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4952 				    OID_AUTO, "packets", CTLFLAG_RW,
4953 				    &rxr->hn_pkts, "# of packets received");
4954 				SYSCTL_ADD_ULONG(ctx,
4955 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4956 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
4957 				    &rxr->hn_rss_pkts,
4958 				    "# of packets w/ RSS info received");
4959 				SYSCTL_ADD_INT(ctx,
4960 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4961 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4962 				    &rxr->hn_pktbuf_len, 0,
4963 				    "Temporary channel packet buffer length");
4964 			}
4965 		}
4966 	}
4967 
4968 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4969 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4970 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4971 #if __FreeBSD_version < 1100095
4972 	    hn_rx_stat_int_sysctl,
4973 #else
4974 	    hn_rx_stat_u64_sysctl,
4975 #endif
4976 	    "LU", "LRO queued");
4977 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4978 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4979 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4980 #if __FreeBSD_version < 1100095
4981 	    hn_rx_stat_int_sysctl,
4982 #else
4983 	    hn_rx_stat_u64_sysctl,
4984 #endif
4985 	    "LU", "LRO flushed");
4986 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4987 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4988 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
4989 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4990 #if __FreeBSD_version >= 1100099
4991 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4992 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4993 	    hn_lro_lenlim_sysctl, "IU",
4994 	    "Max # of data bytes to be aggregated by LRO");
4995 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4996 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4997 	    hn_lro_ackcnt_sysctl, "I",
4998 	    "Max # of ACKs to be aggregated by LRO");
4999 #endif
5000 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5001 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5002 	    hn_trust_hcsum_sysctl, "I",
5003 	    "Trust tcp segement verification on host side, "
5004 	    "when csum info is missing");
5005 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5006 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5007 	    hn_trust_hcsum_sysctl, "I",
5008 	    "Trust udp datagram verification on host side, "
5009 	    "when csum info is missing");
5010 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5011 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5012 	    hn_trust_hcsum_sysctl, "I",
5013 	    "Trust ip packet verification on host side, "
5014 	    "when csum info is missing");
5015 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5016 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5017 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5018 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5019 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5020 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5021 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5022 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5023 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5024 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5025 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5026 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5027 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5028 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5029 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5030 	    hn_rx_stat_ulong_sysctl, "LU",
5031 	    "# of packets that we trust host's csum verification");
5032 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5033 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5034 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5035 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5036 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5037 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5038 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5039 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5040 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5041 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5042 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5043 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5044 
5045 	return (0);
5046 }
5047 
5048 static void
5049 hn_destroy_rx_data(struct hn_softc *sc)
5050 {
5051 	int i;
5052 
5053 	if (sc->hn_rxbuf != NULL) {
5054 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5055 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5056 		else
5057 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5058 		sc->hn_rxbuf = NULL;
5059 	}
5060 
5061 	if (sc->hn_rx_ring_cnt == 0)
5062 		return;
5063 
5064 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5065 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5066 
5067 		if (rxr->hn_br == NULL)
5068 			continue;
5069 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5070 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5071 		} else {
5072 			device_printf(sc->hn_dev,
5073 			    "%dth channel bufring is referenced", i);
5074 		}
5075 		rxr->hn_br = NULL;
5076 
5077 #if defined(INET) || defined(INET6)
5078 		tcp_lro_free(&rxr->hn_lro);
5079 #endif
5080 		free(rxr->hn_pktbuf, M_DEVBUF);
5081 	}
5082 	free(sc->hn_rx_ring, M_DEVBUF);
5083 	sc->hn_rx_ring = NULL;
5084 
5085 	sc->hn_rx_ring_cnt = 0;
5086 	sc->hn_rx_ring_inuse = 0;
5087 }
5088 
5089 static int
5090 hn_tx_ring_create(struct hn_softc *sc, int id)
5091 {
5092 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5093 	device_t dev = sc->hn_dev;
5094 	bus_dma_tag_t parent_dtag;
5095 	int error, i;
5096 
5097 	txr->hn_sc = sc;
5098 	txr->hn_tx_idx = id;
5099 
5100 #ifndef HN_USE_TXDESC_BUFRING
5101 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5102 #endif
5103 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5104 
5105 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5106 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5107 	    M_DEVBUF, M_WAITOK | M_ZERO);
5108 #ifndef HN_USE_TXDESC_BUFRING
5109 	SLIST_INIT(&txr->hn_txlist);
5110 #else
5111 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5112 	    M_WAITOK, &txr->hn_tx_lock);
5113 #endif
5114 
5115 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5116 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5117 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5118 	} else {
5119 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5120 	}
5121 
5122 #ifdef HN_IFSTART_SUPPORT
5123 	if (hn_use_if_start) {
5124 		txr->hn_txeof = hn_start_txeof;
5125 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5126 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5127 	} else
5128 #endif
5129 	{
5130 		int br_depth;
5131 
5132 		txr->hn_txeof = hn_xmit_txeof;
5133 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5134 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5135 
5136 		br_depth = hn_get_txswq_depth(txr);
5137 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5138 		    M_WAITOK, &txr->hn_tx_lock);
5139 	}
5140 
5141 	txr->hn_direct_tx_size = hn_direct_tx_size;
5142 
5143 	/*
5144 	 * Always schedule transmission instead of trying to do direct
5145 	 * transmission.  This one gives the best performance so far.
5146 	 */
5147 	txr->hn_sched_tx = 1;
5148 
5149 	parent_dtag = bus_get_dma_tag(dev);
5150 
5151 	/* DMA tag for RNDIS packet messages. */
5152 	error = bus_dma_tag_create(parent_dtag, /* parent */
5153 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5154 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5155 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5156 	    BUS_SPACE_MAXADDR,		/* highaddr */
5157 	    NULL, NULL,			/* filter, filterarg */
5158 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5159 	    1,				/* nsegments */
5160 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5161 	    0,				/* flags */
5162 	    NULL,			/* lockfunc */
5163 	    NULL,			/* lockfuncarg */
5164 	    &txr->hn_tx_rndis_dtag);
5165 	if (error) {
5166 		device_printf(dev, "failed to create rndis dmatag\n");
5167 		return error;
5168 	}
5169 
5170 	/* DMA tag for data. */
5171 	error = bus_dma_tag_create(parent_dtag, /* parent */
5172 	    1,				/* alignment */
5173 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5174 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5175 	    BUS_SPACE_MAXADDR,		/* highaddr */
5176 	    NULL, NULL,			/* filter, filterarg */
5177 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5178 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5179 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5180 	    0,				/* flags */
5181 	    NULL,			/* lockfunc */
5182 	    NULL,			/* lockfuncarg */
5183 	    &txr->hn_tx_data_dtag);
5184 	if (error) {
5185 		device_printf(dev, "failed to create data dmatag\n");
5186 		return error;
5187 	}
5188 
5189 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5190 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5191 
5192 		txd->txr = txr;
5193 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5194 		STAILQ_INIT(&txd->agg_list);
5195 
5196 		/*
5197 		 * Allocate and load RNDIS packet message.
5198 		 */
5199         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5200 		    (void **)&txd->rndis_pkt,
5201 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5202 		    &txd->rndis_pkt_dmap);
5203 		if (error) {
5204 			device_printf(dev,
5205 			    "failed to allocate rndis_packet_msg, %d\n", i);
5206 			return error;
5207 		}
5208 
5209 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5210 		    txd->rndis_pkt_dmap,
5211 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5212 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5213 		    BUS_DMA_NOWAIT);
5214 		if (error) {
5215 			device_printf(dev,
5216 			    "failed to load rndis_packet_msg, %d\n", i);
5217 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5218 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5219 			return error;
5220 		}
5221 
5222 		/* DMA map for TX data. */
5223 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5224 		    &txd->data_dmap);
5225 		if (error) {
5226 			device_printf(dev,
5227 			    "failed to allocate tx data dmamap\n");
5228 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5229 			    txd->rndis_pkt_dmap);
5230 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5231 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5232 			return error;
5233 		}
5234 
5235 		/* All set, put it to list */
5236 		txd->flags |= HN_TXD_FLAG_ONLIST;
5237 #ifndef HN_USE_TXDESC_BUFRING
5238 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5239 #else
5240 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5241 #endif
5242 	}
5243 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5244 
5245 	if (sc->hn_tx_sysctl_tree != NULL) {
5246 		struct sysctl_oid_list *child;
5247 		struct sysctl_ctx_list *ctx;
5248 		char name[16];
5249 
5250 		/*
5251 		 * Create per TX ring sysctl tree:
5252 		 * dev.hn.UNIT.tx.RINGID
5253 		 */
5254 		ctx = device_get_sysctl_ctx(dev);
5255 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5256 
5257 		snprintf(name, sizeof(name), "%d", id);
5258 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5259 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5260 
5261 		if (txr->hn_tx_sysctl_tree != NULL) {
5262 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5263 
5264 #ifdef HN_DEBUG
5265 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5266 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5267 			    "# of available TX descs");
5268 #endif
5269 #ifdef HN_IFSTART_SUPPORT
5270 			if (!hn_use_if_start)
5271 #endif
5272 			{
5273 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5274 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5275 				    "over active");
5276 			}
5277 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5278 			    CTLFLAG_RW, &txr->hn_pkts,
5279 			    "# of packets transmitted");
5280 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5281 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5282 		}
5283 	}
5284 
5285 	return 0;
5286 }
5287 
5288 static void
5289 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5290 {
5291 	struct hn_tx_ring *txr = txd->txr;
5292 
5293 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5294 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5295 
5296 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5297 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5298 	    txd->rndis_pkt_dmap);
5299 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5300 }
5301 
5302 static void
5303 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5304 {
5305 
5306 	KASSERT(txd->refs == 0 || txd->refs == 1,
5307 	    ("invalid txd refs %d", txd->refs));
5308 
5309 	/* Aggregated txds will be freed by their aggregating txd. */
5310 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5311 		int freed;
5312 
5313 		freed = hn_txdesc_put(txr, txd);
5314 		KASSERT(freed, ("can't free txdesc"));
5315 	}
5316 }
5317 
5318 static void
5319 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5320 {
5321 	int i;
5322 
5323 	if (txr->hn_txdesc == NULL)
5324 		return;
5325 
5326 	/*
5327 	 * NOTE:
5328 	 * Because the freeing of aggregated txds will be deferred
5329 	 * to the aggregating txd, two passes are used here:
5330 	 * - The first pass GCes any pending txds.  This GC is necessary,
5331 	 *   since if the channels are revoked, hypervisor will not
5332 	 *   deliver send-done for all pending txds.
5333 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5334 	 *   were freed.
5335 	 */
5336 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5337 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5338 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5339 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5340 
5341 	if (txr->hn_tx_data_dtag != NULL)
5342 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5343 	if (txr->hn_tx_rndis_dtag != NULL)
5344 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5345 
5346 #ifdef HN_USE_TXDESC_BUFRING
5347 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5348 #endif
5349 
5350 	free(txr->hn_txdesc, M_DEVBUF);
5351 	txr->hn_txdesc = NULL;
5352 
5353 	if (txr->hn_mbuf_br != NULL)
5354 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5355 
5356 #ifndef HN_USE_TXDESC_BUFRING
5357 	mtx_destroy(&txr->hn_txlist_spin);
5358 #endif
5359 	mtx_destroy(&txr->hn_tx_lock);
5360 }
5361 
5362 static int
5363 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5364 {
5365 	struct sysctl_oid_list *child;
5366 	struct sysctl_ctx_list *ctx;
5367 	int i;
5368 
5369 	/*
5370 	 * Create TXBUF for chimney sending.
5371 	 *
5372 	 * NOTE: It is shared by all channels.
5373 	 */
5374 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5375 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5376 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5377 	if (sc->hn_chim == NULL) {
5378 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5379 		return (ENOMEM);
5380 	}
5381 
5382 	sc->hn_tx_ring_cnt = ring_cnt;
5383 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5384 
5385 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5386 	    M_DEVBUF, M_WAITOK | M_ZERO);
5387 
5388 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5389 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5390 
5391 	/* Create dev.hn.UNIT.tx sysctl tree */
5392 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5393 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5394 
5395 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5396 		int error;
5397 
5398 		error = hn_tx_ring_create(sc, i);
5399 		if (error)
5400 			return error;
5401 	}
5402 
5403 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5404 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5405 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5406 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5407 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5408 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5409 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5410 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5411 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5412 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5413 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5414 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5415 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5416 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5417 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5418 	    hn_tx_stat_ulong_sysctl, "LU",
5419 	    "# of packet transmission aggregation flush failure");
5420 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5421 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5422 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5423 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5424 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5425 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5426 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5427 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5428 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5429 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5430 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5431 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5432 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5433 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5434 	    "# of total TX descs");
5435 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5436 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5437 	    "Chimney send packet size upper boundary");
5438 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5439 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5440 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5441 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5442 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5443 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5444 	    hn_tx_conf_int_sysctl, "I",
5445 	    "Size of the packet for direct transmission");
5446 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5447 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5448 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5449 	    hn_tx_conf_int_sysctl, "I",
5450 	    "Always schedule transmission "
5451 	    "instead of doing direct transmission");
5452 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5453 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5454 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5455 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5456 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5457 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5458 	    "Applied packet transmission aggregation size");
5459 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5460 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5461 	    hn_txagg_pktmax_sysctl, "I",
5462 	    "Applied packet transmission aggregation packets");
5463 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5464 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5465 	    hn_txagg_align_sysctl, "I",
5466 	    "Applied packet transmission aggregation alignment");
5467 
5468 	return 0;
5469 }
5470 
5471 static void
5472 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5473 {
5474 	int i;
5475 
5476 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5477 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5478 }
5479 
5480 static void
5481 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5482 {
5483 	struct ifnet *ifp = sc->hn_ifp;
5484 	u_int hw_tsomax;
5485 	int tso_minlen;
5486 
5487 	HN_LOCK_ASSERT(sc);
5488 
5489 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5490 		return;
5491 
5492 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5493 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5494 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5495 
5496 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5497 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5498 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5499 
5500 	if (tso_maxlen < tso_minlen)
5501 		tso_maxlen = tso_minlen;
5502 	else if (tso_maxlen > IP_MAXPACKET)
5503 		tso_maxlen = IP_MAXPACKET;
5504 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5505 		tso_maxlen = sc->hn_ndis_tso_szmax;
5506 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5507 
5508 	if (hn_xpnt_vf_isready(sc)) {
5509 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5510 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5511 	}
5512 	ifp->if_hw_tsomax = hw_tsomax;
5513 	if (bootverbose)
5514 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5515 }
5516 
5517 static void
5518 hn_fixup_tx_data(struct hn_softc *sc)
5519 {
5520 	uint64_t csum_assist;
5521 	int i;
5522 
5523 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5524 	if (hn_tx_chimney_size > 0 &&
5525 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5526 		hn_set_chim_size(sc, hn_tx_chimney_size);
5527 
5528 	csum_assist = 0;
5529 	if (sc->hn_caps & HN_CAP_IPCS)
5530 		csum_assist |= CSUM_IP;
5531 	if (sc->hn_caps & HN_CAP_TCP4CS)
5532 		csum_assist |= CSUM_IP_TCP;
5533 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5534 		csum_assist |= CSUM_IP_UDP;
5535 	if (sc->hn_caps & HN_CAP_TCP6CS)
5536 		csum_assist |= CSUM_IP6_TCP;
5537 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5538 		csum_assist |= CSUM_IP6_UDP;
5539 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5540 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5541 
5542 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5543 		/*
5544 		 * Support HASHVAL pktinfo on TX path.
5545 		 */
5546 		if (bootverbose)
5547 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5548 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5549 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5550 	}
5551 }
5552 
5553 static void
5554 hn_destroy_tx_data(struct hn_softc *sc)
5555 {
5556 	int i;
5557 
5558 	if (sc->hn_chim != NULL) {
5559 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5560 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5561 		} else {
5562 			device_printf(sc->hn_dev,
5563 			    "chimney sending buffer is referenced");
5564 		}
5565 		sc->hn_chim = NULL;
5566 	}
5567 
5568 	if (sc->hn_tx_ring_cnt == 0)
5569 		return;
5570 
5571 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5572 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5573 
5574 	free(sc->hn_tx_ring, M_DEVBUF);
5575 	sc->hn_tx_ring = NULL;
5576 
5577 	sc->hn_tx_ring_cnt = 0;
5578 	sc->hn_tx_ring_inuse = 0;
5579 }
5580 
5581 #ifdef HN_IFSTART_SUPPORT
5582 
5583 static void
5584 hn_start_taskfunc(void *xtxr, int pending __unused)
5585 {
5586 	struct hn_tx_ring *txr = xtxr;
5587 
5588 	mtx_lock(&txr->hn_tx_lock);
5589 	hn_start_locked(txr, 0);
5590 	mtx_unlock(&txr->hn_tx_lock);
5591 }
5592 
5593 static int
5594 hn_start_locked(struct hn_tx_ring *txr, int len)
5595 {
5596 	struct hn_softc *sc = txr->hn_sc;
5597 	struct ifnet *ifp = sc->hn_ifp;
5598 	int sched = 0;
5599 
5600 	KASSERT(hn_use_if_start,
5601 	    ("hn_start_locked is called, when if_start is disabled"));
5602 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5603 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5604 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5605 
5606 	if (__predict_false(txr->hn_suspended))
5607 		return (0);
5608 
5609 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5610 	    IFF_DRV_RUNNING)
5611 		return (0);
5612 
5613 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5614 		struct hn_txdesc *txd;
5615 		struct mbuf *m_head;
5616 		int error;
5617 
5618 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5619 		if (m_head == NULL)
5620 			break;
5621 
5622 		if (len > 0 && m_head->m_pkthdr.len > len) {
5623 			/*
5624 			 * This sending could be time consuming; let callers
5625 			 * dispatch this packet sending (and sending of any
5626 			 * following up packets) to tx taskqueue.
5627 			 */
5628 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5629 			sched = 1;
5630 			break;
5631 		}
5632 
5633 #if defined(INET6) || defined(INET)
5634 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5635 			m_head = hn_tso_fixup(m_head);
5636 			if (__predict_false(m_head == NULL)) {
5637 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5638 				continue;
5639 			}
5640 		} else if (m_head->m_pkthdr.csum_flags &
5641 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5642 			m_head = hn_set_hlen(m_head);
5643 			if (__predict_false(m_head == NULL)) {
5644 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5645 				continue;
5646 			}
5647 		}
5648 #endif
5649 
5650 		txd = hn_txdesc_get(txr);
5651 		if (txd == NULL) {
5652 			txr->hn_no_txdescs++;
5653 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5654 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5655 			break;
5656 		}
5657 
5658 		error = hn_encap(ifp, txr, txd, &m_head);
5659 		if (error) {
5660 			/* Both txd and m_head are freed */
5661 			KASSERT(txr->hn_agg_txd == NULL,
5662 			    ("encap failed w/ pending aggregating txdesc"));
5663 			continue;
5664 		}
5665 
5666 		if (txr->hn_agg_pktleft == 0) {
5667 			if (txr->hn_agg_txd != NULL) {
5668 				KASSERT(m_head == NULL,
5669 				    ("pending mbuf for aggregating txdesc"));
5670 				error = hn_flush_txagg(ifp, txr);
5671 				if (__predict_false(error)) {
5672 					atomic_set_int(&ifp->if_drv_flags,
5673 					    IFF_DRV_OACTIVE);
5674 					break;
5675 				}
5676 			} else {
5677 				KASSERT(m_head != NULL, ("mbuf was freed"));
5678 				error = hn_txpkt(ifp, txr, txd);
5679 				if (__predict_false(error)) {
5680 					/* txd is freed, but m_head is not */
5681 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5682 					atomic_set_int(&ifp->if_drv_flags,
5683 					    IFF_DRV_OACTIVE);
5684 					break;
5685 				}
5686 			}
5687 		}
5688 #ifdef INVARIANTS
5689 		else {
5690 			KASSERT(txr->hn_agg_txd != NULL,
5691 			    ("no aggregating txdesc"));
5692 			KASSERT(m_head == NULL,
5693 			    ("pending mbuf for aggregating txdesc"));
5694 		}
5695 #endif
5696 	}
5697 
5698 	/* Flush pending aggerated transmission. */
5699 	if (txr->hn_agg_txd != NULL)
5700 		hn_flush_txagg(ifp, txr);
5701 	return (sched);
5702 }
5703 
5704 static void
5705 hn_start(struct ifnet *ifp)
5706 {
5707 	struct hn_softc *sc = ifp->if_softc;
5708 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5709 
5710 	if (txr->hn_sched_tx)
5711 		goto do_sched;
5712 
5713 	if (mtx_trylock(&txr->hn_tx_lock)) {
5714 		int sched;
5715 
5716 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5717 		mtx_unlock(&txr->hn_tx_lock);
5718 		if (!sched)
5719 			return;
5720 	}
5721 do_sched:
5722 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5723 }
5724 
5725 static void
5726 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5727 {
5728 	struct hn_tx_ring *txr = xtxr;
5729 
5730 	mtx_lock(&txr->hn_tx_lock);
5731 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5732 	hn_start_locked(txr, 0);
5733 	mtx_unlock(&txr->hn_tx_lock);
5734 }
5735 
5736 static void
5737 hn_start_txeof(struct hn_tx_ring *txr)
5738 {
5739 	struct hn_softc *sc = txr->hn_sc;
5740 	struct ifnet *ifp = sc->hn_ifp;
5741 
5742 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5743 
5744 	if (txr->hn_sched_tx)
5745 		goto do_sched;
5746 
5747 	if (mtx_trylock(&txr->hn_tx_lock)) {
5748 		int sched;
5749 
5750 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5751 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5752 		mtx_unlock(&txr->hn_tx_lock);
5753 		if (sched) {
5754 			taskqueue_enqueue(txr->hn_tx_taskq,
5755 			    &txr->hn_tx_task);
5756 		}
5757 	} else {
5758 do_sched:
5759 		/*
5760 		 * Release the OACTIVE earlier, with the hope, that
5761 		 * others could catch up.  The task will clear the
5762 		 * flag again with the hn_tx_lock to avoid possible
5763 		 * races.
5764 		 */
5765 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5766 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5767 	}
5768 }
5769 
5770 #endif	/* HN_IFSTART_SUPPORT */
5771 
5772 static int
5773 hn_xmit(struct hn_tx_ring *txr, int len)
5774 {
5775 	struct hn_softc *sc = txr->hn_sc;
5776 	struct ifnet *ifp = sc->hn_ifp;
5777 	struct mbuf *m_head;
5778 	int sched = 0;
5779 
5780 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5781 #ifdef HN_IFSTART_SUPPORT
5782 	KASSERT(hn_use_if_start == 0,
5783 	    ("hn_xmit is called, when if_start is enabled"));
5784 #endif
5785 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5786 
5787 	if (__predict_false(txr->hn_suspended))
5788 		return (0);
5789 
5790 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5791 		return (0);
5792 
5793 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5794 		struct hn_txdesc *txd;
5795 		int error;
5796 
5797 		if (len > 0 && m_head->m_pkthdr.len > len) {
5798 			/*
5799 			 * This sending could be time consuming; let callers
5800 			 * dispatch this packet sending (and sending of any
5801 			 * following up packets) to tx taskqueue.
5802 			 */
5803 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5804 			sched = 1;
5805 			break;
5806 		}
5807 
5808 		txd = hn_txdesc_get(txr);
5809 		if (txd == NULL) {
5810 			txr->hn_no_txdescs++;
5811 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5812 			txr->hn_oactive = 1;
5813 			break;
5814 		}
5815 
5816 		error = hn_encap(ifp, txr, txd, &m_head);
5817 		if (error) {
5818 			/* Both txd and m_head are freed; discard */
5819 			KASSERT(txr->hn_agg_txd == NULL,
5820 			    ("encap failed w/ pending aggregating txdesc"));
5821 			drbr_advance(ifp, txr->hn_mbuf_br);
5822 			continue;
5823 		}
5824 
5825 		if (txr->hn_agg_pktleft == 0) {
5826 			if (txr->hn_agg_txd != NULL) {
5827 				KASSERT(m_head == NULL,
5828 				    ("pending mbuf for aggregating txdesc"));
5829 				error = hn_flush_txagg(ifp, txr);
5830 				if (__predict_false(error)) {
5831 					txr->hn_oactive = 1;
5832 					break;
5833 				}
5834 			} else {
5835 				KASSERT(m_head != NULL, ("mbuf was freed"));
5836 				error = hn_txpkt(ifp, txr, txd);
5837 				if (__predict_false(error)) {
5838 					/* txd is freed, but m_head is not */
5839 					drbr_putback(ifp, txr->hn_mbuf_br,
5840 					    m_head);
5841 					txr->hn_oactive = 1;
5842 					break;
5843 				}
5844 			}
5845 		}
5846 #ifdef INVARIANTS
5847 		else {
5848 			KASSERT(txr->hn_agg_txd != NULL,
5849 			    ("no aggregating txdesc"));
5850 			KASSERT(m_head == NULL,
5851 			    ("pending mbuf for aggregating txdesc"));
5852 		}
5853 #endif
5854 
5855 		/* Sent */
5856 		drbr_advance(ifp, txr->hn_mbuf_br);
5857 	}
5858 
5859 	/* Flush pending aggerated transmission. */
5860 	if (txr->hn_agg_txd != NULL)
5861 		hn_flush_txagg(ifp, txr);
5862 	return (sched);
5863 }
5864 
5865 static int
5866 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5867 {
5868 	struct hn_softc *sc = ifp->if_softc;
5869 	struct hn_tx_ring *txr;
5870 	int error, idx = 0;
5871 
5872 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5873 		struct rm_priotracker pt;
5874 
5875 		rm_rlock(&sc->hn_vf_lock, &pt);
5876 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5877 			struct mbuf *m_bpf = NULL;
5878 			int obytes, omcast;
5879 
5880 			obytes = m->m_pkthdr.len;
5881 			if (m->m_flags & M_MCAST)
5882 				omcast = 1;
5883 
5884 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5885 				if (bpf_peers_present(ifp->if_bpf)) {
5886 					m_bpf = m_copypacket(m, M_NOWAIT);
5887 					if (m_bpf == NULL) {
5888 						/*
5889 						 * Failed to grab a shallow
5890 						 * copy; tap now.
5891 						 */
5892 						ETHER_BPF_MTAP(ifp, m);
5893 					}
5894 				}
5895 			} else {
5896 				ETHER_BPF_MTAP(ifp, m);
5897 			}
5898 
5899 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5900 			rm_runlock(&sc->hn_vf_lock, &pt);
5901 
5902 			if (m_bpf != NULL) {
5903 				if (!error)
5904 					ETHER_BPF_MTAP(ifp, m_bpf);
5905 				m_freem(m_bpf);
5906 			}
5907 
5908 			if (error == ENOBUFS) {
5909 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5910 			} else if (error) {
5911 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5912 			} else {
5913 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5914 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5915 				if (omcast) {
5916 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5917 					    omcast);
5918 				}
5919 			}
5920 			return (error);
5921 		}
5922 		rm_runlock(&sc->hn_vf_lock, &pt);
5923 	}
5924 
5925 #if defined(INET6) || defined(INET)
5926 	/*
5927 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5928 	 * since packet headers should be cache-hot.
5929 	 */
5930 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5931 		m = hn_tso_fixup(m);
5932 		if (__predict_false(m == NULL)) {
5933 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5934 			return EIO;
5935 		}
5936 	} else if (m->m_pkthdr.csum_flags &
5937 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5938 		m = hn_set_hlen(m);
5939 		if (__predict_false(m == NULL)) {
5940 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5941 			return EIO;
5942 		}
5943 	}
5944 #endif
5945 
5946 	/*
5947 	 * Select the TX ring based on flowid
5948 	 */
5949 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5950 #ifdef RSS
5951 		uint32_t bid;
5952 
5953 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5954 		    &bid) == 0)
5955 			idx = bid % sc->hn_tx_ring_inuse;
5956 		else
5957 #endif
5958 		{
5959 #if defined(INET6) || defined(INET)
5960 			int tcpsyn = 0;
5961 
5962 			if (m->m_pkthdr.len < 128 &&
5963 			    (m->m_pkthdr.csum_flags &
5964 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5965 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5966 				m = hn_check_tcpsyn(m, &tcpsyn);
5967 				if (__predict_false(m == NULL)) {
5968 					if_inc_counter(ifp,
5969 					    IFCOUNTER_OERRORS, 1);
5970 					return (EIO);
5971 				}
5972 			}
5973 #else
5974 			const int tcpsyn = 0;
5975 #endif
5976 			if (tcpsyn)
5977 				idx = 0;
5978 			else
5979 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5980 		}
5981 	}
5982 	txr = &sc->hn_tx_ring[idx];
5983 
5984 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5985 	if (error) {
5986 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5987 		return error;
5988 	}
5989 
5990 	if (txr->hn_oactive)
5991 		return 0;
5992 
5993 	if (txr->hn_sched_tx)
5994 		goto do_sched;
5995 
5996 	if (mtx_trylock(&txr->hn_tx_lock)) {
5997 		int sched;
5998 
5999 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6000 		mtx_unlock(&txr->hn_tx_lock);
6001 		if (!sched)
6002 			return 0;
6003 	}
6004 do_sched:
6005 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6006 	return 0;
6007 }
6008 
6009 static void
6010 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6011 {
6012 	struct mbuf *m;
6013 
6014 	mtx_lock(&txr->hn_tx_lock);
6015 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6016 		m_freem(m);
6017 	mtx_unlock(&txr->hn_tx_lock);
6018 }
6019 
6020 static void
6021 hn_xmit_qflush(struct ifnet *ifp)
6022 {
6023 	struct hn_softc *sc = ifp->if_softc;
6024 	struct rm_priotracker pt;
6025 	int i;
6026 
6027 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6028 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6029 	if_qflush(ifp);
6030 
6031 	rm_rlock(&sc->hn_vf_lock, &pt);
6032 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6033 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6034 	rm_runlock(&sc->hn_vf_lock, &pt);
6035 }
6036 
6037 static void
6038 hn_xmit_txeof(struct hn_tx_ring *txr)
6039 {
6040 
6041 	if (txr->hn_sched_tx)
6042 		goto do_sched;
6043 
6044 	if (mtx_trylock(&txr->hn_tx_lock)) {
6045 		int sched;
6046 
6047 		txr->hn_oactive = 0;
6048 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6049 		mtx_unlock(&txr->hn_tx_lock);
6050 		if (sched) {
6051 			taskqueue_enqueue(txr->hn_tx_taskq,
6052 			    &txr->hn_tx_task);
6053 		}
6054 	} else {
6055 do_sched:
6056 		/*
6057 		 * Release the oactive earlier, with the hope, that
6058 		 * others could catch up.  The task will clear the
6059 		 * oactive again with the hn_tx_lock to avoid possible
6060 		 * races.
6061 		 */
6062 		txr->hn_oactive = 0;
6063 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6064 	}
6065 }
6066 
6067 static void
6068 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6069 {
6070 	struct hn_tx_ring *txr = xtxr;
6071 
6072 	mtx_lock(&txr->hn_tx_lock);
6073 	hn_xmit(txr, 0);
6074 	mtx_unlock(&txr->hn_tx_lock);
6075 }
6076 
6077 static void
6078 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6079 {
6080 	struct hn_tx_ring *txr = xtxr;
6081 
6082 	mtx_lock(&txr->hn_tx_lock);
6083 	txr->hn_oactive = 0;
6084 	hn_xmit(txr, 0);
6085 	mtx_unlock(&txr->hn_tx_lock);
6086 }
6087 
6088 static int
6089 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6090 {
6091 	struct vmbus_chan_br cbr;
6092 	struct hn_rx_ring *rxr;
6093 	struct hn_tx_ring *txr = NULL;
6094 	int idx, error;
6095 
6096 	idx = vmbus_chan_subidx(chan);
6097 
6098 	/*
6099 	 * Link this channel to RX/TX ring.
6100 	 */
6101 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6102 	    ("invalid channel index %d, should > 0 && < %d",
6103 	     idx, sc->hn_rx_ring_inuse));
6104 	rxr = &sc->hn_rx_ring[idx];
6105 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6106 	    ("RX ring %d already attached", idx));
6107 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6108 	rxr->hn_chan = chan;
6109 
6110 	if (bootverbose) {
6111 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6112 		    idx, vmbus_chan_id(chan));
6113 	}
6114 
6115 	if (idx < sc->hn_tx_ring_inuse) {
6116 		txr = &sc->hn_tx_ring[idx];
6117 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6118 		    ("TX ring %d already attached", idx));
6119 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6120 
6121 		txr->hn_chan = chan;
6122 		if (bootverbose) {
6123 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6124 			    idx, vmbus_chan_id(chan));
6125 		}
6126 	}
6127 
6128 	/* Bind this channel to a proper CPU. */
6129 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6130 
6131 	/*
6132 	 * Open this channel
6133 	 */
6134 	cbr.cbr = rxr->hn_br;
6135 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6136 	cbr.cbr_txsz = HN_TXBR_SIZE;
6137 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6138 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6139 	if (error) {
6140 		if (error == EISCONN) {
6141 			if_printf(sc->hn_ifp, "bufring is connected after "
6142 			    "chan%u open failure\n", vmbus_chan_id(chan));
6143 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6144 		} else {
6145 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6146 			    vmbus_chan_id(chan), error);
6147 		}
6148 	}
6149 	return (error);
6150 }
6151 
6152 static void
6153 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6154 {
6155 	struct hn_rx_ring *rxr;
6156 	int idx, error;
6157 
6158 	idx = vmbus_chan_subidx(chan);
6159 
6160 	/*
6161 	 * Link this channel to RX/TX ring.
6162 	 */
6163 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6164 	    ("invalid channel index %d, should > 0 && < %d",
6165 	     idx, sc->hn_rx_ring_inuse));
6166 	rxr = &sc->hn_rx_ring[idx];
6167 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6168 	    ("RX ring %d is not attached", idx));
6169 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6170 
6171 	if (idx < sc->hn_tx_ring_inuse) {
6172 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6173 
6174 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6175 		    ("TX ring %d is not attached attached", idx));
6176 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6177 	}
6178 
6179 	/*
6180 	 * Close this channel.
6181 	 *
6182 	 * NOTE:
6183 	 * Channel closing does _not_ destroy the target channel.
6184 	 */
6185 	error = vmbus_chan_close_direct(chan);
6186 	if (error == EISCONN) {
6187 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6188 		    "after being closed\n", vmbus_chan_id(chan));
6189 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6190 	} else if (error) {
6191 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6192 		    vmbus_chan_id(chan), error);
6193 	}
6194 }
6195 
6196 static int
6197 hn_attach_subchans(struct hn_softc *sc)
6198 {
6199 	struct vmbus_channel **subchans;
6200 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6201 	int i, error = 0;
6202 
6203 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6204 
6205 	/* Attach the sub-channels. */
6206 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6207 	for (i = 0; i < subchan_cnt; ++i) {
6208 		int error1;
6209 
6210 		error1 = hn_chan_attach(sc, subchans[i]);
6211 		if (error1) {
6212 			error = error1;
6213 			/* Move on; all channels will be detached later. */
6214 		}
6215 	}
6216 	vmbus_subchan_rel(subchans, subchan_cnt);
6217 
6218 	if (error) {
6219 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6220 	} else {
6221 		if (bootverbose) {
6222 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6223 			    subchan_cnt);
6224 		}
6225 	}
6226 	return (error);
6227 }
6228 
6229 static void
6230 hn_detach_allchans(struct hn_softc *sc)
6231 {
6232 	struct vmbus_channel **subchans;
6233 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6234 	int i;
6235 
6236 	if (subchan_cnt == 0)
6237 		goto back;
6238 
6239 	/* Detach the sub-channels. */
6240 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6241 	for (i = 0; i < subchan_cnt; ++i)
6242 		hn_chan_detach(sc, subchans[i]);
6243 	vmbus_subchan_rel(subchans, subchan_cnt);
6244 
6245 back:
6246 	/*
6247 	 * Detach the primary channel, _after_ all sub-channels
6248 	 * are detached.
6249 	 */
6250 	hn_chan_detach(sc, sc->hn_prichan);
6251 
6252 	/* Wait for sub-channels to be destroyed, if any. */
6253 	vmbus_subchan_drain(sc->hn_prichan);
6254 
6255 #ifdef INVARIANTS
6256 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6257 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6258 		    HN_RX_FLAG_ATTACHED) == 0,
6259 		    ("%dth RX ring is still attached", i));
6260 	}
6261 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6262 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6263 		    HN_TX_FLAG_ATTACHED) == 0,
6264 		    ("%dth TX ring is still attached", i));
6265 	}
6266 #endif
6267 }
6268 
6269 static int
6270 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6271 {
6272 	struct vmbus_channel **subchans;
6273 	int nchan, rxr_cnt, error;
6274 
6275 	nchan = *nsubch + 1;
6276 	if (nchan == 1) {
6277 		/*
6278 		 * Multiple RX/TX rings are not requested.
6279 		 */
6280 		*nsubch = 0;
6281 		return (0);
6282 	}
6283 
6284 	/*
6285 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6286 	 * table entries.
6287 	 */
6288 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6289 	if (error) {
6290 		/* No RSS; this is benign. */
6291 		*nsubch = 0;
6292 		return (0);
6293 	}
6294 	if (bootverbose) {
6295 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6296 		    rxr_cnt, nchan);
6297 	}
6298 
6299 	if (nchan > rxr_cnt)
6300 		nchan = rxr_cnt;
6301 	if (nchan == 1) {
6302 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6303 		*nsubch = 0;
6304 		return (0);
6305 	}
6306 
6307 	/*
6308 	 * Allocate sub-channels from NVS.
6309 	 */
6310 	*nsubch = nchan - 1;
6311 	error = hn_nvs_alloc_subchans(sc, nsubch);
6312 	if (error || *nsubch == 0) {
6313 		/* Failed to allocate sub-channels. */
6314 		*nsubch = 0;
6315 		return (0);
6316 	}
6317 
6318 	/*
6319 	 * Wait for all sub-channels to become ready before moving on.
6320 	 */
6321 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6322 	vmbus_subchan_rel(subchans, *nsubch);
6323 	return (0);
6324 }
6325 
6326 static bool
6327 hn_synth_attachable(const struct hn_softc *sc)
6328 {
6329 	int i;
6330 
6331 	if (sc->hn_flags & HN_FLAG_ERRORS)
6332 		return (false);
6333 
6334 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6335 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6336 
6337 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6338 			return (false);
6339 	}
6340 	return (true);
6341 }
6342 
6343 /*
6344  * Make sure that the RX filter is zero after the successful
6345  * RNDIS initialization.
6346  *
6347  * NOTE:
6348  * Under certain conditions on certain versions of Hyper-V,
6349  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6350  * after the successful RNDIS initialization, which breaks
6351  * the assumption of any following code (well, it breaks the
6352  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6353  * explicitly, drain packets sneaking through, and drain the
6354  * interrupt taskqueues scheduled due to the stealth packets.
6355  */
6356 static void
6357 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6358 {
6359 
6360 	hn_disable_rx(sc);
6361 	hn_drain_rxtx(sc, nchan);
6362 }
6363 
6364 static int
6365 hn_synth_attach(struct hn_softc *sc, int mtu)
6366 {
6367 #define ATTACHED_NVS		0x0002
6368 #define ATTACHED_RNDIS		0x0004
6369 
6370 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6371 	int error, nsubch, nchan = 1, i, rndis_inited;
6372 	uint32_t old_caps, attached = 0;
6373 
6374 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6375 	    ("synthetic parts were attached"));
6376 
6377 	if (!hn_synth_attachable(sc))
6378 		return (ENXIO);
6379 
6380 	/* Save capabilities for later verification. */
6381 	old_caps = sc->hn_caps;
6382 	sc->hn_caps = 0;
6383 
6384 	/* Clear RSS stuffs. */
6385 	sc->hn_rss_ind_size = 0;
6386 	sc->hn_rss_hash = 0;
6387 	sc->hn_rss_hcap = 0;
6388 
6389 	/*
6390 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6391 	 */
6392 	error = hn_chan_attach(sc, sc->hn_prichan);
6393 	if (error)
6394 		goto failed;
6395 
6396 	/*
6397 	 * Attach NVS.
6398 	 */
6399 	error = hn_nvs_attach(sc, mtu);
6400 	if (error)
6401 		goto failed;
6402 	attached |= ATTACHED_NVS;
6403 
6404 	/*
6405 	 * Attach RNDIS _after_ NVS is attached.
6406 	 */
6407 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6408 	if (rndis_inited)
6409 		attached |= ATTACHED_RNDIS;
6410 	if (error)
6411 		goto failed;
6412 
6413 	/*
6414 	 * Make sure capabilities are not changed.
6415 	 */
6416 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6417 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6418 		    old_caps, sc->hn_caps);
6419 		error = ENXIO;
6420 		goto failed;
6421 	}
6422 
6423 	/*
6424 	 * Allocate sub-channels for multi-TX/RX rings.
6425 	 *
6426 	 * NOTE:
6427 	 * The # of RX rings that can be used is equivalent to the # of
6428 	 * channels to be requested.
6429 	 */
6430 	nsubch = sc->hn_rx_ring_cnt - 1;
6431 	error = hn_synth_alloc_subchans(sc, &nsubch);
6432 	if (error)
6433 		goto failed;
6434 	/* NOTE: _Full_ synthetic parts detach is required now. */
6435 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6436 
6437 	/*
6438 	 * Set the # of TX/RX rings that could be used according to
6439 	 * the # of channels that NVS offered.
6440 	 */
6441 	nchan = nsubch + 1;
6442 	hn_set_ring_inuse(sc, nchan);
6443 	if (nchan == 1) {
6444 		/* Only the primary channel can be used; done */
6445 		goto back;
6446 	}
6447 
6448 	/*
6449 	 * Attach the sub-channels.
6450 	 *
6451 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6452 	 */
6453 	error = hn_attach_subchans(sc);
6454 	if (error)
6455 		goto failed;
6456 
6457 	/*
6458 	 * Configure RSS key and indirect table _after_ all sub-channels
6459 	 * are attached.
6460 	 */
6461 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6462 		/*
6463 		 * RSS key is not set yet; set it to the default RSS key.
6464 		 */
6465 		if (bootverbose)
6466 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6467 #ifdef RSS
6468 		rss_getkey(rss->rss_key);
6469 #else
6470 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6471 #endif
6472 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6473 	}
6474 
6475 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6476 		/*
6477 		 * RSS indirect table is not set yet; set it up in round-
6478 		 * robin fashion.
6479 		 */
6480 		if (bootverbose) {
6481 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6482 			    "table\n");
6483 		}
6484 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6485 			uint32_t subidx;
6486 
6487 #ifdef RSS
6488 			subidx = rss_get_indirection_to_bucket(i);
6489 #else
6490 			subidx = i;
6491 #endif
6492 			rss->rss_ind[i] = subidx % nchan;
6493 		}
6494 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6495 	} else {
6496 		/*
6497 		 * # of usable channels may be changed, so we have to
6498 		 * make sure that all entries in RSS indirect table
6499 		 * are valid.
6500 		 *
6501 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6502 		 */
6503 		hn_rss_ind_fixup(sc);
6504 	}
6505 
6506 	sc->hn_rss_hash = sc->hn_rss_hcap;
6507 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6508 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6509 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6510 		hn_vf_rss_fixup(sc, false);
6511 	}
6512 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6513 	if (error)
6514 		goto failed;
6515 back:
6516 	/*
6517 	 * Fixup transmission aggregation setup.
6518 	 */
6519 	hn_set_txagg(sc);
6520 	hn_rndis_init_fixat(sc, nchan);
6521 	return (0);
6522 
6523 failed:
6524 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6525 		hn_rndis_init_fixat(sc, nchan);
6526 		hn_synth_detach(sc);
6527 	} else {
6528 		if (attached & ATTACHED_RNDIS) {
6529 			hn_rndis_init_fixat(sc, nchan);
6530 			hn_rndis_detach(sc);
6531 		}
6532 		if (attached & ATTACHED_NVS)
6533 			hn_nvs_detach(sc);
6534 		hn_chan_detach(sc, sc->hn_prichan);
6535 		/* Restore old capabilities. */
6536 		sc->hn_caps = old_caps;
6537 	}
6538 	return (error);
6539 
6540 #undef ATTACHED_RNDIS
6541 #undef ATTACHED_NVS
6542 }
6543 
6544 /*
6545  * NOTE:
6546  * The interface must have been suspended though hn_suspend(), before
6547  * this function get called.
6548  */
6549 static void
6550 hn_synth_detach(struct hn_softc *sc)
6551 {
6552 
6553 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6554 	    ("synthetic parts were not attached"));
6555 
6556 	/* Detach the RNDIS first. */
6557 	hn_rndis_detach(sc);
6558 
6559 	/* Detach NVS. */
6560 	hn_nvs_detach(sc);
6561 
6562 	/* Detach all of the channels. */
6563 	hn_detach_allchans(sc);
6564 
6565 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6566 }
6567 
6568 static void
6569 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6570 {
6571 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6572 	    ("invalid ring count %d", ring_cnt));
6573 
6574 	if (sc->hn_tx_ring_cnt > ring_cnt)
6575 		sc->hn_tx_ring_inuse = ring_cnt;
6576 	else
6577 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6578 	sc->hn_rx_ring_inuse = ring_cnt;
6579 
6580 #ifdef RSS
6581 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6582 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6583 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6584 		    rss_getnumbuckets());
6585 	}
6586 #endif
6587 
6588 	if (bootverbose) {
6589 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6590 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6591 	}
6592 }
6593 
6594 static void
6595 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6596 {
6597 
6598 	/*
6599 	 * NOTE:
6600 	 * The TX bufring will not be drained by the hypervisor,
6601 	 * if the primary channel is revoked.
6602 	 */
6603 	while (!vmbus_chan_rx_empty(chan) ||
6604 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6605 	     !vmbus_chan_tx_empty(chan)))
6606 		pause("waitch", 1);
6607 	vmbus_chan_intr_drain(chan);
6608 }
6609 
6610 static void
6611 hn_disable_rx(struct hn_softc *sc)
6612 {
6613 
6614 	/*
6615 	 * Disable RX by clearing RX filter forcefully.
6616 	 */
6617 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6618 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6619 
6620 	/*
6621 	 * Give RNDIS enough time to flush all pending data packets.
6622 	 */
6623 	pause("waitrx", (200 * hz) / 1000);
6624 }
6625 
6626 /*
6627  * NOTE:
6628  * RX/TX _must_ have been suspended/disabled, before this function
6629  * is called.
6630  */
6631 static void
6632 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6633 {
6634 	struct vmbus_channel **subch = NULL;
6635 	int nsubch;
6636 
6637 	/*
6638 	 * Drain RX/TX bufrings and interrupts.
6639 	 */
6640 	nsubch = nchan - 1;
6641 	if (nsubch > 0)
6642 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6643 
6644 	if (subch != NULL) {
6645 		int i;
6646 
6647 		for (i = 0; i < nsubch; ++i)
6648 			hn_chan_drain(sc, subch[i]);
6649 	}
6650 	hn_chan_drain(sc, sc->hn_prichan);
6651 
6652 	if (subch != NULL)
6653 		vmbus_subchan_rel(subch, nsubch);
6654 }
6655 
6656 static void
6657 hn_suspend_data(struct hn_softc *sc)
6658 {
6659 	struct hn_tx_ring *txr;
6660 	int i;
6661 
6662 	HN_LOCK_ASSERT(sc);
6663 
6664 	/*
6665 	 * Suspend TX.
6666 	 */
6667 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6668 		txr = &sc->hn_tx_ring[i];
6669 
6670 		mtx_lock(&txr->hn_tx_lock);
6671 		txr->hn_suspended = 1;
6672 		mtx_unlock(&txr->hn_tx_lock);
6673 		/* No one is able send more packets now. */
6674 
6675 		/*
6676 		 * Wait for all pending sends to finish.
6677 		 *
6678 		 * NOTE:
6679 		 * We will _not_ receive all pending send-done, if the
6680 		 * primary channel is revoked.
6681 		 */
6682 		while (hn_tx_ring_pending(txr) &&
6683 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6684 			pause("hnwtx", 1 /* 1 tick */);
6685 	}
6686 
6687 	/*
6688 	 * Disable RX.
6689 	 */
6690 	hn_disable_rx(sc);
6691 
6692 	/*
6693 	 * Drain RX/TX.
6694 	 */
6695 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6696 
6697 	/*
6698 	 * Drain any pending TX tasks.
6699 	 *
6700 	 * NOTE:
6701 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6702 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6703 	 */
6704 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6705 		txr = &sc->hn_tx_ring[i];
6706 
6707 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6708 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6709 	}
6710 }
6711 
6712 static void
6713 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6714 {
6715 
6716 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6717 }
6718 
6719 static void
6720 hn_suspend_mgmt(struct hn_softc *sc)
6721 {
6722 	struct task task;
6723 
6724 	HN_LOCK_ASSERT(sc);
6725 
6726 	/*
6727 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6728 	 * through hn_mgmt_taskq.
6729 	 */
6730 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6731 	vmbus_chan_run_task(sc->hn_prichan, &task);
6732 
6733 	/*
6734 	 * Make sure that all pending management tasks are completed.
6735 	 */
6736 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6737 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6738 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6739 }
6740 
6741 static void
6742 hn_suspend(struct hn_softc *sc)
6743 {
6744 
6745 	/* Disable polling. */
6746 	hn_polling(sc, 0);
6747 
6748 	/*
6749 	 * If the non-transparent mode VF is activated, the synthetic
6750 	 * device is receiving packets, so the data path of the
6751 	 * synthetic device must be suspended.
6752 	 */
6753 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6754 	    (sc->hn_flags & HN_FLAG_RXVF))
6755 		hn_suspend_data(sc);
6756 	hn_suspend_mgmt(sc);
6757 }
6758 
6759 static void
6760 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6761 {
6762 	int i;
6763 
6764 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6765 	    ("invalid TX ring count %d", tx_ring_cnt));
6766 
6767 	for (i = 0; i < tx_ring_cnt; ++i) {
6768 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6769 
6770 		mtx_lock(&txr->hn_tx_lock);
6771 		txr->hn_suspended = 0;
6772 		mtx_unlock(&txr->hn_tx_lock);
6773 	}
6774 }
6775 
6776 static void
6777 hn_resume_data(struct hn_softc *sc)
6778 {
6779 	int i;
6780 
6781 	HN_LOCK_ASSERT(sc);
6782 
6783 	/*
6784 	 * Re-enable RX.
6785 	 */
6786 	hn_rxfilter_config(sc);
6787 
6788 	/*
6789 	 * Make sure to clear suspend status on "all" TX rings,
6790 	 * since hn_tx_ring_inuse can be changed after
6791 	 * hn_suspend_data().
6792 	 */
6793 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6794 
6795 #ifdef HN_IFSTART_SUPPORT
6796 	if (!hn_use_if_start)
6797 #endif
6798 	{
6799 		/*
6800 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6801 		 * reduced.
6802 		 */
6803 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6804 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6805 	}
6806 
6807 	/*
6808 	 * Kick start TX.
6809 	 */
6810 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6811 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6812 
6813 		/*
6814 		 * Use txeof task, so that any pending oactive can be
6815 		 * cleared properly.
6816 		 */
6817 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6818 	}
6819 }
6820 
6821 static void
6822 hn_resume_mgmt(struct hn_softc *sc)
6823 {
6824 
6825 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6826 
6827 	/*
6828 	 * Kick off network change detection, if it was pending.
6829 	 * If no network change was pending, start link status
6830 	 * checks, which is more lightweight than network change
6831 	 * detection.
6832 	 */
6833 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6834 		hn_change_network(sc);
6835 	else
6836 		hn_update_link_status(sc);
6837 }
6838 
6839 static void
6840 hn_resume(struct hn_softc *sc)
6841 {
6842 
6843 	/*
6844 	 * If the non-transparent mode VF is activated, the synthetic
6845 	 * device have to receive packets, so the data path of the
6846 	 * synthetic device must be resumed.
6847 	 */
6848 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6849 	    (sc->hn_flags & HN_FLAG_RXVF))
6850 		hn_resume_data(sc);
6851 
6852 	/*
6853 	 * Don't resume link status change if VF is attached/activated.
6854 	 * - In the non-transparent VF mode, the synthetic device marks
6855 	 *   link down until the VF is deactivated; i.e. VF is down.
6856 	 * - In transparent VF mode, VF's media status is used until
6857 	 *   the VF is detached.
6858 	 */
6859 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6860 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6861 		hn_resume_mgmt(sc);
6862 
6863 	/*
6864 	 * Re-enable polling if this interface is running and
6865 	 * the polling is requested.
6866 	 */
6867 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6868 		hn_polling(sc, sc->hn_pollhz);
6869 }
6870 
6871 static void
6872 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6873 {
6874 	const struct rndis_status_msg *msg;
6875 	int ofs;
6876 
6877 	if (dlen < sizeof(*msg)) {
6878 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6879 		return;
6880 	}
6881 	msg = data;
6882 
6883 	switch (msg->rm_status) {
6884 	case RNDIS_STATUS_MEDIA_CONNECT:
6885 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6886 		hn_update_link_status(sc);
6887 		break;
6888 
6889 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6890 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6891 		/* Not really useful; ignore. */
6892 		break;
6893 
6894 	case RNDIS_STATUS_NETWORK_CHANGE:
6895 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6896 		if (dlen < ofs + msg->rm_stbuflen ||
6897 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6898 			if_printf(sc->hn_ifp, "network changed\n");
6899 		} else {
6900 			uint32_t change;
6901 
6902 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6903 			    sizeof(change));
6904 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6905 			    change);
6906 		}
6907 		hn_change_network(sc);
6908 		break;
6909 
6910 	default:
6911 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6912 		    msg->rm_status);
6913 		break;
6914 	}
6915 }
6916 
6917 static int
6918 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6919 {
6920 	const struct rndis_pktinfo *pi = info_data;
6921 	uint32_t mask = 0;
6922 
6923 	while (info_dlen != 0) {
6924 		const void *data;
6925 		uint32_t dlen;
6926 
6927 		if (__predict_false(info_dlen < sizeof(*pi)))
6928 			return (EINVAL);
6929 		if (__predict_false(info_dlen < pi->rm_size))
6930 			return (EINVAL);
6931 		info_dlen -= pi->rm_size;
6932 
6933 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6934 			return (EINVAL);
6935 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6936 			return (EINVAL);
6937 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6938 		data = pi->rm_data;
6939 
6940 		switch (pi->rm_type) {
6941 		case NDIS_PKTINFO_TYPE_VLAN:
6942 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6943 				return (EINVAL);
6944 			info->vlan_info = *((const uint32_t *)data);
6945 			mask |= HN_RXINFO_VLAN;
6946 			break;
6947 
6948 		case NDIS_PKTINFO_TYPE_CSUM:
6949 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6950 				return (EINVAL);
6951 			info->csum_info = *((const uint32_t *)data);
6952 			mask |= HN_RXINFO_CSUM;
6953 			break;
6954 
6955 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6956 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6957 				return (EINVAL);
6958 			info->hash_value = *((const uint32_t *)data);
6959 			mask |= HN_RXINFO_HASHVAL;
6960 			break;
6961 
6962 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6963 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6964 				return (EINVAL);
6965 			info->hash_info = *((const uint32_t *)data);
6966 			mask |= HN_RXINFO_HASHINF;
6967 			break;
6968 
6969 		default:
6970 			goto next;
6971 		}
6972 
6973 		if (mask == HN_RXINFO_ALL) {
6974 			/* All found; done */
6975 			break;
6976 		}
6977 next:
6978 		pi = (const struct rndis_pktinfo *)
6979 		    ((const uint8_t *)pi + pi->rm_size);
6980 	}
6981 
6982 	/*
6983 	 * Final fixup.
6984 	 * - If there is no hash value, invalidate the hash info.
6985 	 */
6986 	if ((mask & HN_RXINFO_HASHVAL) == 0)
6987 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6988 	return (0);
6989 }
6990 
6991 static __inline bool
6992 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6993 {
6994 
6995 	if (off < check_off) {
6996 		if (__predict_true(off + len <= check_off))
6997 			return (false);
6998 	} else if (off > check_off) {
6999 		if (__predict_true(check_off + check_len <= off))
7000 			return (false);
7001 	}
7002 	return (true);
7003 }
7004 
7005 static void
7006 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7007 {
7008 	const struct rndis_packet_msg *pkt;
7009 	struct hn_rxinfo info;
7010 	int data_off, pktinfo_off, data_len, pktinfo_len;
7011 
7012 	/*
7013 	 * Check length.
7014 	 */
7015 	if (__predict_false(dlen < sizeof(*pkt))) {
7016 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7017 		return;
7018 	}
7019 	pkt = data;
7020 
7021 	if (__predict_false(dlen < pkt->rm_len)) {
7022 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7023 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7024 		return;
7025 	}
7026 	if (__predict_false(pkt->rm_len <
7027 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7028 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7029 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7030 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7031 		    pkt->rm_pktinfolen);
7032 		return;
7033 	}
7034 	if (__predict_false(pkt->rm_datalen == 0)) {
7035 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7036 		return;
7037 	}
7038 
7039 	/*
7040 	 * Check offests.
7041 	 */
7042 #define IS_OFFSET_INVALID(ofs)			\
7043 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7044 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7045 
7046 	/* XXX Hyper-V does not meet data offset alignment requirement */
7047 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7048 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7049 		    "data offset %u\n", pkt->rm_dataoffset);
7050 		return;
7051 	}
7052 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7053 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7054 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7055 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7056 		return;
7057 	}
7058 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7059 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7060 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7061 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7062 		return;
7063 	}
7064 
7065 #undef IS_OFFSET_INVALID
7066 
7067 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7068 	data_len = pkt->rm_datalen;
7069 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7070 	pktinfo_len = pkt->rm_pktinfolen;
7071 
7072 	/*
7073 	 * Check OOB coverage.
7074 	 */
7075 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7076 		int oob_off, oob_len;
7077 
7078 		if_printf(rxr->hn_ifp, "got oobdata\n");
7079 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7080 		oob_len = pkt->rm_oobdatalen;
7081 
7082 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7083 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7084 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7085 			    pkt->rm_len, oob_off, oob_len);
7086 			return;
7087 		}
7088 
7089 		/*
7090 		 * Check against data.
7091 		 */
7092 		if (hn_rndis_check_overlap(oob_off, oob_len,
7093 		    data_off, data_len)) {
7094 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7095 			    "oob overlaps data, oob abs %d len %d, "
7096 			    "data abs %d len %d\n",
7097 			    oob_off, oob_len, data_off, data_len);
7098 			return;
7099 		}
7100 
7101 		/*
7102 		 * Check against pktinfo.
7103 		 */
7104 		if (pktinfo_len != 0 &&
7105 		    hn_rndis_check_overlap(oob_off, oob_len,
7106 		    pktinfo_off, pktinfo_len)) {
7107 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7108 			    "oob overlaps pktinfo, oob abs %d len %d, "
7109 			    "pktinfo abs %d len %d\n",
7110 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7111 			return;
7112 		}
7113 	}
7114 
7115 	/*
7116 	 * Check per-packet-info coverage and find useful per-packet-info.
7117 	 */
7118 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7119 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7120 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7121 	if (__predict_true(pktinfo_len != 0)) {
7122 		bool overlap;
7123 		int error;
7124 
7125 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7126 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7127 			    "pktinfo overflow, msglen %u, "
7128 			    "pktinfo abs %d len %d\n",
7129 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7130 			return;
7131 		}
7132 
7133 		/*
7134 		 * Check packet info coverage.
7135 		 */
7136 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7137 		    data_off, data_len);
7138 		if (__predict_false(overlap)) {
7139 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7140 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7141 			    "data abs %d len %d\n",
7142 			    pktinfo_off, pktinfo_len, data_off, data_len);
7143 			return;
7144 		}
7145 
7146 		/*
7147 		 * Find useful per-packet-info.
7148 		 */
7149 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7150 		    pktinfo_len, &info);
7151 		if (__predict_false(error)) {
7152 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7153 			    "pktinfo\n");
7154 			return;
7155 		}
7156 	}
7157 
7158 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7159 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7160 		    "data overflow, msglen %u, data abs %d len %d\n",
7161 		    pkt->rm_len, data_off, data_len);
7162 		return;
7163 	}
7164 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7165 }
7166 
7167 static __inline void
7168 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7169 {
7170 	const struct rndis_msghdr *hdr;
7171 
7172 	if (__predict_false(dlen < sizeof(*hdr))) {
7173 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7174 		return;
7175 	}
7176 	hdr = data;
7177 
7178 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7179 		/* Hot data path. */
7180 		hn_rndis_rx_data(rxr, data, dlen);
7181 		/* Done! */
7182 		return;
7183 	}
7184 
7185 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7186 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7187 	else
7188 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7189 }
7190 
7191 static void
7192 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7193 {
7194 	const struct hn_nvs_hdr *hdr;
7195 
7196 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7197 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7198 		return;
7199 	}
7200 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7201 
7202 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7203 		/* Useless; ignore */
7204 		return;
7205 	}
7206 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7207 }
7208 
7209 static void
7210 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7211     const struct vmbus_chanpkt_hdr *pkt)
7212 {
7213 	struct hn_nvs_sendctx *sndc;
7214 
7215 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7216 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7217 	    VMBUS_CHANPKT_DATALEN(pkt));
7218 	/*
7219 	 * NOTE:
7220 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7221 	 * its callback.
7222 	 */
7223 }
7224 
7225 static void
7226 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7227     const struct vmbus_chanpkt_hdr *pkthdr)
7228 {
7229 	const struct vmbus_chanpkt_rxbuf *pkt;
7230 	const struct hn_nvs_hdr *nvs_hdr;
7231 	int count, i, hlen;
7232 
7233 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7234 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7235 		return;
7236 	}
7237 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7238 
7239 	/* Make sure that this is a RNDIS message. */
7240 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7241 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7242 		    nvs_hdr->nvs_type);
7243 		return;
7244 	}
7245 
7246 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7247 	if (__predict_false(hlen < sizeof(*pkt))) {
7248 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7249 		return;
7250 	}
7251 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7252 
7253 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7254 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7255 		    pkt->cp_rxbuf_id);
7256 		return;
7257 	}
7258 
7259 	count = pkt->cp_rxbuf_cnt;
7260 	if (__predict_false(hlen <
7261 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7262 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7263 		return;
7264 	}
7265 
7266 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7267 	for (i = 0; i < count; ++i) {
7268 		int ofs, len;
7269 
7270 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7271 		len = pkt->cp_rxbuf[i].rb_len;
7272 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7273 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7274 			    "ofs %d, len %d\n", i, ofs, len);
7275 			continue;
7276 		}
7277 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7278 	}
7279 
7280 	/*
7281 	 * Ack the consumed RXBUF associated w/ this channel packet,
7282 	 * so that this RXBUF can be recycled by the hypervisor.
7283 	 */
7284 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7285 }
7286 
7287 static void
7288 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7289     uint64_t tid)
7290 {
7291 	struct hn_nvs_rndis_ack ack;
7292 	int retries, error;
7293 
7294 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7295 	ack.nvs_status = HN_NVS_STATUS_OK;
7296 
7297 	retries = 0;
7298 again:
7299 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7300 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7301 	if (__predict_false(error == EAGAIN)) {
7302 		/*
7303 		 * NOTE:
7304 		 * This should _not_ happen in real world, since the
7305 		 * consumption of the TX bufring from the TX path is
7306 		 * controlled.
7307 		 */
7308 		if (rxr->hn_ack_failed == 0)
7309 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7310 		rxr->hn_ack_failed++;
7311 		retries++;
7312 		if (retries < 10) {
7313 			DELAY(100);
7314 			goto again;
7315 		}
7316 		/* RXBUF leaks! */
7317 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7318 	}
7319 }
7320 
7321 static void
7322 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7323 {
7324 	struct hn_rx_ring *rxr = xrxr;
7325 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7326 
7327 	for (;;) {
7328 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7329 		int error, pktlen;
7330 
7331 		pktlen = rxr->hn_pktbuf_len;
7332 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7333 		if (__predict_false(error == ENOBUFS)) {
7334 			void *nbuf;
7335 			int nlen;
7336 
7337 			/*
7338 			 * Expand channel packet buffer.
7339 			 *
7340 			 * XXX
7341 			 * Use M_WAITOK here, since allocation failure
7342 			 * is fatal.
7343 			 */
7344 			nlen = rxr->hn_pktbuf_len * 2;
7345 			while (nlen < pktlen)
7346 				nlen *= 2;
7347 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7348 
7349 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7350 			    rxr->hn_pktbuf_len, nlen);
7351 
7352 			free(rxr->hn_pktbuf, M_DEVBUF);
7353 			rxr->hn_pktbuf = nbuf;
7354 			rxr->hn_pktbuf_len = nlen;
7355 			/* Retry! */
7356 			continue;
7357 		} else if (__predict_false(error == EAGAIN)) {
7358 			/* No more channel packets; done! */
7359 			break;
7360 		}
7361 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7362 
7363 		switch (pkt->cph_type) {
7364 		case VMBUS_CHANPKT_TYPE_COMP:
7365 			hn_nvs_handle_comp(sc, chan, pkt);
7366 			break;
7367 
7368 		case VMBUS_CHANPKT_TYPE_RXBUF:
7369 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7370 			break;
7371 
7372 		case VMBUS_CHANPKT_TYPE_INBAND:
7373 			hn_nvs_handle_notify(sc, pkt);
7374 			break;
7375 
7376 		default:
7377 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7378 			    pkt->cph_type);
7379 			break;
7380 		}
7381 	}
7382 	hn_chan_rollup(rxr, rxr->hn_txr);
7383 }
7384 
7385 static void
7386 hn_sysinit(void *arg __unused)
7387 {
7388 	int i;
7389 
7390 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7391 
7392 #ifdef HN_IFSTART_SUPPORT
7393 	/*
7394 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7395 	 * mainly due to the IFF_DRV_OACTIVE flag.
7396 	 */
7397 	if (hn_xpnt_vf && hn_use_if_start) {
7398 		hn_use_if_start = 0;
7399 		printf("hn: tranparent VF mode, if_transmit will be used, "
7400 		    "instead of if_start\n");
7401 	}
7402 #endif
7403 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7404 		printf("hn: invalid transparent VF attach routing "
7405 		    "wait timeout %d, reset to %d\n",
7406 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7407 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7408 	}
7409 
7410 	/*
7411 	 * Initialize VF map.
7412 	 */
7413 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7414 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7415 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7416 	    M_WAITOK | M_ZERO);
7417 
7418 	/*
7419 	 * Fix the # of TX taskqueues.
7420 	 */
7421 	if (hn_tx_taskq_cnt <= 0)
7422 		hn_tx_taskq_cnt = 1;
7423 	else if (hn_tx_taskq_cnt > mp_ncpus)
7424 		hn_tx_taskq_cnt = mp_ncpus;
7425 
7426 	/*
7427 	 * Fix the TX taskqueue mode.
7428 	 */
7429 	switch (hn_tx_taskq_mode) {
7430 	case HN_TX_TASKQ_M_INDEP:
7431 	case HN_TX_TASKQ_M_GLOBAL:
7432 	case HN_TX_TASKQ_M_EVTTQ:
7433 		break;
7434 	default:
7435 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7436 		break;
7437 	}
7438 
7439 	if (vm_guest != VM_GUEST_HV)
7440 		return;
7441 
7442 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7443 		return;
7444 
7445 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7446 	    M_DEVBUF, M_WAITOK);
7447 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7448 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7449 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7450 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7451 		    "hn tx%d", i);
7452 	}
7453 }
7454 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7455 
7456 static void
7457 hn_sysuninit(void *arg __unused)
7458 {
7459 
7460 	if (hn_tx_taskque != NULL) {
7461 		int i;
7462 
7463 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7464 			taskqueue_free(hn_tx_taskque[i]);
7465 		free(hn_tx_taskque, M_DEVBUF);
7466 	}
7467 
7468 	if (hn_vfmap != NULL)
7469 		free(hn_vfmap, M_DEVBUF);
7470 	rm_destroy(&hn_vfmap_lock);
7471 
7472 	counter_u64_free(hn_udpcs_fixup);
7473 }
7474 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7475