xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 094fc1ed0f2627525c7b0342efcbad5be7a8546a)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
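/*
 * Acquire the softc lock by polling sx_try_xlock() with a 1ms busy-wait
 * (DELAY) between attempts, instead of blocking in sx_xlock().
 */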
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
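/*
 * Map a ring index to a CPU: use the RSS bucket CPU when RSS is compiled
 * in, otherwise assign CPUs round-robin starting from sc->hn_cpu.
 */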
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
389 static int			hn_rxfilter_config(struct hn_softc *);
390 static int			hn_rss_reconfig(struct hn_softc *);
391 static void			hn_rss_ind_fixup(struct hn_softc *);
392 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
393 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
394 				    int, const struct hn_rxinfo *);
395 static uint32_t			hn_rss_type_fromndis(uint32_t);
396 static uint32_t			hn_rss_type_tondis(uint32_t);
397 
398 static int			hn_tx_ring_create(struct hn_softc *, int);
399 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
400 static int			hn_create_tx_data(struct hn_softc *, int);
401 static void			hn_fixup_tx_data(struct hn_softc *);
402 static void			hn_destroy_tx_data(struct hn_softc *);
403 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
404 static void			hn_txdesc_gc(struct hn_tx_ring *,
405 				    struct hn_txdesc *);
406 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
407 				    struct hn_txdesc *, struct mbuf **);
408 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *);
410 static void			hn_set_chim_size(struct hn_softc *, int);
411 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
412 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
413 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
414 static void			hn_resume_tx(struct hn_softc *, int);
415 static void			hn_set_txagg(struct hn_softc *);
416 static void			*hn_try_txagg(struct ifnet *,
417 				    struct hn_tx_ring *, struct hn_txdesc *,
418 				    int);
419 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
420 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
421 				    struct hn_softc *, struct vmbus_channel *,
422 				    const void *, int);
423 static int			hn_txpkt_sglist(struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static int			hn_txpkt_chim(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_xmit(struct hn_tx_ring *, int);
428 static void			hn_xmit_taskfunc(void *, int);
429 static void			hn_xmit_txeof(struct hn_tx_ring *);
430 static void			hn_xmit_txeof_taskfunc(void *, int);
431 #ifdef HN_IFSTART_SUPPORT
432 static int			hn_start_locked(struct hn_tx_ring *, int);
433 static void			hn_start_taskfunc(void *, int);
434 static void			hn_start_txeof(struct hn_tx_ring *);
435 static void			hn_start_txeof_taskfunc(void *, int);
436 #endif
437 
438 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
439     "Hyper-V network interface");
440 
441 /* Trust TCP segment verification on the host side. */
442 static int			hn_trust_hosttcp = 1;
443 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
444     &hn_trust_hosttcp, 0,
445     "Trust tcp segement verification on host side, "
446     "when csum info is missing (global setting)");
447 
448 /* Trust UDP datagram verification on the host side. */
449 static int			hn_trust_hostudp = 1;
450 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
451     &hn_trust_hostudp, 0,
452     "Trust udp datagram verification on host side, "
453     "when csum info is missing (global setting)");
454 
455 /* Trust IP packet verification on the host side. */
456 static int			hn_trust_hostip = 1;
457 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
458     &hn_trust_hostip, 0,
459     "Trust ip packet verification on host side, "
460     "when csum info is missing (global setting)");
461 
462 /*
463  * Offload UDP/IPv4 checksum.
464  */
465 static int			hn_enable_udp4cs = 1;
466 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
467     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
468 
469 /*
470  * Offload UDP/IPv6 checksum.
471  */
472 static int			hn_enable_udp6cs = 1;
473 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
474     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
475 
476 /* Stats. */
477 static counter_u64_t		hn_udpcs_fixup;
478 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
479     &hn_udpcs_fixup, "# of UDP checksum fixup");
480 
481 /*
482  * See hn_set_hlen().
483  *
484  * This value is for Azure.  For Hyper-V, set this above
485  * 65536 to disable UDP datagram checksum fixup.
486  */
487 static int			hn_udpcs_fixup_mtu = 1420;
488 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
489     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
490 
491 /* Limit TSO burst size */
492 static int			hn_tso_maxlen = IP_MAXPACKET;
493 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
494     &hn_tso_maxlen, 0, "TSO burst limit");
495 
496 /* Limit chimney send size */
497 static int			hn_tx_chimney_size = 0;
498 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
499     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
500 
501 /* Limit the size of packet for direct transmission */
502 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
503 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
504     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
505 
506 /* # of LRO entries per RX ring */
507 #if defined(INET) || defined(INET6)
508 #if __FreeBSD_version >= 1100095
509 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
510 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
511     &hn_lro_entry_count, 0, "LRO entry count");
512 #endif
513 #endif
514 
515 static int			hn_tx_taskq_cnt = 1;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
517     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
518 
519 #define HN_TX_TASKQ_M_INDEP	0
520 #define HN_TX_TASKQ_M_GLOBAL	1
521 #define HN_TX_TASKQ_M_EVTTQ	2
522 
523 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
524 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
525     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
526     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
527 
528 #ifndef HN_USE_TXDESC_BUFRING
529 static int			hn_use_txdesc_bufring = 0;
530 #else
531 static int			hn_use_txdesc_bufring = 1;
532 #endif
533 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
534     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
535 
536 #ifdef HN_IFSTART_SUPPORT
537 /* Use ifnet.if_start instead of ifnet.if_transmit */
538 static int			hn_use_if_start = 0;
539 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
540     &hn_use_if_start, 0, "Use if_start TX method");
541 #endif
542 
543 /* # of channels to use */
544 static int			hn_chan_cnt = 0;
545 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
546     &hn_chan_cnt, 0,
547     "# of channels to use; each channel has one RX ring and one TX ring");
548 
549 /* # of transmit rings to use */
550 static int			hn_tx_ring_cnt = 0;
551 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
552     &hn_tx_ring_cnt, 0, "# of TX rings to use");
553 
554 /* Software TX ring depth */
555 static int			hn_tx_swq_depth = 0;
556 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
557     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
558 
559 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
560 #if __FreeBSD_version >= 1100095
561 static u_int			hn_lro_mbufq_depth = 0;
562 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
563     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
564 #endif
565 
566 /* Packet transmission aggregation size limit */
567 static int			hn_tx_agg_size = -1;
568 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
569     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
570 
571 /* Packet transmission aggregation count limit */
572 static int			hn_tx_agg_pkts = -1;
573 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
574     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
575 
576 /* VF list */
577 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
578     0, 0, hn_vflist_sysctl, "A", "VF list");
579 
580 /* VF mapping */
581 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
582     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
583 
584 /* Transparent VF */
585 static int			hn_xpnt_vf = 0;
586 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
587     &hn_xpnt_vf, 0, "Transparent VF mod");
588 
589 /* Accurate BPF support for Transparent VF */
590 static int			hn_xpnt_vf_accbpf = 0;
591 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
592     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
593 
594 /* Extra wait for the transparent VF attach routine; unit: seconds. */
595 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
596 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
597     &hn_xpnt_vf_attwait, 0,
598     "Extra wait for transparent VF attach routing; unit: seconds");
599 
600 static u_int			hn_cpu_index;	/* next CPU for channel */
601 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
602 
603 static struct rmlock		hn_vfmap_lock;
604 static int			hn_vfmap_size;
605 static struct ifnet		**hn_vfmap;
606 
607 #ifndef RSS
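/*
 * Default 40-byte Toeplitz hash key; this appears to be the standard key
 * recommended in Microsoft's RSS documentation and used by other drivers.
 */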
608 static const uint8_t
609 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
610 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
611 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
612 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
613 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
614 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
615 };
616 #endif	/* !RSS */
617 
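/*
 * Class ID of the Hyper-V synthetic network device:
 * f8615163-df3e-46c5-913f-f2d2f965ed0e.
 */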
618 static const struct hyperv_guid	hn_guid = {
619 	.hv_guid = {
620 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
621 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
622 };
623 
624 static device_method_t hn_methods[] = {
625 	/* Device interface */
626 	DEVMETHOD(device_probe,		hn_probe),
627 	DEVMETHOD(device_attach,	hn_attach),
628 	DEVMETHOD(device_detach,	hn_detach),
629 	DEVMETHOD(device_shutdown,	hn_shutdown),
630 	DEVMETHOD_END
631 };
632 
633 static driver_t hn_driver = {
634 	"hn",
635 	hn_methods,
636 	sizeof(struct hn_softc)
637 };
638 
639 static devclass_t hn_devclass;
640 
641 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
642 MODULE_VERSION(hn, 1);
643 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
644 
645 #if __FreeBSD_version >= 1100099
646 static void
647 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
648 {
649 	int i;
650 
651 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
652 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
653 }
654 #endif
655 
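/*
 * Transmit an RNDIS data packet using the gather list in txr->hn_gpa;
 * the data has not been copied into a chimney sending buffer.
 */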
656 static int
657 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
658 {
659 
660 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
661 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
662 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
663 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
664 }
665 
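/*
 * Transmit an RNDIS data packet that has already been copied into a
 * chimney sending buffer slot; only the slot index and size are sent.
 */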
666 static int
667 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
668 {
669 	struct hn_nvs_rndis rndis;
670 
671 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size > 0, ("invalid rndis chim txd"));
673 
674 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
675 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
676 	rndis.nvs_chim_idx = txd->chim_index;
677 	rndis.nvs_chim_sz = txd->chim_size;
678 
679 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
680 	    &rndis, sizeof(rndis), &txd->send_ctx));
681 }
682 
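/*
 * Allocate a chimney sending buffer slot: scan the bitmap for a clear
 * bit and claim it with an atomic test-and-set, retrying on races.
 * Returns HN_NVS_CHIM_IDX_INVALID if all slots are in use.
 */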
683 static __inline uint32_t
684 hn_chim_alloc(struct hn_softc *sc)
685 {
686 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
687 	u_long *bmap = sc->hn_chim_bmap;
688 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
689 
690 	for (i = 0; i < bmap_cnt; ++i) {
691 		int idx;
692 
693 		idx = ffsl(~bmap[i]);
694 		if (idx == 0)
695 			continue;
696 
697 		--idx; /* ffsl is 1-based */
698 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
699 		    ("invalid i %d and idx %d", i, idx));
700 
701 		if (atomic_testandset_long(&bmap[i], idx))
702 			continue;
703 
704 		ret = i * LONG_BIT + idx;
705 		break;
706 	}
707 	return (ret);
708 }
709 
710 static __inline void
711 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
712 {
713 	u_long mask;
714 	uint32_t idx;
715 
716 	idx = chim_idx / LONG_BIT;
717 	KASSERT(idx < sc->hn_chim_bmap_cnt,
718 	    ("invalid chimney index 0x%x", chim_idx));
719 
720 	mask = 1UL << (chim_idx % LONG_BIT);
721 	KASSERT(sc->hn_chim_bmap[idx] & mask,
722 	    ("index bitmap 0x%lx, chimney index %u, "
723 	     "bitmap idx %d, bitmask 0x%lx",
724 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
725 
726 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
727 }
728 
729 #if defined(INET6) || defined(INET)
730 
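/*
 * Make sure at least the first 'len' bytes are contiguous in the leading
 * mbuf; on m_pullup() failure the chain is freed and the enclosing
 * function returns NULL.
 */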
731 #define PULLUP_HDR(m, len)				\
732 do {							\
733 	if (__predict_false((m)->m_len < (len))) {	\
734 		(m) = m_pullup((m), (len));		\
735 		if ((m) == NULL)			\
736 			return (NULL);			\
737 	}						\
738 } while (0)
739 
740 /*
741  * NOTE: If this function fails, m_head will have been freed.
742  */
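/*
 * Record the L2/L3 header lengths, then prepare the TCP/IP headers for
 * host-side LSO: zero ip_len/ip_sum (IPv4) or ip6_plen (IPv6) and seed
 * th_sum with the pseudo-header checksum.
 */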
743 static __inline struct mbuf *
744 hn_tso_fixup(struct mbuf *m_head)
745 {
746 	struct ether_vlan_header *evl;
747 	struct tcphdr *th;
748 	int ehlen;
749 
750 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
751 
752 	PULLUP_HDR(m_head, sizeof(*evl));
753 	evl = mtod(m_head, struct ether_vlan_header *);
754 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
755 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
756 	else
757 		ehlen = ETHER_HDR_LEN;
758 	m_head->m_pkthdr.l2hlen = ehlen;
759 
760 #ifdef INET
761 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
762 		struct ip *ip;
763 		int iphlen;
764 
765 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
766 		ip = mtodo(m_head, ehlen);
767 		iphlen = ip->ip_hl << 2;
768 		m_head->m_pkthdr.l3hlen = iphlen;
769 
770 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
771 		th = mtodo(m_head, ehlen + iphlen);
772 
773 		ip->ip_len = 0;
774 		ip->ip_sum = 0;
775 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
776 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
777 	}
778 #endif
779 #if defined(INET6) && defined(INET)
780 	else
781 #endif
782 #ifdef INET6
783 	{
784 		struct ip6_hdr *ip6;
785 
786 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
787 		ip6 = mtodo(m_head, ehlen);
788 		if (ip6->ip6_nxt != IPPROTO_TCP) {
789 			m_freem(m_head);
790 			return (NULL);
791 		}
792 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
793 
794 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
795 		th = mtodo(m_head, ehlen + sizeof(*ip6));
796 
797 		ip6->ip6_plen = 0;
798 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
799 	}
800 #endif
801 	return (m_head);
802 }
803 
804 /*
805  * NOTE: If this function fails, m_head will have been freed.
806  */
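/*
 * Record the L2/L3 header lengths for checksum offloading and apply the
 * software UDP checksum fallback described below when needed.
 */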
807 static __inline struct mbuf *
808 hn_set_hlen(struct mbuf *m_head)
809 {
810 	const struct ether_vlan_header *evl;
811 	int ehlen;
812 
813 	PULLUP_HDR(m_head, sizeof(*evl));
814 	evl = mtod(m_head, const struct ether_vlan_header *);
815 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
816 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
817 	else
818 		ehlen = ETHER_HDR_LEN;
819 	m_head->m_pkthdr.l2hlen = ehlen;
820 
821 #ifdef INET
822 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
823 		const struct ip *ip;
824 		int iphlen;
825 
826 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
827 		ip = mtodo(m_head, ehlen);
828 		iphlen = ip->ip_hl << 2;
829 		m_head->m_pkthdr.l3hlen = iphlen;
830 
831 		/*
832 		 * UDP checksum offload does not work in Azure if both of the
833 		 * following conditions are met:
834 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
835 		 * - IP_DF is not set in the IP hdr.
836 		 *
837 		 * Fall back to software checksumming for such UDP datagrams.
838 		 */
839 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
840 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
841 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
842 			uint16_t off = ehlen + iphlen;
843 
844 			counter_u64_add(hn_udpcs_fixup, 1);
845 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
846 			*(uint16_t *)(m_head->m_data + off +
847                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
848 			    m_head, m_head->m_pkthdr.len, off);
849 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
850 		}
851 	}
852 #endif
853 #if defined(INET6) && defined(INET)
854 	else
855 #endif
856 #ifdef INET6
857 	{
858 		const struct ip6_hdr *ip6;
859 
860 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
861 		ip6 = mtodo(m_head, ehlen);
862 		if (ip6->ip6_nxt != IPPROTO_TCP) {
863 			m_freem(m_head);
864 			return (NULL);
865 		}
866 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
867 	}
868 #endif
869 	return (m_head);
870 }
871 
872 /*
873  * NOTE: If this function fails, m_head will have been freed.
874  */
875 static __inline struct mbuf *
876 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
877 {
878 	const struct tcphdr *th;
879 	int ehlen, iphlen;
880 
881 	*tcpsyn = 0;
882 	ehlen = m_head->m_pkthdr.l2hlen;
883 	iphlen = m_head->m_pkthdr.l3hlen;
884 
885 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
886 	th = mtodo(m_head, ehlen + iphlen);
887 	if (th->th_flags & TH_SYN)
888 		*tcpsyn = 1;
889 	return (m_head);
890 }
891 
892 #undef PULLUP_HDR
893 
894 #endif	/* INET6 || INET */
895 
896 static int
897 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
898 {
899 	int error = 0;
900 
901 	HN_LOCK_ASSERT(sc);
902 
903 	if (sc->hn_rx_filter != filter) {
904 		error = hn_rndis_set_rxfilter(sc, filter);
905 		if (!error)
906 			sc->hn_rx_filter = filter;
907 	}
908 	return (error);
909 }
910 
911 static int
912 hn_rxfilter_config(struct hn_softc *sc)
913 {
914 	struct ifnet *ifp = sc->hn_ifp;
915 	uint32_t filter;
916 
917 	HN_LOCK_ASSERT(sc);
918 
919 	/*
920 	 * If the non-transparent mode VF is activated, we don't know how
921 	 * its RX filter is configured, so stick the synthetic device in
922 	 * promiscuous mode.
923 	 */
924 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
925 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
926 	} else {
927 		filter = NDIS_PACKET_TYPE_DIRECTED;
928 		if (ifp->if_flags & IFF_BROADCAST)
929 			filter |= NDIS_PACKET_TYPE_BROADCAST;
930 		/* TODO: support multicast list */
931 		if ((ifp->if_flags & IFF_ALLMULTI) ||
932 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
933 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
934 	}
935 	return (hn_set_rxfilter(sc, filter));
936 }
937 
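/*
 * Compute the transmission aggregation limits from the administrative
 * settings (sc->hn_agg_size/sc->hn_agg_pkts) and the limits advertised
 * by RNDIS, then propagate them to all TX rings; aggregation is disabled
 * when the resulting limits are too small to be useful.
 */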
938 static void
939 hn_set_txagg(struct hn_softc *sc)
940 {
941 	uint32_t size, pkts;
942 	int i;
943 
944 	/*
945 	 * Setup aggregation size.
946 	 */
947 	if (sc->hn_agg_size < 0)
948 		size = UINT32_MAX;
949 	else
950 		size = sc->hn_agg_size;
951 
952 	if (sc->hn_rndis_agg_size < size)
953 		size = sc->hn_rndis_agg_size;
954 
955 	/* NOTE: We only aggregate packets using chimney sending buffers. */
956 	if (size > (uint32_t)sc->hn_chim_szmax)
957 		size = sc->hn_chim_szmax;
958 
959 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
960 		/* Disable */
961 		size = 0;
962 		pkts = 0;
963 		goto done;
964 	}
965 
966 	/* NOTE: Type of the per TX ring setting is 'int'. */
967 	if (size > INT_MAX)
968 		size = INT_MAX;
969 
970 	/*
971 	 * Setup aggregation packet count.
972 	 */
973 	if (sc->hn_agg_pkts < 0)
974 		pkts = UINT32_MAX;
975 	else
976 		pkts = sc->hn_agg_pkts;
977 
978 	if (sc->hn_rndis_agg_pkts < pkts)
979 		pkts = sc->hn_rndis_agg_pkts;
980 
981 	if (pkts <= 1) {
982 		/* Disable */
983 		size = 0;
984 		pkts = 0;
985 		goto done;
986 	}
987 
988 	/* NOTE: Type of the per TX ring setting is 'short'. */
989 	if (pkts > SHRT_MAX)
990 		pkts = SHRT_MAX;
991 
992 done:
993 	/* NOTE: Type of the per TX ring setting is 'short'. */
994 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
995 		/* Disable */
996 		size = 0;
997 		pkts = 0;
998 	}
999 
1000 	if (bootverbose) {
1001 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1002 		    size, pkts, sc->hn_rndis_agg_align);
1003 	}
1004 
1005 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1006 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1007 
1008 		mtx_lock(&txr->hn_tx_lock);
1009 		txr->hn_agg_szmax = size;
1010 		txr->hn_agg_pktmax = pkts;
1011 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1012 		mtx_unlock(&txr->hn_tx_lock);
1013 	}
1014 }
1015 
1016 static int
1017 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1018 {
1019 
1020 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1021 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1022 		return txr->hn_txdesc_cnt;
1023 	return hn_tx_swq_depth;
1024 }
1025 
1026 static int
1027 hn_rss_reconfig(struct hn_softc *sc)
1028 {
1029 	int error;
1030 
1031 	HN_LOCK_ASSERT(sc);
1032 
1033 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1034 		return (ENXIO);
1035 
1036 	/*
1037 	 * Disable RSS first.
1038 	 *
1039 	 * NOTE:
1040 	 * Direct reconfiguration by setting the UNCHG flags does
1041 	 * _not_ work properly.
1042 	 */
1043 	if (bootverbose)
1044 		if_printf(sc->hn_ifp, "disable RSS\n");
1045 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1046 	if (error) {
1047 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1048 		return (error);
1049 	}
1050 
1051 	/*
1052 	 * Reenable the RSS w/ the updated RSS key or indirect
1053 	 * table.
1054 	 */
1055 	if (bootverbose)
1056 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1057 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1058 	if (error) {
1059 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1060 		return (error);
1061 	}
1062 	return (0);
1063 }
1064 
1065 static void
1066 hn_rss_ind_fixup(struct hn_softc *sc)
1067 {
1068 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1069 	int i, nchan;
1070 
1071 	nchan = sc->hn_rx_ring_inuse;
1072 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1073 
1074 	/*
1075 	 * Check indirect table to make sure that all channels in it
1076 	 * can be used.
1077 	 */
1078 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1079 		if (rss->rss_ind[i] >= nchan) {
1080 			if_printf(sc->hn_ifp,
1081 			    "RSS indirect table %d fixup: %u -> %d\n",
1082 			    i, rss->rss_ind[i], nchan - 1);
1083 			rss->rss_ind[i] = nchan - 1;
1084 		}
1085 	}
1086 }
1087 
1088 static int
1089 hn_ifmedia_upd(struct ifnet *ifp __unused)
1090 {
1091 
1092 	return EOPNOTSUPP;
1093 }
1094 
1095 static void
1096 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1097 {
1098 	struct hn_softc *sc = ifp->if_softc;
1099 
1100 	ifmr->ifm_status = IFM_AVALID;
1101 	ifmr->ifm_active = IFM_ETHER;
1102 
1103 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1104 		ifmr->ifm_active |= IFM_NONE;
1105 		return;
1106 	}
1107 	ifmr->ifm_status |= IFM_ACTIVE;
1108 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1109 }
1110 
1111 static void
1112 hn_rxvf_set_task(void *xarg, int pending __unused)
1113 {
1114 	struct hn_rxvf_setarg *arg = xarg;
1115 
1116 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1117 }
1118 
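/*
 * Point each RX ring at the (possibly NULL) VF ifnet; for rings that are
 * in use, run the update on the ring's channel so that it is serialized
 * with the RX path.
 */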
1119 static void
1120 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1121 {
1122 	struct hn_rx_ring *rxr;
1123 	struct hn_rxvf_setarg arg;
1124 	struct task task;
1125 	int i;
1126 
1127 	HN_LOCK_ASSERT(sc);
1128 
1129 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1130 
1131 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1132 		rxr = &sc->hn_rx_ring[i];
1133 
1134 		if (i < sc->hn_rx_ring_inuse) {
1135 			arg.rxr = rxr;
1136 			arg.vf_ifp = vf_ifp;
1137 			vmbus_chan_run_task(rxr->hn_chan, &task);
1138 		} else {
1139 			rxr->hn_rxvf_ifp = vf_ifp;
1140 		}
1141 	}
1142 }
1143 
1144 static bool
1145 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1146 {
1147 	const struct ifnet *hn_ifp;
1148 
1149 	hn_ifp = sc->hn_ifp;
1150 
1151 	if (ifp == hn_ifp)
1152 		return (false);
1153 
1154 	if (ifp->if_alloctype != IFT_ETHER)
1155 		return (false);
1156 
1157 	/* Ignore lagg/vlan interfaces */
1158 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1159 	    strcmp(ifp->if_dname, "vlan") == 0)
1160 		return (false);
1161 
1162 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1163 		return (false);
1164 
1165 	return (true);
1166 }
1167 
1168 static void
1169 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1170 {
1171 	struct ifnet *hn_ifp;
1172 
1173 	HN_LOCK(sc);
1174 
1175 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1176 		goto out;
1177 
1178 	if (!hn_ismyvf(sc, ifp))
1179 		goto out;
1180 	hn_ifp = sc->hn_ifp;
1181 
1182 	if (rxvf) {
1183 		if (sc->hn_flags & HN_FLAG_RXVF)
1184 			goto out;
1185 
1186 		sc->hn_flags |= HN_FLAG_RXVF;
1187 		hn_rxfilter_config(sc);
1188 	} else {
1189 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1190 			goto out;
1191 
1192 		sc->hn_flags &= ~HN_FLAG_RXVF;
1193 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1194 			hn_rxfilter_config(sc);
1195 		else
1196 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1197 	}
1198 
1199 	hn_nvs_set_datapath(sc,
1200 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1201 
1202 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1203 
1204 	if (rxvf) {
1205 		hn_vf_rss_fixup(sc, true);
1206 		hn_suspend_mgmt(sc);
1207 		sc->hn_link_flags &=
1208 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1209 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1210 	} else {
1211 		hn_vf_rss_restore(sc);
1212 		hn_resume_mgmt(sc);
1213 	}
1214 
1215 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1216 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1217 
1218 	if (bootverbose) {
1219 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1220 		    rxvf ? "to" : "from", ifp->if_xname);
1221 	}
1222 out:
1223 	HN_UNLOCK(sc);
1224 }
1225 
1226 static void
1227 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1228 {
1229 
1230 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1231 		return;
1232 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1233 }
1234 
1235 static void
1236 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1237 {
1238 
1239 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1240 }
1241 
1242 static int
1243 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1244 {
1245 	struct ifnet *ifp, *vf_ifp;
1246 	uint64_t tmp;
1247 	int error;
1248 
1249 	HN_LOCK_ASSERT(sc);
1250 	ifp = sc->hn_ifp;
1251 	vf_ifp = sc->hn_vf_ifp;
1252 
1253 	/*
1254 	 * Fix up requested capabilities w/ supported capabilities,
1255 	 * since the supported capabilities could have been changed.
1256 	 */
1257 	ifr->ifr_reqcap &= ifp->if_capabilities;
1258 	/* Pass SIOCSIFCAP to VF. */
1259 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1260 
1261 	/*
1262 	 * NOTE:
1263 	 * The error will be propagated to the callers; however, it
1264 	 * is _not_ useful here.
1265 	 */
1266 
1267 	/*
1268 	 * Merge VF's enabled capabilities.
1269 	 */
1270 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1271 
1272 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1273 	if (ifp->if_capenable & IFCAP_TXCSUM)
1274 		ifp->if_hwassist |= tmp;
1275 	else
1276 		ifp->if_hwassist &= ~tmp;
1277 
1278 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1279 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1280 		ifp->if_hwassist |= tmp;
1281 	else
1282 		ifp->if_hwassist &= ~tmp;
1283 
1284 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1285 	if (ifp->if_capenable & IFCAP_TSO4)
1286 		ifp->if_hwassist |= tmp;
1287 	else
1288 		ifp->if_hwassist &= ~tmp;
1289 
1290 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1291 	if (ifp->if_capenable & IFCAP_TSO6)
1292 		ifp->if_hwassist |= tmp;
1293 	else
1294 		ifp->if_hwassist &= ~tmp;
1295 
1296 	return (error);
1297 }
1298 
1299 static int
1300 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1301 {
1302 	struct ifnet *vf_ifp;
1303 	struct ifreq ifr;
1304 
1305 	HN_LOCK_ASSERT(sc);
1306 	vf_ifp = sc->hn_vf_ifp;
1307 
1308 	memset(&ifr, 0, sizeof(ifr));
1309 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1310 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1311 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1312 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1313 }
1314 
1315 static void
1316 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1317 {
1318 	struct ifnet *ifp = sc->hn_ifp;
1319 	int allmulti = 0;
1320 
1321 	HN_LOCK_ASSERT(sc);
1322 
1323 	/* XXX vlan(4) style mcast addr maintenance */
1324 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1325 		allmulti = IFF_ALLMULTI;
1326 
1327 	/* Always set the VF's if_flags */
1328 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1329 }
1330 
1331 static void
1332 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1333 {
1334 	struct rm_priotracker pt;
1335 	struct ifnet *hn_ifp = NULL;
1336 	struct mbuf *mn;
1337 
1338 	/*
1339 	 * XXX Racy if hn(4) is ever detached.
1340 	 */
1341 	rm_rlock(&hn_vfmap_lock, &pt);
1342 	if (vf_ifp->if_index < hn_vfmap_size)
1343 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1344 	rm_runlock(&hn_vfmap_lock, &pt);
1345 
1346 	if (hn_ifp != NULL) {
1347 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1348 			/*
1349 			 * Allow tapping on the VF.
1350 			 */
1351 			ETHER_BPF_MTAP(vf_ifp, mn);
1352 
1353 			/*
1354 			 * Update VF stats.
1355 			 */
1356 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1357 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1358 				    mn->m_pkthdr.len);
1359 			}
1360 			/*
1361 			 * XXX IFCOUNTER_IMCAST
1362 			 * This stat updating is kinda invasive, since it
1363 			 * requires two checks on the mbuf: the length check
1364 			 * and the ethernet header check.  As of this writing,
1365 			 * all multicast packets go directly to hn(4), which
1366 			 * makes imcast stat updating in the VF pointless.
1367 			 */
1368 
1369 			/*
1370 			 * Fix up rcvif and increase hn(4)'s ipackets.
1371 			 */
1372 			mn->m_pkthdr.rcvif = hn_ifp;
1373 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1374 		}
1375 		/*
1376 		 * Go through hn(4)'s if_input.
1377 		 */
1378 		hn_ifp->if_input(hn_ifp, m);
1379 	} else {
1380 		/*
1381 		 * In the middle of the transition; free this
1382 		 * mbuf chain.
1383 		 */
1384 		while (m != NULL) {
1385 			mn = m->m_nextpkt;
1386 			m->m_nextpkt = NULL;
1387 			m_freem(m);
1388 			m = mn;
1389 		}
1390 	}
1391 }
1392 
1393 static void
1394 hn_mtu_change_fixup(struct hn_softc *sc)
1395 {
1396 	struct ifnet *ifp;
1397 
1398 	HN_LOCK_ASSERT(sc);
1399 	ifp = sc->hn_ifp;
1400 
1401 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1402 #if __FreeBSD_version >= 1100099
1403 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1404 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1405 #endif
1406 }
1407 
1408 static uint32_t
1409 hn_rss_type_fromndis(uint32_t rss_hash)
1410 {
1411 	uint32_t types = 0;
1412 
1413 	if (rss_hash & NDIS_HASH_IPV4)
1414 		types |= RSS_TYPE_IPV4;
1415 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1416 		types |= RSS_TYPE_TCP_IPV4;
1417 	if (rss_hash & NDIS_HASH_IPV6)
1418 		types |= RSS_TYPE_IPV6;
1419 	if (rss_hash & NDIS_HASH_IPV6_EX)
1420 		types |= RSS_TYPE_IPV6_EX;
1421 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1422 		types |= RSS_TYPE_TCP_IPV6;
1423 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1424 		types |= RSS_TYPE_TCP_IPV6_EX;
1425 	return (types);
1426 }
1427 
1428 static uint32_t
1429 hn_rss_type_tondis(uint32_t types)
1430 {
1431 	uint32_t rss_hash = 0;
1432 
1433 	KASSERT((types &
1434 	(RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1435 	("UDP4, UDP6 and UDP6EX are not supported"));
1436 
1437 	if (types & RSS_TYPE_IPV4)
1438 		rss_hash |= NDIS_HASH_IPV4;
1439 	if (types & RSS_TYPE_TCP_IPV4)
1440 		rss_hash |= NDIS_HASH_TCP_IPV4;
1441 	if (types & RSS_TYPE_IPV6)
1442 		rss_hash |= NDIS_HASH_IPV6;
1443 	if (types & RSS_TYPE_IPV6_EX)
1444 		rss_hash |= NDIS_HASH_IPV6_EX;
1445 	if (types & RSS_TYPE_TCP_IPV6)
1446 		rss_hash |= NDIS_HASH_TCP_IPV6;
1447 	if (types & RSS_TYPE_TCP_IPV6_EX)
1448 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1449 	return (rss_hash);
1450 }
1451 
1452 static void
1453 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1454 {
1455 	int i;
1456 
1457 	HN_LOCK_ASSERT(sc);
1458 
1459 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1460 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1461 }
1462 
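/*
 * Make the synthetic NIC's RSS setting follow the VF's: adopt the VF's
 * Toeplitz key, intersect the hash types, and stop delivering mbuf hash
 * values/types that would conflict between the two paths.
 */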
1463 static void
1464 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1465 {
1466 	struct ifnet *ifp, *vf_ifp;
1467 	struct ifrsshash ifrh;
1468 	struct ifrsskey ifrk;
1469 	int error;
1470 	uint32_t my_types, diff_types, mbuf_types = 0;
1471 
1472 	HN_LOCK_ASSERT(sc);
1473 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1474 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1475 
1476 	if (sc->hn_rx_ring_inuse == 1) {
1477 		/* No RSS on synthetic parts; done. */
1478 		return;
1479 	}
1480 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1481 		/* Synthetic parts do not support Toeplitz; done. */
1482 		return;
1483 	}
1484 
1485 	ifp = sc->hn_ifp;
1486 	vf_ifp = sc->hn_vf_ifp;
1487 
1488 	/*
1489 	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1490 	 * supported.
1491 	 */
1492 	memset(&ifrk, 0, sizeof(ifrk));
1493 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1494 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1495 	if (error) {
1496 		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
1497 		    vf_ifp->if_xname, error);
1498 		goto done;
1499 	}
1500 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1501 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1502 		    vf_ifp->if_xname, ifrk.ifrk_func);
1503 		goto done;
1504 	}
1505 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1506 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1507 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1508 		goto done;
1509 	}
1510 
1511 	/*
1512 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1513 	 */
1514 	memset(&ifrh, 0, sizeof(ifrh));
1515 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1516 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1517 	if (error) {
1518 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1519 		    vf_ifp->if_xname, error);
1520 		goto done;
1521 	}
1522 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1523 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1524 		    vf_ifp->if_xname, ifrh.ifrh_func);
1525 		goto done;
1526 	}
1527 
1528 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1529 	if ((ifrh.ifrh_types & my_types) == 0) {
1530 		/* This disables RSS; ignore it then */
1531 		if_printf(ifp, "%s intersection of RSS types failed.  "
1532 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1533 		    ifrh.ifrh_types, my_types);
1534 		goto done;
1535 	}
1536 
1537 	diff_types = my_types ^ ifrh.ifrh_types;
1538 	my_types &= ifrh.ifrh_types;
1539 	mbuf_types = my_types;
1540 
1541 	/*
1542 	 * Detect RSS hash value/type conflicts.
1543 	 *
1544 	 * NOTE:
1545 	 * We don't disable the hash type, but stop delivering the hash
1546 	 * value/type through mbufs on the RX path.
1547 	 */
1548 	if ((my_types & RSS_TYPE_IPV4) &&
1549 	    (diff_types & ifrh.ifrh_types &
1550 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1551 		/* Conflict; disable IPV4 hash type/value delivery. */
1552 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1553 		mbuf_types &= ~RSS_TYPE_IPV4;
1554 	}
1555 	if ((my_types & RSS_TYPE_IPV6) &&
1556 	    (diff_types & ifrh.ifrh_types &
1557 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1558 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1559 	      RSS_TYPE_IPV6_EX))) {
1560 		/* Conflict; disable IPV6 hash type/value delivery. */
1561 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1562 		mbuf_types &= ~RSS_TYPE_IPV6;
1563 	}
1564 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1565 	    (diff_types & ifrh.ifrh_types &
1566 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1567 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1568 	      RSS_TYPE_IPV6))) {
1569 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1570 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1571 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1572 	}
1573 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1574 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1575 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1576 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1577 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1578 	}
1579 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1580 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1581 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1582 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1583 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1584 	}
1585 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1586 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1587 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1588 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1589 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1590 	}
1591 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1592 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1593 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1594 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1595 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1596 	}
1597 
1598 	/*
1599 	 * Indirect table does not matter.
1600 	 */
1601 
1602 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1603 	    hn_rss_type_tondis(my_types);
1604 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1605 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1606 
1607 	if (reconf) {
1608 		error = hn_rss_reconfig(sc);
1609 		if (error) {
1610 			/* XXX roll-back? */
1611 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1612 			/* XXX keep going. */
1613 		}
1614 	}
1615 done:
1616 	/* Hash deliverability for mbufs. */
1617 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1618 }
1619 
1620 static void
1621 hn_vf_rss_restore(struct hn_softc *sc)
1622 {
1623 
1624 	HN_LOCK_ASSERT(sc);
1625 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1626 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1627 
1628 	if (sc->hn_rx_ring_inuse == 1)
1629 		goto done;
1630 
1631 	/*
1632 	 * Restore hash types.  Key does _not_ matter.
1633 	 */
1634 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1635 		int error;
1636 
1637 		sc->hn_rss_hash = sc->hn_rss_hcap;
1638 		error = hn_rss_reconfig(sc);
1639 		if (error) {
1640 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1641 			    error);
1642 			/* XXX keep going. */
1643 		}
1644 	}
1645 done:
1646 	/* Hash deliverability for mbufs. */
1647 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1648 }
1649 
1650 static void
1651 hn_xpnt_vf_setready(struct hn_softc *sc)
1652 {
1653 	struct ifnet *ifp, *vf_ifp;
1654 	struct ifreq ifr;
1655 
1656 	HN_LOCK_ASSERT(sc);
1657 	ifp = sc->hn_ifp;
1658 	vf_ifp = sc->hn_vf_ifp;
1659 
1660 	/*
1661 	 * Mark the VF ready.
1662 	 */
1663 	sc->hn_vf_rdytick = 0;
1664 
1665 	/*
1666 	 * Save information for restoration.
1667 	 */
1668 	sc->hn_saved_caps = ifp->if_capabilities;
1669 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1670 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1671 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1672 
1673 	/*
1674 	 * Intersect supported/enabled capabilities.
1675 	 *
1676 	 * NOTE:
1677 	 * if_hwassist is not changed here.
1678 	 */
1679 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1680 	ifp->if_capenable &= ifp->if_capabilities;
1681 
1682 	/*
1683 	 * Fix TSO settings.
1684 	 */
1685 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1686 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1687 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1688 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1689 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1690 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1691 
1692 	/*
1693 	 * Change VF's enabled capabilities.
1694 	 */
1695 	memset(&ifr, 0, sizeof(ifr));
1696 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1697 	ifr.ifr_reqcap = ifp->if_capenable;
1698 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1699 
1700 	if (ifp->if_mtu != ETHERMTU) {
1701 		int error;
1702 
1703 		/*
1704 		 * Change VF's MTU.
1705 		 */
1706 		memset(&ifr, 0, sizeof(ifr));
1707 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1708 		ifr.ifr_mtu = ifp->if_mtu;
1709 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1710 		if (error) {
1711 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1712 			    vf_ifp->if_xname, ifp->if_mtu);
1713 			if (ifp->if_mtu > ETHERMTU) {
1714 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1715 
1716 				/*
1717 				 * XXX
1718 				 * No need to adjust the synthetic parts' MTU;
1719 				 * failure of the adjustment will cause us
1720 				 * infinite headache.
1721 				 */
1722 				ifp->if_mtu = ETHERMTU;
1723 				hn_mtu_change_fixup(sc);
1724 			}
1725 		}
1726 	}
1727 }
1728 
1729 static bool
1730 hn_xpnt_vf_isready(struct hn_softc *sc)
1731 {
1732 
1733 	HN_LOCK_ASSERT(sc);
1734 
1735 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1736 		return (false);
1737 
1738 	if (sc->hn_vf_rdytick == 0)
1739 		return (true);
1740 
1741 	if (sc->hn_vf_rdytick > ticks)
1742 		return (false);
1743 
1744 	/* Mark VF as ready. */
1745 	hn_xpnt_vf_setready(sc);
1746 	return (true);
1747 }
1748 
1749 static void
1750 hn_xpnt_vf_setenable(struct hn_softc *sc)
1751 {
1752 	int i;
1753 
1754 	HN_LOCK_ASSERT(sc);
1755 
1756 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1757 	rm_wlock(&sc->hn_vf_lock);
1758 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1759 	rm_wunlock(&sc->hn_vf_lock);
1760 
1761 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1762 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1763 }
1764 
1765 static void
1766 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1767 {
1768 	int i;
1769 
1770 	HN_LOCK_ASSERT(sc);
1771 
1772 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1773 	rm_wlock(&sc->hn_vf_lock);
1774 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1775 	if (clear_vf)
1776 		sc->hn_vf_ifp = NULL;
1777 	rm_wunlock(&sc->hn_vf_lock);
1778 
1779 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1780 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1781 }
1782 
1783 static void
1784 hn_xpnt_vf_init(struct hn_softc *sc)
1785 {
1786 	int error;
1787 
1788 	HN_LOCK_ASSERT(sc);
1789 
1790 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1791 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1792 
1793 	if (bootverbose) {
1794 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1795 		    sc->hn_vf_ifp->if_xname);
1796 	}
1797 
1798 	/*
1799 	 * Bring the VF up.
1800 	 */
1801 	hn_xpnt_vf_saveifflags(sc);
1802 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1803 	error = hn_xpnt_vf_iocsetflags(sc);
1804 	if (error) {
1805 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1806 		    sc->hn_vf_ifp->if_xname, error);
1807 		return;
1808 	}
1809 
1810 	/*
1811 	 * NOTE:
1812 	 * Datapath setting must happen _after_ bringing the VF up.
1813 	 */
1814 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1815 
1816 	/*
1817 	 * NOTE:
1818 	 * Fix up RSS related bits _after_ the VF is brought up, since
1819 	 * many VFs generate their RSS key during initialization.
1820 	 */
1821 	hn_vf_rss_fixup(sc, true);
1822 
1823 	/* Mark transparent mode VF as enabled. */
1824 	hn_xpnt_vf_setenable(sc);
1825 }
1826 
1827 static void
1828 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1829 {
1830 	struct hn_softc *sc = xsc;
1831 
1832 	HN_LOCK(sc);
1833 
1834 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1835 		goto done;
1836 	if (sc->hn_vf_ifp == NULL)
1837 		goto done;
1838 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1839 		goto done;
1840 
1841 	if (sc->hn_vf_rdytick != 0) {
1842 		/* Mark VF as ready. */
1843 		hn_xpnt_vf_setready(sc);
1844 	}
1845 
1846 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1847 		/*
1848 		 * Delayed VF initialization.
1849 		 */
1850 		if (bootverbose) {
1851 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1852 			    sc->hn_vf_ifp->if_xname);
1853 		}
1854 		hn_xpnt_vf_init(sc);
1855 	}
1856 done:
1857 	HN_UNLOCK(sc);
1858 }
1859 
1860 static void
1861 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1862 {
1863 	struct hn_softc *sc = xsc;
1864 
1865 	HN_LOCK(sc);
1866 
1867 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1868 		goto done;
1869 
1870 	if (!hn_ismyvf(sc, ifp))
1871 		goto done;
1872 
1873 	if (sc->hn_vf_ifp != NULL) {
1874 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1875 		    sc->hn_vf_ifp->if_xname);
1876 		goto done;
1877 	}
1878 
1879 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1880 		/*
1881 		 * ifnet.if_start is _not_ supported by transparent
1882 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1883 		 */
1884 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1885 		    "in transparent VF mode.\n", ifp->if_xname);
1886 		goto done;
1887 	}
1888 
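	/*
	 * Record the VF ifnet index -> hn(4) ifnet mapping, growing
	 * hn_vfmap on demand.
	 */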
1889 	rm_wlock(&hn_vfmap_lock);
1890 
1891 	if (ifp->if_index >= hn_vfmap_size) {
1892 		struct ifnet **newmap;
1893 		int newsize;
1894 
1895 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1896 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1897 		    M_WAITOK | M_ZERO);
1898 
1899 		memcpy(newmap, hn_vfmap,
1900 		    sizeof(struct ifnet *) * hn_vfmap_size);
1901 		free(hn_vfmap, M_DEVBUF);
1902 		hn_vfmap = newmap;
1903 		hn_vfmap_size = newsize;
1904 	}
1905 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1906 	    ("%s: ifindex %d was mapped to %s",
1907 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1908 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1909 
1910 	rm_wunlock(&hn_vfmap_lock);
1911 
1912 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1913 	rm_wlock(&sc->hn_vf_lock);
1914 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1915 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1916 	sc->hn_vf_ifp = ifp;
1917 	rm_wunlock(&sc->hn_vf_lock);
1918 
1919 	if (hn_xpnt_vf) {
1920 		int wait_ticks;
1921 
1922 		/*
1923 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1924 		 * Save vf_ifp's current if_input for later restoration.
1925 		 */
1926 		sc->hn_vf_input = ifp->if_input;
1927 		ifp->if_input = hn_xpnt_vf_input;
1928 
1929 		/*
1930 		 * Stop link status management; use the VF's.
1931 		 */
1932 		hn_suspend_mgmt(sc);
1933 
1934 		/*
1935 		 * Give the VF some time to complete its attach routine.
1936 		 */
1937 		wait_ticks = hn_xpnt_vf_attwait * hz;
1938 		sc->hn_vf_rdytick = ticks + wait_ticks;
1939 
1940 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1941 		    wait_ticks);
1942 	}
1943 done:
1944 	HN_UNLOCK(sc);
1945 }
1946 
1947 static void
1948 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1949 {
1950 	struct hn_softc *sc = xsc;
1951 
1952 	HN_LOCK(sc);
1953 
1954 	if (sc->hn_vf_ifp == NULL)
1955 		goto done;
1956 
1957 	if (!hn_ismyvf(sc, ifp))
1958 		goto done;
1959 
1960 	if (hn_xpnt_vf) {
1961 		/*
1962 		 * Make sure that the delayed initialization is not running.
1963 		 *
1964 		 * NOTE:
1965 		 * - This lock _must_ be released, since the hn_vf_init task
1966 		 *   will try holding this lock.
1967 		 * - It is safe to release this lock here, since the
1968 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1969 		 *
1970 		 * XXX racy, if hn(4) ever detached.
1971 		 */
1972 		HN_UNLOCK(sc);
1973 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1974 		HN_LOCK(sc);
1975 
1976 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1977 		    sc->hn_ifp->if_xname));
1978 		ifp->if_input = sc->hn_vf_input;
1979 		sc->hn_vf_input = NULL;
1980 
1981 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1982 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1983 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1984 
1985 		if (sc->hn_vf_rdytick == 0) {
1986 			/*
1987 			 * The VF was ready; restore some settings.
1988 			 */
1989 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
1990 			/*
1991 			 * NOTE:
1992 			 * There is _no_ need to fixup if_capenable and
1993 			 * if_hwassist, since the if_capabilities before
1994 			 * restoration was an intersection of the VF's
1995 			 * if_capabilities and the synthetic device's
1996 			 * if_capabilities.
1997 			 */
1998 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
1999 			sc->hn_ifp->if_hw_tsomaxsegcount =
2000 			    sc->hn_saved_tsosegcnt;
2001 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2002 		}
2003 
2004 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2005 			/*
2006 			 * Restore RSS settings.
2007 			 */
2008 			hn_vf_rss_restore(sc);
2009 
2010 			/*
2011 			 * Resume link status management, which was suspended
2012 			 * by hn_ifnet_attevent().
2013 			 */
2014 			hn_resume_mgmt(sc);
2015 		}
2016 	}
2017 
2018 	/* Mark transparent mode VF as disabled. */
2019 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2020 
2021 	rm_wlock(&hn_vfmap_lock);
2022 
2023 	KASSERT(ifp->if_index < hn_vfmap_size,
2024 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2025 	if (hn_vfmap[ifp->if_index] != NULL) {
2026 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2027 		    ("%s: ifindex %d was mapped to %s",
2028 		     ifp->if_xname, ifp->if_index,
2029 		     hn_vfmap[ifp->if_index]->if_xname));
2030 		hn_vfmap[ifp->if_index] = NULL;
2031 	}
2032 
2033 	rm_wunlock(&hn_vfmap_lock);
2034 done:
2035 	HN_UNLOCK(sc);
2036 }
2037 
2038 static void
2039 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2040 {
2041 	struct hn_softc *sc = xsc;
2042 
2043 	if (sc->hn_vf_ifp == ifp)
2044 		if_link_state_change(sc->hn_ifp, link_state);
2045 }
2046 
2047 static int
2048 hn_probe(device_t dev)
2049 {
2050 
2051 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2052 		device_set_desc(dev, "Hyper-V Network Interface");
2053 		return BUS_PROBE_DEFAULT;
2054 	}
2055 	return ENXIO;
2056 }
2057 
2058 static int
2059 hn_attach(device_t dev)
2060 {
2061 	struct hn_softc *sc = device_get_softc(dev);
2062 	struct sysctl_oid_list *child;
2063 	struct sysctl_ctx_list *ctx;
2064 	uint8_t eaddr[ETHER_ADDR_LEN];
2065 	struct ifnet *ifp = NULL;
2066 	int error, ring_cnt, tx_ring_cnt;
2067 	uint32_t mtu;
2068 
2069 	sc->hn_dev = dev;
2070 	sc->hn_prichan = vmbus_get_channel(dev);
2071 	HN_LOCK_INIT(sc);
2072 	rm_init(&sc->hn_vf_lock, "hnvf");
2073 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2074 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2075 
2076 	/*
2077 	 * Initialize these tunables once.
2078 	 */
2079 	sc->hn_agg_size = hn_tx_agg_size;
2080 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2081 
2082 	/*
2083 	 * Setup taskqueue for transmission.
2084 	 */
2085 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2086 		int i;
2087 
2088 		sc->hn_tx_taskqs =
2089 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2090 		    M_DEVBUF, M_WAITOK);
2091 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2092 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2093 			    M_WAITOK, taskqueue_thread_enqueue,
2094 			    &sc->hn_tx_taskqs[i]);
2095 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2096 			    "%s tx%d", device_get_nameunit(dev), i);
2097 		}
2098 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2099 		sc->hn_tx_taskqs = hn_tx_taskque;
2100 	}
2101 
2102 	/*
2103 	 * Setup taskqueue for management tasks, e.g. link status.
2104 	 */
2105 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2106 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2107 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2108 	    device_get_nameunit(dev));
2109 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2110 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2111 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2112 	    hn_netchg_status_taskfunc, sc);
2113 
2114 	if (hn_xpnt_vf) {
2115 		/*
2116 		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2117 		 */
2118 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2119 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2120 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2121 		    device_get_nameunit(dev));
2122 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2123 		    hn_xpnt_vf_init_taskfunc, sc);
2124 	}
2125 
2126 	/*
2127 	 * Allocate the ifnet and set up its name early, so that if_printf
2128 	 * can be used by functions that will be called after
2129 	 * ether_ifattach().
2130 	 */
2131 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2132 	ifp->if_softc = sc;
2133 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2134 
2135 	/*
2136 	 * Initialize ifmedia earlier so that it can be unconditionally
2137 	 * destroyed, if an error happens later on.
2138 	 */
2139 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2140 
2141 	/*
2142 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2143 	 * to use (tx_ring_cnt).
2144 	 *
2145 	 * NOTE:
2146 	 * The # of RX rings to use is the same as the # of channels to use.
2147 	 */
2148 	ring_cnt = hn_chan_cnt;
2149 	if (ring_cnt <= 0) {
2150 		/* Default */
2151 		ring_cnt = mp_ncpus;
2152 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2153 			ring_cnt = HN_RING_CNT_DEF_MAX;
2154 	} else if (ring_cnt > mp_ncpus) {
2155 		ring_cnt = mp_ncpus;
2156 	}
2157 #ifdef RSS
2158 	if (ring_cnt > rss_getnumbuckets())
2159 		ring_cnt = rss_getnumbuckets();
2160 #endif
2161 
2162 	tx_ring_cnt = hn_tx_ring_cnt;
2163 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2164 		tx_ring_cnt = ring_cnt;
2165 #ifdef HN_IFSTART_SUPPORT
2166 	if (hn_use_if_start) {
2167 		/* ifnet.if_start only needs one TX ring. */
2168 		tx_ring_cnt = 1;
2169 	}
2170 #endif
2171 
2172 	/*
2173 	 * Set the leader CPU for channels.
2174 	 */
2175 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2176 
2177 	/*
2178 	 * Create enough TX/RX rings, even if only a limited number of
2179 	 * channels can be allocated.
2180 	 */
2181 	error = hn_create_tx_data(sc, tx_ring_cnt);
2182 	if (error)
2183 		goto failed;
2184 	error = hn_create_rx_data(sc, ring_cnt);
2185 	if (error)
2186 		goto failed;
2187 
2188 	/*
2189 	 * Create transaction context for NVS and RNDIS transactions.
2190 	 */
2191 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2192 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2193 	if (sc->hn_xact == NULL) {
2194 		error = ENXIO;
2195 		goto failed;
2196 	}
2197 
2198 	/*
2199 	 * Install orphan handler for the revocation of this device's
2200 	 * primary channel.
2201 	 *
2202 	 * NOTE:
2203 	 * The processing order is critical here:
2204 	 * Install the orphan handler _before_ testing whether this
2205 	 * device's primary channel has been revoked or not.
2206 	 */
2207 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2208 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2209 		error = ENXIO;
2210 		goto failed;
2211 	}
2212 
2213 	/*
2214 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2215 	 */
2216 	error = hn_synth_attach(sc, ETHERMTU);
2217 	if (error)
2218 		goto failed;
2219 
2220 	error = hn_rndis_get_eaddr(sc, eaddr);
2221 	if (error)
2222 		goto failed;
2223 
2224 	error = hn_rndis_get_mtu(sc, &mtu);
2225 	if (error)
2226 		mtu = ETHERMTU;
2227 	else if (bootverbose)
2228 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2229 
2230 #if __FreeBSD_version >= 1100099
2231 	if (sc->hn_rx_ring_inuse > 1) {
2232 		/*
2233 		 * Reduce TCP segment aggregation limit for multiple
2234 		 * RX rings to increase ACK timeliness.
2235 		 */
2236 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2237 	}
2238 #endif
2239 
2240 	/*
2241 	 * Fix up TX settings after the synthetic parts are attached.
2242 	 */
2243 	hn_fixup_tx_data(sc);
2244 
2245 	ctx = device_get_sysctl_ctx(dev);
2246 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2247 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2248 	    &sc->hn_nvs_ver, 0, "NVS version");
2249 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2250 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2251 	    hn_ndis_version_sysctl, "A", "NDIS version");
2252 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2253 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2254 	    hn_caps_sysctl, "A", "capabilities");
2255 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2256 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2257 	    hn_hwassist_sysctl, "A", "hwassist");
2258 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2259 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2260 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2261 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2262 	    "max # of TSO segments");
2263 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2264 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2265 	    "max size of TSO segment");
2266 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2267 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2268 	    hn_rxfilter_sysctl, "A", "rxfilter");
2269 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2270 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2271 	    hn_rss_hash_sysctl, "A", "RSS hash");
2272 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2273 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2274 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2275 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2276 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2277 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2278 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2279 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2280 #ifndef RSS
2281 	/*
2282 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2283 	 */
2284 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2285 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2286 	    hn_rss_key_sysctl, "IU", "RSS key");
2287 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2288 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2289 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2290 #endif
2291 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2292 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2293 	    "RNDIS offered packet transmission aggregation size limit");
2294 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2295 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2296 	    "RNDIS offered packet transmission aggregation count limit");
2297 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2298 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2299 	    "RNDIS packet transmission aggregation alignment");
2300 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2301 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2302 	    hn_txagg_size_sysctl, "I",
2303 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2304 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2305 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2306 	    hn_txagg_pkts_sysctl, "I",
2307 	    "Packet transmission aggregation packets, "
2308 	    "0 -- disable, -1 -- auto");
2309 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2310 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2311 	    hn_polling_sysctl, "I",
2312 	    "Polling frequency: [100,1000000], 0 disable polling");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2314 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_vf_sysctl, "A", "Virtual Function's name");
2316 	if (!hn_xpnt_vf) {
2317 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2318 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2320 	} else {
2321 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2322 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2323 		    hn_xpnt_vf_enabled_sysctl, "I",
2324 		    "Transparent VF enabled");
2325 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2326 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 		    hn_xpnt_vf_accbpf_sysctl, "I",
2328 		    "Accurate BPF for transparent VF");
2329 	}
2330 
2331 	/*
2332 	 * Setup the ifmedia, which has been initialized earlier.
2333 	 */
2334 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2335 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2336 	/* XXX ifmedia_set really should do this for us */
2337 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2338 
2339 	/*
2340 	 * Setup the ifnet for this interface.
2341 	 */
2342 
2343 	ifp->if_baudrate = IF_Gbps(10);
2344 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2345 	ifp->if_ioctl = hn_ioctl;
2346 	ifp->if_init = hn_init;
2347 #ifdef HN_IFSTART_SUPPORT
2348 	if (hn_use_if_start) {
2349 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2350 
2351 		ifp->if_start = hn_start;
2352 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2353 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2354 		IFQ_SET_READY(&ifp->if_snd);
2355 	} else
2356 #endif
2357 	{
2358 		ifp->if_transmit = hn_transmit;
2359 		ifp->if_qflush = hn_xmit_qflush;
2360 	}
2361 
2362 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2363 #ifdef foo
2364 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2365 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2366 #endif
2367 	if (sc->hn_caps & HN_CAP_VLAN) {
2368 		/* XXX not sure about VLAN_MTU. */
2369 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2370 	}
2371 
2372 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2373 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2374 		ifp->if_capabilities |= IFCAP_TXCSUM;
2375 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2376 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2377 	if (sc->hn_caps & HN_CAP_TSO4) {
2378 		ifp->if_capabilities |= IFCAP_TSO4;
2379 		ifp->if_hwassist |= CSUM_IP_TSO;
2380 	}
2381 	if (sc->hn_caps & HN_CAP_TSO6) {
2382 		ifp->if_capabilities |= IFCAP_TSO6;
2383 		ifp->if_hwassist |= CSUM_IP6_TSO;
2384 	}
2385 
2386 	/* Enable all available capabilities by default. */
2387 	ifp->if_capenable = ifp->if_capabilities;
2388 
2389 	/*
2390 	 * Disable IPv6 TSO and TXCSUM by default; they can still
2391 	 * be enabled through SIOCSIFCAP.
2392 	 */
2393 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2394 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2395 
2396 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2397 		/*
2398 		 * Lock hn_set_tso_maxsize() to simplify its
2399 		 * internal logic.
2400 		 */
2401 		HN_LOCK(sc);
2402 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2403 		HN_UNLOCK(sc);
2404 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2405 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2406 	}
2407 
2408 	ether_ifattach(ifp, eaddr);
2409 
2410 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2411 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2412 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2413 	}
2414 	if (mtu < ETHERMTU) {
2415 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2416 		ifp->if_mtu = mtu;
2417 	}
2418 
2419 	/* Inform the upper layer about the long frame support. */
2420 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2421 
2422 	/*
2423 	 * Kick off link status check.
2424 	 */
2425 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2426 	hn_update_link_status(sc);
2427 
2428 	if (!hn_xpnt_vf) {
2429 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2430 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2431 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2432 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2433 	} else {
2434 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2435 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2436 	}
2437 
2438 	/*
2439 	 * NOTE:
2440 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2441 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2442 	 * available when the ifnet_arrival event is triggered.
2443 	 */
2444 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2445 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2446 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2447 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2448 
2449 	return (0);
2450 failed:
2451 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2452 		hn_synth_detach(sc);
2453 	hn_detach(dev);
2454 	return (error);
2455 }
2456 
2457 static int
2458 hn_detach(device_t dev)
2459 {
2460 	struct hn_softc *sc = device_get_softc(dev);
2461 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2462 
2463 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2464 		/*
2465 		 * In case the vmbus missed the orphan handler
2466 		 * installation.
2467 		 */
2468 		vmbus_xact_ctx_orphan(sc->hn_xact);
2469 	}
2470 
2471 	if (sc->hn_ifaddr_evthand != NULL)
2472 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2473 	if (sc->hn_ifnet_evthand != NULL)
2474 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2475 	if (sc->hn_ifnet_atthand != NULL) {
2476 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2477 		    sc->hn_ifnet_atthand);
2478 	}
2479 	if (sc->hn_ifnet_dethand != NULL) {
2480 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2481 		    sc->hn_ifnet_dethand);
2482 	}
2483 	if (sc->hn_ifnet_lnkhand != NULL)
2484 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2485 
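	/*
	 * Snapshot hn_vf_ifp once; the compiler barrier keeps the NULL
	 * check and the use below on that single read.
	 */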
2486 	vf_ifp = sc->hn_vf_ifp;
2487 	__compiler_membar();
2488 	if (vf_ifp != NULL)
2489 		hn_ifnet_detevent(sc, vf_ifp);
2490 
2491 	if (device_is_attached(dev)) {
2492 		HN_LOCK(sc);
2493 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2494 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2495 				hn_stop(sc, true);
2496 			/*
2497 			 * NOTE:
2498 			 * hn_stop() only suspends data, so management
2499 			 * tasks have to be suspended manually here.
2500 			 */
2501 			hn_suspend_mgmt(sc);
2502 			hn_synth_detach(sc);
2503 		}
2504 		HN_UNLOCK(sc);
2505 		ether_ifdetach(ifp);
2506 	}
2507 
2508 	ifmedia_removeall(&sc->hn_media);
2509 	hn_destroy_rx_data(sc);
2510 	hn_destroy_tx_data(sc);
2511 
2512 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2513 		int i;
2514 
2515 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2516 			taskqueue_free(sc->hn_tx_taskqs[i]);
2517 		free(sc->hn_tx_taskqs, M_DEVBUF);
2518 	}
2519 	taskqueue_free(sc->hn_mgmt_taskq0);
2520 	if (sc->hn_vf_taskq != NULL)
2521 		taskqueue_free(sc->hn_vf_taskq);
2522 
2523 	if (sc->hn_xact != NULL) {
2524 		/*
2525 		 * Uninstall the orphan handler _before_ the xact is
2526 		 * destructed.
2527 		 */
2528 		vmbus_chan_unset_orphan(sc->hn_prichan);
2529 		vmbus_xact_ctx_destroy(sc->hn_xact);
2530 	}
2531 
2532 	if_free(ifp);
2533 
2534 	HN_LOCK_DESTROY(sc);
2535 	rm_destroy(&sc->hn_vf_lock);
2536 	return (0);
2537 }
2538 
2539 static int
2540 hn_shutdown(device_t dev)
2541 {
2542 
2543 	return (0);
2544 }
2545 
2546 static void
2547 hn_link_status(struct hn_softc *sc)
2548 {
2549 	uint32_t link_status;
2550 	int error;
2551 
2552 	error = hn_rndis_get_linkstatus(sc, &link_status);
2553 	if (error) {
2554 		/* XXX what to do? */
2555 		return;
2556 	}
2557 
2558 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2559 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2560 	else
2561 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2562 	if_link_state_change(sc->hn_ifp,
2563 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2564 	    LINK_STATE_UP : LINK_STATE_DOWN);
2565 }
2566 
2567 static void
2568 hn_link_taskfunc(void *xsc, int pending __unused)
2569 {
2570 	struct hn_softc *sc = xsc;
2571 
2572 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2573 		return;
2574 	hn_link_status(sc);
2575 }
2576 
2577 static void
2578 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2579 {
2580 	struct hn_softc *sc = xsc;
2581 
2582 	/* Prevent any link status checks from running. */
2583 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2584 
2585 	/*
2586 	 * Fake up a [link down --> link up] state change; 5 seconds
2587 	 * delay is used, which closely simulates miibus reaction
2588 	 * upon link down event.
2589 	 */
2590 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2591 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2592 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2593 	    &sc->hn_netchg_status, 5 * hz);
2594 }
2595 
2596 static void
2597 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2598 {
2599 	struct hn_softc *sc = xsc;
2600 
2601 	/* Re-allow link status checks. */
2602 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2603 	hn_link_status(sc);
2604 }
2605 
2606 static void
2607 hn_update_link_status(struct hn_softc *sc)
2608 {
2609 
2610 	if (sc->hn_mgmt_taskq != NULL)
2611 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2612 }
2613 
2614 static void
2615 hn_change_network(struct hn_softc *sc)
2616 {
2617 
2618 	if (sc->hn_mgmt_taskq != NULL)
2619 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2620 }
2621 
2622 static __inline int
2623 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2624     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2625 {
2626 	struct mbuf *m = *m_head;
2627 	int error;
2628 
2629 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2630 
2631 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2632 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2633 	if (error == EFBIG) {
2634 		struct mbuf *m_new;
2635 
2636 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2637 		if (m_new == NULL)
2638 			return ENOBUFS;
2639 		else
2640 			*m_head = m = m_new;
2641 		txr->hn_tx_collapsed++;
2642 
2643 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2644 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2645 	}
2646 	if (!error) {
2647 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2648 		    BUS_DMASYNC_PREWRITE);
2649 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2650 	}
2651 	return error;
2652 }
2653 
2654 static __inline int
2655 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2656 {
2657 
2658 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2659 	    ("put an onlist txd %#x", txd->flags));
2660 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2661 	    ("put an onagg txd %#x", txd->flags));
2662 
2663 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2664 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2665 		return 0;
2666 
2667 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2668 		struct hn_txdesc *tmp_txd;
2669 
2670 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2671 			int freed;
2672 
2673 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2674 			    ("recursive aggregation on aggregated txdesc"));
2675 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2676 			    ("not aggregated txdesc"));
2677 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2678 			    ("aggregated txdesc uses dmamap"));
2679 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2680 			    ("aggregated txdesc consumes "
2681 			     "chimney sending buffer"));
2682 			KASSERT(tmp_txd->chim_size == 0,
2683 			    ("aggregated txdesc has non-zero "
2684 			     "chimney sending size"));
2685 
2686 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2687 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2688 			freed = hn_txdesc_put(txr, tmp_txd);
2689 			KASSERT(freed, ("failed to free aggregated txdesc"));
2690 		}
2691 	}
2692 
2693 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2694 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2695 		    ("chim txd uses dmamap"));
2696 		hn_chim_free(txr->hn_sc, txd->chim_index);
2697 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2698 		txd->chim_size = 0;
2699 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2700 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2701 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2702 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2703 		    txd->data_dmap);
2704 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2705 	}
2706 
2707 	if (txd->m != NULL) {
2708 		m_freem(txd->m);
2709 		txd->m = NULL;
2710 	}
2711 
2712 	txd->flags |= HN_TXD_FLAG_ONLIST;
2713 #ifndef HN_USE_TXDESC_BUFRING
2714 	mtx_lock_spin(&txr->hn_txlist_spin);
2715 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2716 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2717 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2718 	txr->hn_txdesc_avail++;
2719 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2720 	mtx_unlock_spin(&txr->hn_txlist_spin);
2721 #else	/* HN_USE_TXDESC_BUFRING */
2722 #ifdef HN_DEBUG
2723 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2724 #endif
2725 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2726 #endif	/* !HN_USE_TXDESC_BUFRING */
2727 
2728 	return 1;
2729 }
2730 
2731 static __inline struct hn_txdesc *
2732 hn_txdesc_get(struct hn_tx_ring *txr)
2733 {
2734 	struct hn_txdesc *txd;
2735 
2736 #ifndef HN_USE_TXDESC_BUFRING
2737 	mtx_lock_spin(&txr->hn_txlist_spin);
2738 	txd = SLIST_FIRST(&txr->hn_txlist);
2739 	if (txd != NULL) {
2740 		KASSERT(txr->hn_txdesc_avail > 0,
2741 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2742 		txr->hn_txdesc_avail--;
2743 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2744 	}
2745 	mtx_unlock_spin(&txr->hn_txlist_spin);
2746 #else
2747 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2748 #endif
2749 
2750 	if (txd != NULL) {
2751 #ifdef HN_USE_TXDESC_BUFRING
2752 #ifdef HN_DEBUG
2753 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2754 #endif
2755 #endif	/* HN_USE_TXDESC_BUFRING */
2756 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2757 		    STAILQ_EMPTY(&txd->agg_list) &&
2758 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2759 		    txd->chim_size == 0 &&
2760 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2761 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2762 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2763 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2764 		txd->refs = 1;
2765 	}
2766 	return txd;
2767 }
2768 
2769 static __inline void
2770 hn_txdesc_hold(struct hn_txdesc *txd)
2771 {
2772 
2773 	/* 0->1 transition will never work */
2774 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2775 	atomic_add_int(&txd->refs, 1);
2776 }
2777 
2778 static __inline void
2779 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2780 {
2781 
2782 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2783 	    ("recursive aggregation on aggregating txdesc"));
2784 
2785 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2786 	    ("already aggregated"));
2787 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2788 	    ("recursive aggregation on to-be-aggregated txdesc"));
2789 
2790 	txd->flags |= HN_TXD_FLAG_ONAGG;
2791 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2792 }
2793 
2794 static bool
2795 hn_tx_ring_pending(struct hn_tx_ring *txr)
2796 {
2797 	bool pending = false;
2798 
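	/*
	 * All txdescs are on the free list (or the buf_ring) when the
	 * ring is idle, so any shortfall means transmissions are still
	 * outstanding.
	 */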
2799 #ifndef HN_USE_TXDESC_BUFRING
2800 	mtx_lock_spin(&txr->hn_txlist_spin);
2801 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2802 		pending = true;
2803 	mtx_unlock_spin(&txr->hn_txlist_spin);
2804 #else
2805 	if (!buf_ring_full(txr->hn_txdesc_br))
2806 		pending = true;
2807 #endif
2808 	return (pending);
2809 }
2810 
2811 static __inline void
2812 hn_txeof(struct hn_tx_ring *txr)
2813 {
2814 	txr->hn_has_txeof = 0;
2815 	txr->hn_txeof(txr);
2816 }
2817 
2818 static void
2819 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2820     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2821 {
2822 	struct hn_txdesc *txd = sndc->hn_cbarg;
2823 	struct hn_tx_ring *txr;
2824 
2825 	txr = txd->txr;
2826 	KASSERT(txr->hn_chan == chan,
2827 	    ("channel mismatch, on chan%u, should be chan%u",
2828 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2829 
2830 	txr->hn_has_txeof = 1;
2831 	hn_txdesc_put(txr, txd);
2832 
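	/*
	 * Run TX completion processing early, once enough sends have
	 * completed and the ring is marked oactive; otherwise it is
	 * deferred to hn_chan_rollup().
	 */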
2833 	++txr->hn_txdone_cnt;
2834 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2835 		txr->hn_txdone_cnt = 0;
2836 		if (txr->hn_oactive)
2837 			hn_txeof(txr);
2838 	}
2839 }
2840 
2841 static void
2842 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2843 {
2844 #if defined(INET) || defined(INET6)
2845 	tcp_lro_flush_all(&rxr->hn_lro);
2846 #endif
2847 
2848 	/*
2849 	 * NOTE:
2850 	 * 'txr' could be NULL, if multiple channels and
2851 	 * the ifnet.if_start method are used.
2852 	 */
2853 	if (txr == NULL || !txr->hn_has_txeof)
2854 		return;
2855 
2856 	txr->hn_txdone_cnt = 0;
2857 	hn_txeof(txr);
2858 }
2859 
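/*
 * On the wire, RNDIS packet message offsets are counted from the
 * rm_dataoffset field; convert an offset counted from the start of
 * the message into that convention.
 */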
2860 static __inline uint32_t
2861 hn_rndis_pktmsg_offset(uint32_t ofs)
2862 {
2863 
2864 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2865 	    ("invalid RNDIS packet msg offset %u", ofs));
2866 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2867 }
2868 
2869 static __inline void *
2870 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2871     size_t pi_dlen, uint32_t pi_type)
2872 {
2873 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2874 	struct rndis_pktinfo *pi;
2875 
2876 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2877 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2878 
2879 	/*
2880 	 * Per-packet-info does not move; it only grows.
2881 	 *
2882 	 * NOTE:
2883 	 * rm_pktinfooffset in this phase counts from the beginning
2884 	 * of rndis_packet_msg.
2885 	 */
2886 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2887 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2888 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2889 	    pkt->rm_pktinfolen);
2890 	pkt->rm_pktinfolen += pi_size;
2891 
2892 	pi->rm_size = pi_size;
2893 	pi->rm_type = pi_type;
2894 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2895 
2896 	return (pi->rm_data);
2897 }
2898 
2899 static __inline int
2900 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2901 {
2902 	struct hn_txdesc *txd;
2903 	struct mbuf *m;
2904 	int error, pkts;
2905 
2906 	txd = txr->hn_agg_txd;
2907 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2908 
2909 	/*
2910 	 * Since hn_txpkt() will reset this temporary stat, save
2911 	 * it now, so that oerrors can be updated properly, if
2912 	 * hn_txpkt() ever fails.
2913 	 */
2914 	pkts = txr->hn_stat_pkts;
2915 
2916 	/*
2917 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2918 	 * failure, save it for later freeing, if hn_txpkt() ever
2919 	 * fails.
2920 	 */
2921 	m = txd->m;
2922 	error = hn_txpkt(ifp, txr, txd);
2923 	if (__predict_false(error)) {
2924 		/* txd is freed, but m is not. */
2925 		m_freem(m);
2926 
2927 		txr->hn_flush_failed++;
2928 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2929 	}
2930 
2931 	/* Reset all aggregation states. */
2932 	txr->hn_agg_txd = NULL;
2933 	txr->hn_agg_szleft = 0;
2934 	txr->hn_agg_pktleft = 0;
2935 	txr->hn_agg_prevpkt = NULL;
2936 
2937 	return (error);
2938 }
2939 
2940 static void *
2941 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2942     int pktsize)
2943 {
2944 	void *chim;
2945 
2946 	if (txr->hn_agg_txd != NULL) {
2947 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2948 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2949 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2950 			int olen;
2951 
2952 			/*
2953 			 * Update the previous RNDIS packet's total length;
2954 			 * it can be increased due to the mandatory alignment
2955 			 * padding for this RNDIS packet.  And update the
2956 			 * aggregating txdesc's chimney sending buffer size
2957 			 * accordingly.
2958 			 *
2959 			 * XXX
2960 			 * Zero-out the padding, as required by the RNDIS spec.
2961 			 */
2962 			olen = pkt->rm_len;
2963 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2964 			agg_txd->chim_size += pkt->rm_len - olen;
2965 
2966 			/* Link this txdesc to the parent. */
2967 			hn_txdesc_agg(agg_txd, txd);
2968 
2969 			chim = (uint8_t *)pkt + pkt->rm_len;
2970 			/* Save the current packet for later fixup. */
2971 			txr->hn_agg_prevpkt = chim;
2972 
2973 			txr->hn_agg_pktleft--;
2974 			txr->hn_agg_szleft -= pktsize;
2975 			if (txr->hn_agg_szleft <=
2976 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2977 				/*
2978 				 * Probably can't aggregate more packets,
2979 				 * flush this aggregating txdesc proactively.
2980 				 */
2981 				txr->hn_agg_pktleft = 0;
2982 			}
2983 			/* Done! */
2984 			return (chim);
2985 		}
2986 		hn_flush_txagg(ifp, txr);
2987 	}
2988 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2989 
2990 	txr->hn_tx_chimney_tried++;
2991 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2992 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2993 		return (NULL);
2994 	txr->hn_tx_chimney++;
2995 
2996 	chim = txr->hn_sc->hn_chim +
2997 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2998 
2999 	if (txr->hn_agg_pktmax > 1 &&
3000 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3001 		txr->hn_agg_txd = txd;
3002 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3003 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3004 		txr->hn_agg_prevpkt = chim;
3005 	}
3006 	return (chim);
3007 }
3008 
3009 /*
3010  * NOTE:
3011  * If this function fails, then both txd and m_head0 will be freed.
3012  */
3013 static int
3014 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3015     struct mbuf **m_head0)
3016 {
3017 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3018 	int error, nsegs, i;
3019 	struct mbuf *m_head = *m_head0;
3020 	struct rndis_packet_msg *pkt;
3021 	uint32_t *pi_data;
3022 	void *chim = NULL;
3023 	int pkt_hlen, pkt_size;
3024 
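	/*
	 * Small packets are copied into the chimney sending buffer
	 * (possibly aggregated with other packets); larger packets are
	 * sent via scatter/gather DMA.
	 */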
3025 	pkt = txd->rndis_pkt;
3026 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3027 	if (pkt_size < txr->hn_chim_size) {
3028 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3029 		if (chim != NULL)
3030 			pkt = chim;
3031 	} else {
3032 		if (txr->hn_agg_txd != NULL)
3033 			hn_flush_txagg(ifp, txr);
3034 	}
3035 
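	/*
	 * Fill in the RNDIS packet message header.  The offsets are
	 * converted to the on-wire convention after all per-packet-info
	 * has been appended.
	 */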
3036 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3037 	pkt->rm_len = m_head->m_pkthdr.len;
3038 	pkt->rm_dataoffset = 0;
3039 	pkt->rm_datalen = m_head->m_pkthdr.len;
3040 	pkt->rm_oobdataoffset = 0;
3041 	pkt->rm_oobdatalen = 0;
3042 	pkt->rm_oobdataelements = 0;
3043 	pkt->rm_pktinfooffset = sizeof(*pkt);
3044 	pkt->rm_pktinfolen = 0;
3045 	pkt->rm_vchandle = 0;
3046 	pkt->rm_reserved = 0;
3047 
3048 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3049 		/*
3050 		 * Set the hash value for this packet, so that the host could
3051 		 * dispatch the TX done event for this packet back to this TX
3052 		 * ring's channel.
3053 		 */
3054 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3055 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3056 		*pi_data = txr->hn_tx_idx;
3057 	}
3058 
3059 	if (m_head->m_flags & M_VLANTAG) {
3060 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3061 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3062 		*pi_data = NDIS_VLAN_INFO_MAKE(
3063 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3064 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3065 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3066 	}
3067 
3068 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3069 #if defined(INET6) || defined(INET)
3070 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3071 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3072 #ifdef INET
3073 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3074 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3075 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3076 			    m_head->m_pkthdr.tso_segsz);
3077 		}
3078 #endif
3079 #if defined(INET6) && defined(INET)
3080 		else
3081 #endif
3082 #ifdef INET6
3083 		{
3084 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3085 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3086 			    m_head->m_pkthdr.tso_segsz);
3087 		}
3088 #endif
3089 #endif	/* INET6 || INET */
3090 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3091 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3092 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3093 		if (m_head->m_pkthdr.csum_flags &
3094 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3095 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3096 		} else {
3097 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3098 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3099 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3100 		}
3101 
3102 		if (m_head->m_pkthdr.csum_flags &
3103 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3104 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3105 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3106 		} else if (m_head->m_pkthdr.csum_flags &
3107 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3108 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3109 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3110 		}
3111 	}
3112 
3113 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3114 	/* Fixup RNDIS packet message total length */
3115 	pkt->rm_len += pkt_hlen;
3116 	/* Convert RNDIS packet message offsets */
3117 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3118 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3119 
3120 	/*
3121 	 * Fast path: Chimney sending.
3122 	 */
3123 	if (chim != NULL) {
3124 		struct hn_txdesc *tgt_txd = txd;
3125 
3126 		if (txr->hn_agg_txd != NULL) {
3127 			tgt_txd = txr->hn_agg_txd;
3128 #ifdef INVARIANTS
3129 			*m_head0 = NULL;
3130 #endif
3131 		}
3132 
3133 		KASSERT(pkt == chim,
3134 		    ("RNDIS pkt not in chimney sending buffer"));
3135 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3136 		    ("chimney sending buffer is not used"));
3137 		tgt_txd->chim_size += pkt->rm_len;
3138 
3139 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3140 		    ((uint8_t *)chim) + pkt_hlen);
3141 
3142 		txr->hn_gpa_cnt = 0;
3143 		txr->hn_sendpkt = hn_txpkt_chim;
3144 		goto done;
3145 	}
3146 
3147 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3148 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3149 	    ("chimney buffer is used"));
3150 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3151 
3152 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3153 	if (__predict_false(error)) {
3154 		int freed;
3155 
3156 		/*
3157 		 * This mbuf is not linked w/ the txd yet, so free it now.
3158 		 */
3159 		m_freem(m_head);
3160 		*m_head0 = NULL;
3161 
3162 		freed = hn_txdesc_put(txr, txd);
3163 		KASSERT(freed != 0,
3164 		    ("fail to free txd upon txdma error"));
3165 
3166 		txr->hn_txdma_failed++;
3167 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3168 		return error;
3169 	}
3170 	*m_head0 = m_head;
3171 
3172 	/* +1 RNDIS packet message */
3173 	txr->hn_gpa_cnt = nsegs + 1;
3174 
3175 	/* send packet with page buffer */
3176 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3177 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3178 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3179 
3180 	/*
3181 	 * Fill the page buffers with mbuf info after the page
3182 	 * buffer for RNDIS packet message.
3183 	 */
3184 	for (i = 0; i < nsegs; ++i) {
3185 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3186 
3187 		gpa->gpa_page = atop(segs[i].ds_addr);
3188 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3189 		gpa->gpa_len = segs[i].ds_len;
3190 	}
3191 
3192 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3193 	txd->chim_size = 0;
3194 	txr->hn_sendpkt = hn_txpkt_sglist;
3195 done:
3196 	txd->m = m_head;
3197 
3198 	/* Set the completion routine */
3199 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3200 
3201 	/* Update temporary stats for later use. */
3202 	txr->hn_stat_pkts++;
3203 	txr->hn_stat_size += m_head->m_pkthdr.len;
3204 	if (m_head->m_flags & M_MCAST)
3205 		txr->hn_stat_mcasts++;
3206 
3207 	return 0;
3208 }
3209 
3210 /*
3211  * NOTE:
3212  * If this function fails, then txd will be freed, but the mbuf
3213  * associated w/ the txd will _not_ be freed.
3214  */
3215 static int
3216 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3217 {
3218 	int error, send_failed = 0, has_bpf;
3219 
3220 again:
3221 	has_bpf = bpf_peers_present(ifp->if_bpf);
3222 	if (has_bpf) {
3223 		/*
3224 		 * Make sure that this txd and any aggregated txds are not
3225 		 * freed before ETHER_BPF_MTAP.
3226 		 */
3227 		hn_txdesc_hold(txd);
3228 	}
3229 	error = txr->hn_sendpkt(txr, txd);
3230 	if (!error) {
3231 		if (has_bpf) {
3232 			const struct hn_txdesc *tmp_txd;
3233 
3234 			ETHER_BPF_MTAP(ifp, txd->m);
3235 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3236 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3237 		}
3238 
3239 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3240 #ifdef HN_IFSTART_SUPPORT
3241 		if (!hn_use_if_start)
3242 #endif
3243 		{
3244 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3245 			    txr->hn_stat_size);
3246 			if (txr->hn_stat_mcasts != 0) {
3247 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3248 				    txr->hn_stat_mcasts);
3249 			}
3250 		}
3251 		txr->hn_pkts += txr->hn_stat_pkts;
3252 		txr->hn_sends++;
3253 	}
3254 	if (has_bpf)
3255 		hn_txdesc_put(txr, txd);
3256 
3257 	if (__predict_false(error)) {
3258 		int freed;
3259 
3260 		/*
3261 		 * This should "really rarely" happen.
3262 		 *
3263 		 * XXX Too many RX to be acked or too many sideband
3264 		 * commands to run?  Ask netvsc_channel_rollup()
3265 		 * to kick start later.
3266 		 */
3267 		txr->hn_has_txeof = 1;
3268 		if (!send_failed) {
3269 			txr->hn_send_failed++;
3270 			send_failed = 1;
3271 			/*
3272 			 * Try sending again after setting hn_has_txeof,
3273 			 * in case we missed the last
3274 			 * netvsc_channel_rollup().
3275 			 */
3276 			goto again;
3277 		}
3278 		if_printf(ifp, "send failed\n");
3279 
3280 		/*
3281 		 * Caller will perform further processing on the
3282 		 * associated mbuf, so don't free it in hn_txdesc_put();
3283 		 * only unload it from the DMA map in hn_txdesc_put(),
3284 		 * if it was loaded.
3285 		 */
3286 		txd->m = NULL;
3287 		freed = hn_txdesc_put(txr, txd);
3288 		KASSERT(freed != 0,
3289 		    ("fail to free txd upon send error"));
3290 
3291 		txr->hn_send_failed++;
3292 	}
3293 
3294 	/* Reset temporary stats, after this sending is done. */
3295 	txr->hn_stat_size = 0;
3296 	txr->hn_stat_pkts = 0;
3297 	txr->hn_stat_mcasts = 0;
3298 
3299 	return (error);
3300 }
3301 
3302 /*
3303  * Append the specified data to the indicated mbuf chain;
3304  * extend the mbuf chain if the new data does not fit in
3305  * existing space.
3306  *
3307  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3308  * There should be an equivalent in the kernel mbuf code,
3309  * but there does not appear to be one yet.
3310  *
3311  * Differs from m_append() in that additional mbufs are
3312  * allocated with cluster size MJUMPAGESIZE, and filled
3313  * accordingly.
3314  *
3315  * Return 1 if able to complete the job; otherwise 0.
3316  */
3317 static int
3318 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3319 {
3320 	struct mbuf *m, *n;
3321 	int remainder, space;
3322 
3323 	for (m = m0; m->m_next != NULL; m = m->m_next)
3324 		;
3325 	remainder = len;
3326 	space = M_TRAILINGSPACE(m);
3327 	if (space > 0) {
3328 		/*
3329 		 * Copy into available space.
3330 		 */
3331 		if (space > remainder)
3332 			space = remainder;
3333 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3334 		m->m_len += space;
3335 		cp += space;
3336 		remainder -= space;
3337 	}
3338 	while (remainder > 0) {
3339 		/*
3340 		 * Allocate a new mbuf; could check space
3341 		 * and allocate a cluster instead.
3342 		 */
3343 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3344 		if (n == NULL)
3345 			break;
3346 		n->m_len = min(MJUMPAGESIZE, remainder);
3347 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3348 		cp += n->m_len;
3349 		remainder -= n->m_len;
3350 		m->m_next = n;
3351 		m = n;
3352 	}
3353 	if (m0->m_flags & M_PKTHDR)
3354 		m0->m_pkthdr.len += len - remainder;
3355 
3356 	return (remainder == 0);
3357 }
3358 
3359 #if defined(INET) || defined(INET6)
3360 static __inline int
3361 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3362 {
3363 #if __FreeBSD_version >= 1100095
3364 	if (hn_lro_mbufq_depth) {
3365 		tcp_lro_queue_mbuf(lc, m);
3366 		return 0;
3367 	}
3368 #endif
3369 	return tcp_lro_rx(lc, m, 0);
3370 }
3371 #endif
3372 
3373 static int
3374 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3375     const struct hn_rxinfo *info)
3376 {
3377 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3378 	struct mbuf *m_new;
3379 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3380 	int hash_type = M_HASHTYPE_NONE;
3381 
3382 	ifp = hn_ifp;
3383 	if (rxr->hn_rxvf_ifp != NULL) {
3384 		/*
3385 		 * Non-transparent mode VF; pretend this packet is from
3386 		 * the VF.
3387 		 */
3388 		ifp = rxr->hn_rxvf_ifp;
3389 		is_vf = 1;
3390 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3391 		/* Transparent mode VF. */
3392 		is_vf = 1;
3393 	}
3394 
3395 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3396 		/*
3397 		 * NOTE:
3398 		 * See the NOTE of hn_rndis_init_fixat().  This
3399 		 * function can be reached immediately after the
3400 		 * RNDIS is initialized but before the ifnet is
3401 		 * set up on the hn_attach() path; drop the unexpected
3402 		 * packets.
3403 		 */
3404 		return (0);
3405 	}
3406 
3407 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3408 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3409 		return (0);
3410 	}
3411 
3412 	if (dlen <= MHLEN) {
3413 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3414 		if (m_new == NULL) {
3415 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3416 			return (0);
3417 		}
3418 		memcpy(mtod(m_new, void *), data, dlen);
3419 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3420 		rxr->hn_small_pkts++;
3421 	} else {
3422 		/*
3423 		 * Get an mbuf with a cluster.  For packets 2K or less,
3424 		 * get a standard 2K cluster.  For anything larger, get a
3425 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3426 		 * if looped around to the Hyper-V TX channel, so avoid them.
3427 		 */
3428 		size = MCLBYTES;
3429 		if (dlen > MCLBYTES) {
3430 			/* 4096 */
3431 			size = MJUMPAGESIZE;
3432 		}
3433 
3434 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3435 		if (m_new == NULL) {
3436 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3437 			return (0);
3438 		}
3439 
3440 		hv_m_append(m_new, dlen, data);
3441 	}
3442 	m_new->m_pkthdr.rcvif = ifp;
3443 
3444 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3445 		do_csum = 0;
3446 
3447 	/* receive side checksum offload */
3448 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3449 		/* IP csum offload */
3450 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3451 			m_new->m_pkthdr.csum_flags |=
3452 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3453 			rxr->hn_csum_ip++;
3454 		}
3455 
3456 		/* TCP/UDP csum offload */
3457 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3458 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3459 			m_new->m_pkthdr.csum_flags |=
3460 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3461 			m_new->m_pkthdr.csum_data = 0xffff;
3462 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3463 				rxr->hn_csum_tcp++;
3464 			else
3465 				rxr->hn_csum_udp++;
3466 		}
3467 
3468 		/*
3469 		 * XXX
3470 		 * As of this writing (Oct 28th, 2016), the host side will turn
3471 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3472 		 * the do_lro setting here is actually _not_ accurate.  We
3473 		 * depend on the RSS hash type check to reset do_lro.
3474 		 */
3475 		if ((info->csum_info &
3476 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3477 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3478 			do_lro = 1;
3479 	} else {
3480 		const struct ether_header *eh;
3481 		uint16_t etype;
3482 		int hoff;
3483 
3484 		hoff = sizeof(*eh);
3485 		/* Checked at the beginning of this function. */
3486 		KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3487 
3488 		eh = mtod(m_new, struct ether_header *);
3489 		etype = ntohs(eh->ether_type);
3490 		if (etype == ETHERTYPE_VLAN) {
3491 			const struct ether_vlan_header *evl;
3492 
3493 			hoff = sizeof(*evl);
3494 			if (m_new->m_len < hoff)
3495 				goto skip;
3496 			evl = mtod(m_new, struct ether_vlan_header *);
3497 			etype = ntohs(evl->evl_proto);
3498 		}
3499 
3500 		if (etype == ETHERTYPE_IP) {
3501 			int pr;
3502 
3503 			pr = hn_check_iplen(m_new, hoff);
3504 			if (pr == IPPROTO_TCP) {
3505 				if (do_csum &&
3506 				    (rxr->hn_trust_hcsum &
3507 				     HN_TRUST_HCSUM_TCP)) {
3508 					rxr->hn_csum_trusted++;
3509 					m_new->m_pkthdr.csum_flags |=
3510 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3511 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3512 					m_new->m_pkthdr.csum_data = 0xffff;
3513 				}
3514 				do_lro = 1;
3515 			} else if (pr == IPPROTO_UDP) {
3516 				if (do_csum &&
3517 				    (rxr->hn_trust_hcsum &
3518 				     HN_TRUST_HCSUM_UDP)) {
3519 					rxr->hn_csum_trusted++;
3520 					m_new->m_pkthdr.csum_flags |=
3521 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3522 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3523 					m_new->m_pkthdr.csum_data = 0xffff;
3524 				}
3525 			} else if (pr != IPPROTO_DONE && do_csum &&
3526 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3527 				rxr->hn_csum_trusted++;
3528 				m_new->m_pkthdr.csum_flags |=
3529 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3530 			}
3531 		}
3532 	}
3533 skip:
3534 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3535 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3536 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3537 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3538 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3539 		m_new->m_flags |= M_VLANTAG;
3540 	}
3541 
3542 	/*
3543 	 * If a VF is activated (transparent/non-transparent mode does not
3544 	 * matter here).
3545 	 *
3546 	 * - Disable LRO
3547 	 *
3548 	 *   hn(4) will only receive broadcast packets, multicast packets,
3549 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3550 	 *   packet types.
3551 	 *
3552 	 *   For non-transparent mode, we definitely _cannot_ enable LRO at
3553 	 *   all, since the LRO flush will use hn(4) as the receiving
3554 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3555 	 */
3556 	if (is_vf)
3557 		do_lro = 0;
3558 
3559 	/*
3560 	 * If VF is activated (transparent/non-transparent mode does not
3561 	 * matter here), do _not_ mess with unsupported hash types or
3562 	 * functions.
3563 	 */
3564 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3565 		rxr->hn_rss_pkts++;
3566 		m_new->m_pkthdr.flowid = info->hash_value;
3567 		if (!is_vf)
3568 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3569 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3570 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3571 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3572 			    rxr->hn_mbuf_hash);
3573 
3574 			/*
3575 			 * NOTE:
3576 			 * do_lro is reset if the hash types are not TCP
3577 			 * related.  See the comment in the above csum_flags
3578 			 * setup section.
3579 			 */
3580 			switch (type) {
3581 			case NDIS_HASH_IPV4:
3582 				hash_type = M_HASHTYPE_RSS_IPV4;
3583 				do_lro = 0;
3584 				break;
3585 
3586 			case NDIS_HASH_TCP_IPV4:
3587 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3588 				break;
3589 
3590 			case NDIS_HASH_IPV6:
3591 				hash_type = M_HASHTYPE_RSS_IPV6;
3592 				do_lro = 0;
3593 				break;
3594 
3595 			case NDIS_HASH_IPV6_EX:
3596 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3597 				do_lro = 0;
3598 				break;
3599 
3600 			case NDIS_HASH_TCP_IPV6:
3601 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3602 				break;
3603 
3604 			case NDIS_HASH_TCP_IPV6_EX:
3605 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3606 				break;
3607 			}
3608 		}
3609 	} else if (!is_vf) {
3610 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3611 		hash_type = M_HASHTYPE_OPAQUE;
3612 	}
3613 	M_HASHTYPE_SET(m_new, hash_type);
3614 
3615 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3616 	if (hn_ifp != ifp) {
3617 		const struct ether_header *eh;
3618 
3619 		/*
3620 		 * Non-transparent mode VF is activated.
3621 		 */
3622 
3623 		/*
3624 		 * Allow tapping on hn(4).
3625 		 */
3626 		ETHER_BPF_MTAP(hn_ifp, m_new);
3627 
3628 		/*
3629 		 * Update hn(4)'s stats.
3630 		 */
3631 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3632 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3633 		/* Checked at the beginning of this function. */
3634 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3635 		eh = mtod(m_new, struct ether_header *);
3636 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3637 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3638 	}
3639 	rxr->hn_pkts++;
3640 
3641 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3642 #if defined(INET) || defined(INET6)
3643 		struct lro_ctrl *lro = &rxr->hn_lro;
3644 
3645 		if (lro->lro_cnt) {
3646 			rxr->hn_lro_tried++;
3647 			if (hn_lro_rx(lro, m_new) == 0) {
3648 				/* DONE! */
3649 				return 0;
3650 			}
3651 		}
3652 #endif
3653 	}
3654 	ifp->if_input(ifp, m_new);
3655 
3656 	return (0);
3657 }
3658 
3659 static int
3660 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3661 {
3662 	struct hn_softc *sc = ifp->if_softc;
3663 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3664 	struct ifnet *vf_ifp;
3665 	int mask, error = 0;
3666 	struct ifrsskey *ifrk;
3667 	struct ifrsshash *ifrh;
3668 	uint32_t mtu;
3669 
3670 	switch (cmd) {
3671 	case SIOCSIFMTU:
3672 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3673 			error = EINVAL;
3674 			break;
3675 		}
3676 
3677 		HN_LOCK(sc);
3678 
3679 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3680 			HN_UNLOCK(sc);
3681 			break;
3682 		}
3683 
3684 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3685 			/* Can't change MTU */
3686 			HN_UNLOCK(sc);
3687 			error = EOPNOTSUPP;
3688 			break;
3689 		}
3690 
3691 		if (ifp->if_mtu == ifr->ifr_mtu) {
3692 			HN_UNLOCK(sc);
3693 			break;
3694 		}
3695 
3696 		if (hn_xpnt_vf_isready(sc)) {
3697 			vf_ifp = sc->hn_vf_ifp;
3698 			ifr_vf = *ifr;
3699 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3700 			    sizeof(ifr_vf.ifr_name));
3701 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3702 			    (caddr_t)&ifr_vf);
3703 			if (error) {
3704 				HN_UNLOCK(sc);
3705 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3706 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3707 				break;
3708 			}
3709 		}
3710 
3711 		/*
3712 		 * Suspend this interface before the synthetic parts
3713 		 * are ripped.
3714 		 */
3715 		hn_suspend(sc);
3716 
3717 		/*
3718 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3719 		 */
3720 		hn_synth_detach(sc);
3721 
3722 		/*
3723 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3724 		 * with the new MTU setting.
3725 		 */
3726 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3727 		if (error) {
3728 			HN_UNLOCK(sc);
3729 			break;
3730 		}
3731 
3732 		error = hn_rndis_get_mtu(sc, &mtu);
3733 		if (error)
3734 			mtu = ifr->ifr_mtu;
3735 		else if (bootverbose)
3736 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3737 
3738 		/*
3739 		 * Commit the requested MTU, after the synthetic parts
3740 		 * have been successfully attached.
3741 		 */
3742 		if (mtu >= ifr->ifr_mtu) {
3743 			mtu = ifr->ifr_mtu;
3744 		} else {
3745 			if_printf(ifp, "fixup mtu %d -> %u\n",
3746 			    ifr->ifr_mtu, mtu);
3747 		}
3748 		ifp->if_mtu = mtu;
3749 
3750 		/*
3751 		 * Synthetic parts' reattach may change the chimney
3752 		 * sending size; update it.
3753 		 */
3754 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3755 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3756 
3757 		/*
3758 		 * Make sure that various parameters based on MTU are
3759 		 * still valid, after the MTU change.
3760 		 */
3761 		hn_mtu_change_fixup(sc);
3762 
3763 		/*
3764 		 * All done!  Resume the interface now.
3765 		 */
3766 		hn_resume(sc);
3767 
3768 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3769 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3770 			/*
3771 			 * Since we have reattached the NVS part,
3772 			 * change the datapath to VF again, in case it
3773 			 * was lost when the NVS was detached.
3774 			 */
3775 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3776 		}
3777 
3778 		HN_UNLOCK(sc);
3779 		break;
3780 
3781 	case SIOCSIFFLAGS:
3782 		HN_LOCK(sc);
3783 
3784 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3785 			HN_UNLOCK(sc);
3786 			break;
3787 		}
3788 
3789 		if (hn_xpnt_vf_isready(sc))
3790 			hn_xpnt_vf_saveifflags(sc);
3791 
3792 		if (ifp->if_flags & IFF_UP) {
3793 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3794 				/*
3795 				 * Caller might hold a mutex, e.g.
3796 				 * bpf; use busy-wait for the RNDIS
3797 				 * reply.
3798 				 */
3799 				HN_NO_SLEEPING(sc);
3800 				hn_rxfilter_config(sc);
3801 				HN_SLEEPING_OK(sc);
3802 
3803 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3804 					error = hn_xpnt_vf_iocsetflags(sc);
3805 			} else {
3806 				hn_init_locked(sc);
3807 			}
3808 		} else {
3809 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3810 				hn_stop(sc, false);
3811 		}
3812 		sc->hn_if_flags = ifp->if_flags;
3813 
3814 		HN_UNLOCK(sc);
3815 		break;
3816 
3817 	case SIOCSIFCAP:
3818 		HN_LOCK(sc);
3819 
3820 		if (hn_xpnt_vf_isready(sc)) {
3821 			ifr_vf = *ifr;
3822 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3823 			    sizeof(ifr_vf.ifr_name));
3824 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3825 			HN_UNLOCK(sc);
3826 			break;
3827 		}
3828 
3829 		/*
3830 		 * Fix up requested capabilities w/ supported capabilities,
3831 		 * since the supported capabilities could have been changed.
3832 		 */
3833 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3834 		    ifp->if_capenable;
3835 
3836 		if (mask & IFCAP_TXCSUM) {
3837 			ifp->if_capenable ^= IFCAP_TXCSUM;
3838 			if (ifp->if_capenable & IFCAP_TXCSUM)
3839 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3840 			else
3841 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3842 		}
3843 		if (mask & IFCAP_TXCSUM_IPV6) {
3844 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3845 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3846 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3847 			else
3848 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3849 		}
3850 
3851 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3852 		if (mask & IFCAP_RXCSUM)
3853 			ifp->if_capenable ^= IFCAP_RXCSUM;
3854 #ifdef foo
3855 		/* We cannot tell IPv6 from IPv4 packets on the RX path. */
3856 		if (mask & IFCAP_RXCSUM_IPV6)
3857 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3858 #endif
3859 
3860 		if (mask & IFCAP_LRO)
3861 			ifp->if_capenable ^= IFCAP_LRO;
3862 
3863 		if (mask & IFCAP_TSO4) {
3864 			ifp->if_capenable ^= IFCAP_TSO4;
3865 			if (ifp->if_capenable & IFCAP_TSO4)
3866 				ifp->if_hwassist |= CSUM_IP_TSO;
3867 			else
3868 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3869 		}
3870 		if (mask & IFCAP_TSO6) {
3871 			ifp->if_capenable ^= IFCAP_TSO6;
3872 			if (ifp->if_capenable & IFCAP_TSO6)
3873 				ifp->if_hwassist |= CSUM_IP6_TSO;
3874 			else
3875 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3876 		}
3877 
3878 		HN_UNLOCK(sc);
3879 		break;
3880 
3881 	case SIOCADDMULTI:
3882 	case SIOCDELMULTI:
3883 		HN_LOCK(sc);
3884 
3885 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3886 			HN_UNLOCK(sc);
3887 			break;
3888 		}
3889 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3890 			/*
3891 			 * Multicast uses mutex; use busy-wait for
3892 			 * the RNDIS reply.
3893 			 */
3894 			HN_NO_SLEEPING(sc);
3895 			hn_rxfilter_config(sc);
3896 			HN_SLEEPING_OK(sc);
3897 		}
3898 
3899 		/* XXX vlan(4) style mcast addr maintenance */
3900 		if (hn_xpnt_vf_isready(sc)) {
3901 			int old_if_flags;
3902 
3903 			old_if_flags = sc->hn_vf_ifp->if_flags;
3904 			hn_xpnt_vf_saveifflags(sc);
3905 
3906 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3907 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3908 			     IFF_ALLMULTI))
3909 				error = hn_xpnt_vf_iocsetflags(sc);
3910 		}
3911 
3912 		HN_UNLOCK(sc);
3913 		break;
3914 
3915 	case SIOCSIFMEDIA:
3916 	case SIOCGIFMEDIA:
3917 		HN_LOCK(sc);
3918 		if (hn_xpnt_vf_isready(sc)) {
3919 			/*
3920 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3921 			 * create and pass ifr_vf to the VF here; just
3922 			 * replace the ifr_name.
3923 			 */
3924 			vf_ifp = sc->hn_vf_ifp;
3925 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3926 			    sizeof(ifr->ifr_name));
3927 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3928 			/* Restore the ifr_name. */
3929 			strlcpy(ifr->ifr_name, ifp->if_xname,
3930 			    sizeof(ifr->ifr_name));
3931 			HN_UNLOCK(sc);
3932 			break;
3933 		}
3934 		HN_UNLOCK(sc);
3935 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3936 		break;
3937 
3938 	case SIOCGIFRSSHASH:
3939 		ifrh = (struct ifrsshash *)data;
3940 		HN_LOCK(sc);
3941 		if (sc->hn_rx_ring_inuse == 1) {
3942 			HN_UNLOCK(sc);
3943 			ifrh->ifrh_func = RSS_FUNC_NONE;
3944 			ifrh->ifrh_types = 0;
3945 			break;
3946 		}
3947 
3948 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3949 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3950 		else
3951 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3952 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3953 		HN_UNLOCK(sc);
3954 		break;
3955 
3956 	case SIOCGIFRSSKEY:
3957 		ifrk = (struct ifrsskey *)data;
3958 		HN_LOCK(sc);
3959 		if (sc->hn_rx_ring_inuse == 1) {
3960 			HN_UNLOCK(sc);
3961 			ifrk->ifrk_func = RSS_FUNC_NONE;
3962 			ifrk->ifrk_keylen = 0;
3963 			break;
3964 		}
3965 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3966 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3967 		else
3968 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3969 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3970 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3971 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
3972 		HN_UNLOCK(sc);
3973 		break;
3974 
3975 	default:
3976 		error = ether_ioctl(ifp, cmd, data);
3977 		break;
3978 	}
3979 	return (error);
3980 }
3981 
3982 static void
3983 hn_stop(struct hn_softc *sc, bool detaching)
3984 {
3985 	struct ifnet *ifp = sc->hn_ifp;
3986 	int i;
3987 
3988 	HN_LOCK_ASSERT(sc);
3989 
3990 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3991 	    ("synthetic parts were not attached"));
3992 
3993 	/* Clear RUNNING bit ASAP. */
3994 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3995 
3996 	/* Disable polling. */
3997 	hn_polling(sc, 0);
3998 
3999 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4000 		KASSERT(sc->hn_vf_ifp != NULL,
4001 		    ("%s: VF is not attached", ifp->if_xname));
4002 
4003 		/* Mark transparent mode VF as disabled. */
4004 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4005 
4006 		/*
4007 		 * NOTE:
4008 		 * Datapath setting must happen _before_ bringing
4009 		 * the VF down.
4010 		 */
4011 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4012 
4013 		/*
4014 		 * Bring the VF down.
4015 		 */
4016 		hn_xpnt_vf_saveifflags(sc);
4017 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4018 		hn_xpnt_vf_iocsetflags(sc);
4019 	}
4020 
4021 	/* Suspend data transfers. */
4022 	hn_suspend_data(sc);
4023 
4024 	/* Clear OACTIVE bit. */
4025 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4026 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4027 		sc->hn_tx_ring[i].hn_oactive = 0;
4028 
4029 	/*
4030 	 * If the non-transparent mode VF is active, make sure
4031 	 * that the RX filter still allows packet reception.
4032 	 */
4033 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4034 		hn_rxfilter_config(sc);
4035 }
4036 
4037 static void
4038 hn_init_locked(struct hn_softc *sc)
4039 {
4040 	struct ifnet *ifp = sc->hn_ifp;
4041 	int i;
4042 
4043 	HN_LOCK_ASSERT(sc);
4044 
4045 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4046 		return;
4047 
4048 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4049 		return;
4050 
4051 	/* Configure RX filter */
4052 	hn_rxfilter_config(sc);
4053 
4054 	/* Clear OACTIVE bit. */
4055 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4056 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4057 		sc->hn_tx_ring[i].hn_oactive = 0;
4058 
4059 	/* Clear TX 'suspended' bit. */
4060 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4061 
4062 	if (hn_xpnt_vf_isready(sc)) {
4063 		/* Initialize transparent VF. */
4064 		hn_xpnt_vf_init(sc);
4065 	}
4066 
4067 	/* Everything is ready; unleash! */
4068 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4069 
4070 	/* Re-enable polling if requested. */
4071 	if (sc->hn_pollhz > 0)
4072 		hn_polling(sc, sc->hn_pollhz);
4073 }
4074 
4075 static void
4076 hn_init(void *xsc)
4077 {
4078 	struct hn_softc *sc = xsc;
4079 
4080 	HN_LOCK(sc);
4081 	hn_init_locked(sc);
4082 	HN_UNLOCK(sc);
4083 }
4084 
4085 #if __FreeBSD_version >= 1100099
4086 
4087 static int
4088 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4089 {
4090 	struct hn_softc *sc = arg1;
4091 	unsigned int lenlim;
4092 	int error;
4093 
4094 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4095 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4096 	if (error || req->newptr == NULL)
4097 		return error;
4098 
4099 	HN_LOCK(sc);
4100 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4101 	    lenlim > TCP_LRO_LENGTH_MAX) {
4102 		HN_UNLOCK(sc);
4103 		return EINVAL;
4104 	}
4105 	hn_set_lro_lenlim(sc, lenlim);
4106 	HN_UNLOCK(sc);
4107 
4108 	return 0;
4109 }
4110 
4111 static int
4112 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4113 {
4114 	struct hn_softc *sc = arg1;
4115 	int ackcnt, error, i;
4116 
4117 	/*
4118 	 * lro_ackcnt_lim is the append count limit;
4119 	 * +1 turns it into the aggregation limit.
4120 	 */
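	/*
	 * Illustrative example: writing 4 through this sysctl stores
	 * lro_ackcnt_lim = 3 on each RX ring, i.e. at most 4 ACKs are
	 * aggregated before an LRO flush.
	 */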
4121 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4122 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4123 	if (error || req->newptr == NULL)
4124 		return error;
4125 
4126 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4127 		return EINVAL;
4128 
4129 	/*
4130 	 * Convert aggregation limit back to append
4131 	 * count limit.
4132 	 */
4133 	--ackcnt;
4134 	HN_LOCK(sc);
4135 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4136 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4137 	HN_UNLOCK(sc);
4138 	return 0;
4139 }
4140 
4141 #endif
4142 
4143 static int
4144 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4145 {
4146 	struct hn_softc *sc = arg1;
4147 	int hcsum = arg2;
4148 	int on, error, i;
4149 
4150 	on = 0;
4151 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4152 		on = 1;
4153 
4154 	error = sysctl_handle_int(oidp, &on, 0, req);
4155 	if (error || req->newptr == NULL)
4156 		return error;
4157 
4158 	HN_LOCK(sc);
4159 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4160 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4161 
4162 		if (on)
4163 			rxr->hn_trust_hcsum |= hcsum;
4164 		else
4165 			rxr->hn_trust_hcsum &= ~hcsum;
4166 	}
4167 	HN_UNLOCK(sc);
4168 	return 0;
4169 }
4170 
4171 static int
4172 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4173 {
4174 	struct hn_softc *sc = arg1;
4175 	int chim_size, error;
4176 
4177 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4178 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4179 	if (error || req->newptr == NULL)
4180 		return error;
4181 
4182 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4183 		return EINVAL;
4184 
4185 	HN_LOCK(sc);
4186 	hn_set_chim_size(sc, chim_size);
4187 	HN_UNLOCK(sc);
4188 	return 0;
4189 }
4190 
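/*
 * The hn_{rx,tx}_stat_*_sysctl() handlers below aggregate one per-ring
 * statistic across all rings.  arg2 carries the byte offset of the field
 * within struct hn_rx_ring/struct hn_tx_ring, passed through __offsetof()
 * at registration time, as done below for "csum_ip":
 *
 *	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
 *	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 *	    __offsetof(struct hn_rx_ring, hn_csum_ip),
 *	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
 *
 * Reading such a sysctl returns the sum over all rings; writing any value
 * zeroes the per-ring counters.
 */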
4191 #if __FreeBSD_version < 1100095
4192 static int
4193 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4194 {
4195 	struct hn_softc *sc = arg1;
4196 	int ofs = arg2, i, error;
4197 	struct hn_rx_ring *rxr;
4198 	uint64_t stat;
4199 
4200 	stat = 0;
4201 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4202 		rxr = &sc->hn_rx_ring[i];
4203 		stat += *((int *)((uint8_t *)rxr + ofs));
4204 	}
4205 
4206 	error = sysctl_handle_64(oidp, &stat, 0, req);
4207 	if (error || req->newptr == NULL)
4208 		return error;
4209 
4210 	/* Zero out this stat. */
4211 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4212 		rxr = &sc->hn_rx_ring[i];
4213 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4214 	}
4215 	return 0;
4216 }
4217 #else
4218 static int
4219 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4220 {
4221 	struct hn_softc *sc = arg1;
4222 	int ofs = arg2, i, error;
4223 	struct hn_rx_ring *rxr;
4224 	uint64_t stat;
4225 
4226 	stat = 0;
4227 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4228 		rxr = &sc->hn_rx_ring[i];
4229 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4230 	}
4231 
4232 	error = sysctl_handle_64(oidp, &stat, 0, req);
4233 	if (error || req->newptr == NULL)
4234 		return error;
4235 
4236 	/* Zero out this stat. */
4237 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4238 		rxr = &sc->hn_rx_ring[i];
4239 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4240 	}
4241 	return 0;
4242 }
4243 
4244 #endif
4245 
4246 static int
4247 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4248 {
4249 	struct hn_softc *sc = arg1;
4250 	int ofs = arg2, i, error;
4251 	struct hn_rx_ring *rxr;
4252 	u_long stat;
4253 
4254 	stat = 0;
4255 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4256 		rxr = &sc->hn_rx_ring[i];
4257 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4258 	}
4259 
4260 	error = sysctl_handle_long(oidp, &stat, 0, req);
4261 	if (error || req->newptr == NULL)
4262 		return error;
4263 
4264 	/* Zero out this stat. */
4265 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4266 		rxr = &sc->hn_rx_ring[i];
4267 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4268 	}
4269 	return 0;
4270 }
4271 
4272 static int
4273 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4274 {
4275 	struct hn_softc *sc = arg1;
4276 	int ofs = arg2, i, error;
4277 	struct hn_tx_ring *txr;
4278 	u_long stat;
4279 
4280 	stat = 0;
4281 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4282 		txr = &sc->hn_tx_ring[i];
4283 		stat += *((u_long *)((uint8_t *)txr + ofs));
4284 	}
4285 
4286 	error = sysctl_handle_long(oidp, &stat, 0, req);
4287 	if (error || req->newptr == NULL)
4288 		return error;
4289 
4290 	/* Zero out this stat. */
4291 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4292 		txr = &sc->hn_tx_ring[i];
4293 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4294 	}
4295 	return 0;
4296 }
4297 
4298 static int
4299 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4300 {
4301 	struct hn_softc *sc = arg1;
4302 	int ofs = arg2, i, error, conf;
4303 	struct hn_tx_ring *txr;
4304 
4305 	txr = &sc->hn_tx_ring[0];
4306 	conf = *((int *)((uint8_t *)txr + ofs));
4307 
4308 	error = sysctl_handle_int(oidp, &conf, 0, req);
4309 	if (error || req->newptr == NULL)
4310 		return error;
4311 
4312 	HN_LOCK(sc);
4313 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4314 		txr = &sc->hn_tx_ring[i];
4315 		*((int *)((uint8_t *)txr + ofs)) = conf;
4316 	}
4317 	HN_UNLOCK(sc);
4318 
4319 	return 0;
4320 }
4321 
4322 static int
4323 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4324 {
4325 	struct hn_softc *sc = arg1;
4326 	int error, size;
4327 
4328 	size = sc->hn_agg_size;
4329 	error = sysctl_handle_int(oidp, &size, 0, req);
4330 	if (error || req->newptr == NULL)
4331 		return (error);
4332 
4333 	HN_LOCK(sc);
4334 	sc->hn_agg_size = size;
4335 	hn_set_txagg(sc);
4336 	HN_UNLOCK(sc);
4337 
4338 	return (0);
4339 }
4340 
4341 static int
4342 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4343 {
4344 	struct hn_softc *sc = arg1;
4345 	int error, pkts;
4346 
4347 	pkts = sc->hn_agg_pkts;
4348 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4349 	if (error || req->newptr == NULL)
4350 		return (error);
4351 
4352 	HN_LOCK(sc);
4353 	sc->hn_agg_pkts = pkts;
4354 	hn_set_txagg(sc);
4355 	HN_UNLOCK(sc);
4356 
4357 	return (0);
4358 }
4359 
4360 static int
4361 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4362 {
4363 	struct hn_softc *sc = arg1;
4364 	int pkts;
4365 
4366 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4367 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4368 }
4369 
4370 static int
4371 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4372 {
4373 	struct hn_softc *sc = arg1;
4374 	int align;
4375 
4376 	align = sc->hn_tx_ring[0].hn_agg_align;
4377 	return (sysctl_handle_int(oidp, &align, 0, req));
4378 }
4379 
4380 static void
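/*
 * Switch a single VMBus channel between interrupt-driven and polling
 * mode; pollhz == 0 restores interrupt-driven operation.  hn_polling()
 * below applies the same setting to the primary channel and all of its
 * sub-channels.
 */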
4381 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4382 {
4383 	if (pollhz == 0)
4384 		vmbus_chan_poll_disable(chan);
4385 	else
4386 		vmbus_chan_poll_enable(chan, pollhz);
4387 }
4388 
4389 static void
4390 hn_polling(struct hn_softc *sc, u_int pollhz)
4391 {
4392 	int nsubch = sc->hn_rx_ring_inuse - 1;
4393 
4394 	HN_LOCK_ASSERT(sc);
4395 
4396 	if (nsubch > 0) {
4397 		struct vmbus_channel **subch;
4398 		int i;
4399 
4400 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4401 		for (i = 0; i < nsubch; ++i)
4402 			hn_chan_polling(subch[i], pollhz);
4403 		vmbus_subchan_rel(subch, nsubch);
4404 	}
4405 	hn_chan_polling(sc->hn_prichan, pollhz);
4406 }
4407 
4408 static int
4409 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4410 {
4411 	struct hn_softc *sc = arg1;
4412 	int pollhz, error;
4413 
4414 	pollhz = sc->hn_pollhz;
4415 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4416 	if (error || req->newptr == NULL)
4417 		return (error);
4418 
4419 	if (pollhz != 0 &&
4420 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4421 		return (EINVAL);
4422 
4423 	HN_LOCK(sc);
4424 	if (sc->hn_pollhz != pollhz) {
4425 		sc->hn_pollhz = pollhz;
4426 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4427 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4428 			hn_polling(sc, sc->hn_pollhz);
4429 	}
4430 	HN_UNLOCK(sc);
4431 
4432 	return (0);
4433 }
4434 
4435 static int
4436 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4437 {
4438 	struct hn_softc *sc = arg1;
4439 	char verstr[16];
4440 
4441 	snprintf(verstr, sizeof(verstr), "%u.%u",
4442 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4443 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4444 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4445 }
4446 
4447 static int
4448 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4449 {
4450 	struct hn_softc *sc = arg1;
4451 	char caps_str[128];
4452 	uint32_t caps;
4453 
4454 	HN_LOCK(sc);
4455 	caps = sc->hn_caps;
4456 	HN_UNLOCK(sc);
4457 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4458 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4459 }
4460 
4461 static int
4462 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4463 {
4464 	struct hn_softc *sc = arg1;
4465 	char assist_str[128];
4466 	uint32_t hwassist;
4467 
4468 	HN_LOCK(sc);
4469 	hwassist = sc->hn_ifp->if_hwassist;
4470 	HN_UNLOCK(sc);
4471 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4472 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4473 }
4474 
4475 static int
4476 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4477 {
4478 	struct hn_softc *sc = arg1;
4479 	char filter_str[128];
4480 	uint32_t filter;
4481 
4482 	HN_LOCK(sc);
4483 	filter = sc->hn_rx_filter;
4484 	HN_UNLOCK(sc);
4485 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4486 	    NDIS_PACKET_TYPES);
4487 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4488 }
4489 
4490 #ifndef RSS
4491 
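/*
 * Export/import the raw Toeplitz RSS key.  A new key is rejected while
 * a VF is attached, since the key is kept in sync with the VF's, and a
 * write only triggers an RSS reconfiguration when more than one RX ring
 * is in use; otherwise the key is merely saved.
 */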
4492 static int
4493 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 	struct hn_softc *sc = arg1;
4496 	int error;
4497 
4498 	HN_LOCK(sc);
4499 
4500 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4501 	if (error || req->newptr == NULL)
4502 		goto back;
4503 
4504 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4505 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4506 		/*
4507 		 * The RSS key is synchronized with the VF's; don't allow
4508 		 * users to change it.
4509 		 */
4510 		error = EBUSY;
4511 		goto back;
4512 	}
4513 
4514 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4515 	if (error)
4516 		goto back;
4517 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4518 
4519 	if (sc->hn_rx_ring_inuse > 1) {
4520 		error = hn_rss_reconfig(sc);
4521 	} else {
4522 		/* Not RSS capable, at least for now; just save the RSS key. */
4523 		error = 0;
4524 	}
4525 back:
4526 	HN_UNLOCK(sc);
4527 	return (error);
4528 }
4529 
4530 static int
4531 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4532 {
4533 	struct hn_softc *sc = arg1;
4534 	int error;
4535 
4536 	HN_LOCK(sc);
4537 
4538 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4539 	if (error || req->newptr == NULL)
4540 		goto back;
4541 
4542 	/*
4543 	 * Don't allow the RSS indirection table to be changed if this
4544 	 * interface is not currently RSS capable.
4545 	 */
4546 	if (sc->hn_rx_ring_inuse == 1) {
4547 		error = EOPNOTSUPP;
4548 		goto back;
4549 	}
4550 
4551 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4552 	if (error)
4553 		goto back;
4554 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4555 
4556 	hn_rss_ind_fixup(sc);
4557 	error = hn_rss_reconfig(sc);
4558 back:
4559 	HN_UNLOCK(sc);
4560 	return (error);
4561 }
4562 
4563 #endif	/* !RSS */
4564 
4565 static int
4566 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4567 {
4568 	struct hn_softc *sc = arg1;
4569 	char hash_str[128];
4570 	uint32_t hash;
4571 
4572 	HN_LOCK(sc);
4573 	hash = sc->hn_rss_hash;
4574 	HN_UNLOCK(sc);
4575 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4576 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4577 }
4578 
4579 static int
4580 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4581 {
4582 	struct hn_softc *sc = arg1;
4583 	char hash_str[128];
4584 	uint32_t hash;
4585 
4586 	HN_LOCK(sc);
4587 	hash = sc->hn_rss_hcap;
4588 	HN_UNLOCK(sc);
4589 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4590 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4591 }
4592 
4593 static int
4594 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4595 {
4596 	struct hn_softc *sc = arg1;
4597 	char hash_str[128];
4598 	uint32_t hash;
4599 
4600 	HN_LOCK(sc);
4601 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4602 	HN_UNLOCK(sc);
4603 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4604 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4605 }
4606 
4607 static int
4608 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4609 {
4610 	struct hn_softc *sc = arg1;
4611 	char vf_name[IFNAMSIZ + 1];
4612 	struct ifnet *vf_ifp;
4613 
4614 	HN_LOCK(sc);
4615 	vf_name[0] = '\0';
4616 	vf_ifp = sc->hn_vf_ifp;
4617 	if (vf_ifp != NULL)
4618 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4619 	HN_UNLOCK(sc);
4620 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4621 }
4622 
4623 static int
4624 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4625 {
4626 	struct hn_softc *sc = arg1;
4627 	char vf_name[IFNAMSIZ + 1];
4628 	struct ifnet *vf_ifp;
4629 
4630 	HN_LOCK(sc);
4631 	vf_name[0] = '\0';
4632 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4633 	if (vf_ifp != NULL)
4634 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4635 	HN_UNLOCK(sc);
4636 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4637 }
4638 
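/*
 * Report the ifnet names of all VFs currently mapped to an hn(4)
 * interface as a space-separated list; hn_vfmap_sysctl() below reports
 * the "VF:hn" name pairs instead.
 */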
4639 static int
4640 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4641 {
4642 	struct rm_priotracker pt;
4643 	struct sbuf *sb;
4644 	int error, i;
4645 	bool first;
4646 
4647 	error = sysctl_wire_old_buffer(req, 0);
4648 	if (error != 0)
4649 		return (error);
4650 
4651 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4652 	if (sb == NULL)
4653 		return (ENOMEM);
4654 
4655 	rm_rlock(&hn_vfmap_lock, &pt);
4656 
4657 	first = true;
4658 	for (i = 0; i < hn_vfmap_size; ++i) {
4659 		struct ifnet *ifp;
4660 
4661 		if (hn_vfmap[i] == NULL)
4662 			continue;
4663 
4664 		ifp = ifnet_byindex(i);
4665 		if (ifp != NULL) {
4666 			if (first)
4667 				sbuf_printf(sb, "%s", ifp->if_xname);
4668 			else
4669 				sbuf_printf(sb, " %s", ifp->if_xname);
4670 			first = false;
4671 		}
4672 	}
4673 
4674 	rm_runlock(&hn_vfmap_lock, &pt);
4675 
4676 	error = sbuf_finish(sb);
4677 	sbuf_delete(sb);
4678 	return (error);
4679 }
4680 
4681 static int
4682 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4683 {
4684 	struct rm_priotracker pt;
4685 	struct sbuf *sb;
4686 	int error, i;
4687 	bool first;
4688 
4689 	error = sysctl_wire_old_buffer(req, 0);
4690 	if (error != 0)
4691 		return (error);
4692 
4693 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4694 	if (sb == NULL)
4695 		return (ENOMEM);
4696 
4697 	rm_rlock(&hn_vfmap_lock, &pt);
4698 
4699 	first = true;
4700 	for (i = 0; i < hn_vfmap_size; ++i) {
4701 		struct ifnet *ifp, *hn_ifp;
4702 
4703 		hn_ifp = hn_vfmap[i];
4704 		if (hn_ifp == NULL)
4705 			continue;
4706 
4707 		ifp = ifnet_byindex(i);
4708 		if (ifp != NULL) {
4709 			if (first) {
4710 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4711 				    hn_ifp->if_xname);
4712 			} else {
4713 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4714 				    hn_ifp->if_xname);
4715 			}
4716 			first = false;
4717 		}
4718 	}
4719 
4720 	rm_runlock(&hn_vfmap_lock, &pt);
4721 
4722 	error = sbuf_finish(sb);
4723 	sbuf_delete(sb);
4724 	return (error);
4725 }
4726 
4727 static int
4728 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4729 {
4730 	struct hn_softc *sc = arg1;
4731 	int error, onoff = 0;
4732 
4733 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4734 		onoff = 1;
4735 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4736 	if (error || req->newptr == NULL)
4737 		return (error);
4738 
4739 	HN_LOCK(sc);
4740 	/* NOTE: hn_vf_lock for hn_transmit() */
4741 	rm_wlock(&sc->hn_vf_lock);
4742 	if (onoff)
4743 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4744 	else
4745 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4746 	rm_wunlock(&sc->hn_vf_lock);
4747 	HN_UNLOCK(sc);
4748 
4749 	return (0);
4750 }
4751 
4752 static int
4753 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4754 {
4755 	struct hn_softc *sc = arg1;
4756 	int enabled = 0;
4757 
4758 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4759 		enabled = 1;
4760 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4761 }
4762 
4763 static int
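/*
 * Validate the IPv4 header that starts at byte offset 'hoff' of the mbuf
 * and return its protocol (e.g. IPPROTO_TCP or IPPROTO_UDP), or
 * IPPROTO_DONE when the frame is truncated, fragmented, or its TCP/UDP
 * header does not fit entirely in the first mbuf.  The RX path uses the
 * result to decide whether the host-verified checksum may be trusted and
 * whether the packet is an LRO candidate.
 */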
4764 hn_check_iplen(const struct mbuf *m, int hoff)
4765 {
4766 	const struct ip *ip;
4767 	int len, iphlen, iplen;
4768 	const struct tcphdr *th;
4769 	int thoff;				/* TCP data offset */
4770 
4771 	len = hoff + sizeof(struct ip);
4772 
4773 	/* The packet must be at least the size of an IP header. */
4774 	if (m->m_pkthdr.len < len)
4775 		return IPPROTO_DONE;
4776 
4777 	/* The fixed IP header must reside completely in the first mbuf. */
4778 	if (m->m_len < len)
4779 		return IPPROTO_DONE;
4780 
4781 	ip = mtodo(m, hoff);
4782 
4783 	/* Bound check the packet's stated IP header length. */
4784 	iphlen = ip->ip_hl << 2;
4785 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4786 		return IPPROTO_DONE;
4787 
4788 	/* The full IP header must reside completely in the one mbuf. */
4789 	if (m->m_len < hoff + iphlen)
4790 		return IPPROTO_DONE;
4791 
4792 	iplen = ntohs(ip->ip_len);
4793 
4794 	/*
4795 	 * Check that the amount of data in the buffers is at
4796 	 * least as much as the IP header would have us expect.
4797 	 */
4798 	if (m->m_pkthdr.len < hoff + iplen)
4799 		return IPPROTO_DONE;
4800 
4801 	/*
4802 	 * Ignore IP fragments.
4803 	 */
4804 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4805 		return IPPROTO_DONE;
4806 
4807 	/*
4808 	 * The TCP/IP or UDP/IP header must be entirely contained within
4809 	 * the first fragment of a packet.
4810 	 */
4811 	switch (ip->ip_p) {
4812 	case IPPROTO_TCP:
4813 		if (iplen < iphlen + sizeof(struct tcphdr))
4814 			return IPPROTO_DONE;
4815 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4816 			return IPPROTO_DONE;
4817 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4818 		thoff = th->th_off << 2;
4819 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4820 			return IPPROTO_DONE;
4821 		if (m->m_len < hoff + iphlen + thoff)
4822 			return IPPROTO_DONE;
4823 		break;
4824 	case IPPROTO_UDP:
4825 		if (iplen < iphlen + sizeof(struct udphdr))
4826 			return IPPROTO_DONE;
4827 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4828 			return IPPROTO_DONE;
4829 		break;
4830 	default:
4831 		if (iplen < iphlen)
4832 			return IPPROTO_DONE;
4833 		break;
4834 	}
4835 	return ip->ip_p;
4836 }
4837 
4838 static int
4839 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4840 {
4841 	struct sysctl_oid_list *child;
4842 	struct sysctl_ctx_list *ctx;
4843 	device_t dev = sc->hn_dev;
4844 #if defined(INET) || defined(INET6)
4845 #if __FreeBSD_version >= 1100095
4846 	int lroent_cnt;
4847 #endif
4848 #endif
4849 	int i;
4850 
4851 	/*
4852 	 * Create RXBUF for reception.
4853 	 *
4854 	 * NOTE:
4855 	 * - It is shared by all channels.
4856 	 * - A large enough buffer is allocated; certain versions of the NVS
4857 	 *   may further limit the usable space.
4858 	 */
4859 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4860 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4861 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4862 	if (sc->hn_rxbuf == NULL) {
4863 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4864 		return (ENOMEM);
4865 	}
4866 
4867 	sc->hn_rx_ring_cnt = ring_cnt;
4868 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4869 
4870 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4871 	    M_DEVBUF, M_WAITOK | M_ZERO);
4872 
4873 #if defined(INET) || defined(INET6)
4874 #if __FreeBSD_version >= 1100095
4875 	lroent_cnt = hn_lro_entry_count;
4876 	if (lroent_cnt < TCP_LRO_ENTRIES)
4877 		lroent_cnt = TCP_LRO_ENTRIES;
4878 	if (bootverbose)
4879 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4880 #endif
4881 #endif	/* INET || INET6 */
4882 
4883 	ctx = device_get_sysctl_ctx(dev);
4884 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4885 
4886 	/* Create dev.hn.UNIT.rx sysctl tree */
4887 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4888 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4889 
4890 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4891 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4892 
4893 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4894 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4895 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4896 		if (rxr->hn_br == NULL) {
4897 			device_printf(dev, "allocate bufring failed\n");
4898 			return (ENOMEM);
4899 		}
4900 
4901 		if (hn_trust_hosttcp)
4902 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4903 		if (hn_trust_hostudp)
4904 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4905 		if (hn_trust_hostip)
4906 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4907 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4908 		rxr->hn_ifp = sc->hn_ifp;
4909 		if (i < sc->hn_tx_ring_cnt)
4910 			rxr->hn_txr = &sc->hn_tx_ring[i];
4911 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4912 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4913 		rxr->hn_rx_idx = i;
4914 		rxr->hn_rxbuf = sc->hn_rxbuf;
4915 
4916 		/*
4917 		 * Initialize LRO.
4918 		 */
4919 #if defined(INET) || defined(INET6)
4920 #if __FreeBSD_version >= 1100095
4921 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4922 		    hn_lro_mbufq_depth);
4923 #else
4924 		tcp_lro_init(&rxr->hn_lro);
4925 		rxr->hn_lro.ifp = sc->hn_ifp;
4926 #endif
4927 #if __FreeBSD_version >= 1100099
4928 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4929 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4930 #endif
4931 #endif	/* INET || INET6 */
4932 
4933 		if (sc->hn_rx_sysctl_tree != NULL) {
4934 			char name[16];
4935 
4936 			/*
4937 			 * Create per RX ring sysctl tree:
4938 			 * dev.hn.UNIT.rx.RINGID
4939 			 */
4940 			snprintf(name, sizeof(name), "%d", i);
4941 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4942 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4943 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4944 
4945 			if (rxr->hn_rx_sysctl_tree != NULL) {
4946 				SYSCTL_ADD_ULONG(ctx,
4947 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4948 				    OID_AUTO, "packets", CTLFLAG_RW,
4949 				    &rxr->hn_pkts, "# of packets received");
4950 				SYSCTL_ADD_ULONG(ctx,
4951 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4952 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
4953 				    &rxr->hn_rss_pkts,
4954 				    "# of packets w/ RSS info received");
4955 				SYSCTL_ADD_INT(ctx,
4956 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4957 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4958 				    &rxr->hn_pktbuf_len, 0,
4959 				    "Temporary channel packet buffer length");
4960 			}
4961 		}
4962 	}
4963 
4964 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4965 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4966 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4967 #if __FreeBSD_version < 1100095
4968 	    hn_rx_stat_int_sysctl,
4969 #else
4970 	    hn_rx_stat_u64_sysctl,
4971 #endif
4972 	    "LU", "LRO queued");
4973 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4974 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4975 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4976 #if __FreeBSD_version < 1100095
4977 	    hn_rx_stat_int_sysctl,
4978 #else
4979 	    hn_rx_stat_u64_sysctl,
4980 #endif
4981 	    "LU", "LRO flushed");
4982 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4983 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4984 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
4985 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4986 #if __FreeBSD_version >= 1100099
4987 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4988 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4989 	    hn_lro_lenlim_sysctl, "IU",
4990 	    "Max # of data bytes to be aggregated by LRO");
4991 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4992 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4993 	    hn_lro_ackcnt_sysctl, "I",
4994 	    "Max # of ACKs to be aggregated by LRO");
4995 #endif
4996 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4997 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4998 	    hn_trust_hcsum_sysctl, "I",
4999 	    "Trust tcp segement verification on host side, "
5000 	    "when csum info is missing");
5001 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5002 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5003 	    hn_trust_hcsum_sysctl, "I",
5004 	    "Trust udp datagram verification on host side, "
5005 	    "when csum info is missing");
5006 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5007 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5008 	    hn_trust_hcsum_sysctl, "I",
5009 	    "Trust ip packet verification on host side, "
5010 	    "when csum info is missing");
5011 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5012 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5013 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5014 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5015 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5016 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5017 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5018 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5019 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5020 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5021 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5022 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5023 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5024 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5025 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5026 	    hn_rx_stat_ulong_sysctl, "LU",
5027 	    "# of packets that we trust host's csum verification");
5028 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5029 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5030 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5031 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5032 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5033 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5034 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5035 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5036 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5037 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5038 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5039 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5040 
5041 	return (0);
5042 }
5043 
5044 static void
5045 hn_destroy_rx_data(struct hn_softc *sc)
5046 {
5047 	int i;
5048 
5049 	if (sc->hn_rxbuf != NULL) {
5050 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5051 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5052 		else
5053 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5054 		sc->hn_rxbuf = NULL;
5055 	}
5056 
5057 	if (sc->hn_rx_ring_cnt == 0)
5058 		return;
5059 
5060 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5061 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5062 
5063 		if (rxr->hn_br == NULL)
5064 			continue;
5065 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5066 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5067 		} else {
5068 			device_printf(sc->hn_dev,
5069 			    "%dth channel bufring is referenced", i);
5070 		}
5071 		rxr->hn_br = NULL;
5072 
5073 #if defined(INET) || defined(INET6)
5074 		tcp_lro_free(&rxr->hn_lro);
5075 #endif
5076 		free(rxr->hn_pktbuf, M_DEVBUF);
5077 	}
5078 	free(sc->hn_rx_ring, M_DEVBUF);
5079 	sc->hn_rx_ring = NULL;
5080 
5081 	sc->hn_rx_ring_cnt = 0;
5082 	sc->hn_rx_ring_inuse = 0;
5083 }
5084 
5085 static int
5086 hn_tx_ring_create(struct hn_softc *sc, int id)
5087 {
5088 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5089 	device_t dev = sc->hn_dev;
5090 	bus_dma_tag_t parent_dtag;
5091 	int error, i;
5092 
5093 	txr->hn_sc = sc;
5094 	txr->hn_tx_idx = id;
5095 
5096 #ifndef HN_USE_TXDESC_BUFRING
5097 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5098 #endif
5099 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5100 
5101 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5102 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5103 	    M_DEVBUF, M_WAITOK | M_ZERO);
5104 #ifndef HN_USE_TXDESC_BUFRING
5105 	SLIST_INIT(&txr->hn_txlist);
5106 #else
5107 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5108 	    M_WAITOK, &txr->hn_tx_lock);
5109 #endif
5110 
5111 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5112 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5113 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5114 	} else {
5115 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5116 	}
5117 
5118 #ifdef HN_IFSTART_SUPPORT
5119 	if (hn_use_if_start) {
5120 		txr->hn_txeof = hn_start_txeof;
5121 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5122 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5123 	} else
5124 #endif
5125 	{
5126 		int br_depth;
5127 
5128 		txr->hn_txeof = hn_xmit_txeof;
5129 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5130 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5131 
5132 		br_depth = hn_get_txswq_depth(txr);
5133 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5134 		    M_WAITOK, &txr->hn_tx_lock);
5135 	}
5136 
5137 	txr->hn_direct_tx_size = hn_direct_tx_size;
5138 
5139 	/*
5140 	 * Always schedule transmission instead of trying to do direct
5141 	 * transmission.  This one gives the best performance so far.
5142 	 */
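	/*
	 * NOTE: hn_sched_tx can still be flipped at runtime through the
	 * dev.hn.UNIT.sched_tx sysctl registered in hn_create_tx_data().
	 */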
5143 	txr->hn_sched_tx = 1;
5144 
5145 	parent_dtag = bus_get_dma_tag(dev);
5146 
5147 	/* DMA tag for RNDIS packet messages. */
5148 	error = bus_dma_tag_create(parent_dtag, /* parent */
5149 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5150 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5151 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5152 	    BUS_SPACE_MAXADDR,		/* highaddr */
5153 	    NULL, NULL,			/* filter, filterarg */
5154 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5155 	    1,				/* nsegments */
5156 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5157 	    0,				/* flags */
5158 	    NULL,			/* lockfunc */
5159 	    NULL,			/* lockfuncarg */
5160 	    &txr->hn_tx_rndis_dtag);
5161 	if (error) {
5162 		device_printf(dev, "failed to create rndis dmatag\n");
5163 		return error;
5164 	}
5165 
5166 	/* DMA tag for data. */
5167 	error = bus_dma_tag_create(parent_dtag, /* parent */
5168 	    1,				/* alignment */
5169 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5170 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5171 	    BUS_SPACE_MAXADDR,		/* highaddr */
5172 	    NULL, NULL,			/* filter, filterarg */
5173 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5174 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5175 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5176 	    0,				/* flags */
5177 	    NULL,			/* lockfunc */
5178 	    NULL,			/* lockfuncarg */
5179 	    &txr->hn_tx_data_dtag);
5180 	if (error) {
5181 		device_printf(dev, "failed to create data dmatag\n");
5182 		return error;
5183 	}
5184 
5185 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5186 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5187 
5188 		txd->txr = txr;
5189 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5190 		STAILQ_INIT(&txd->agg_list);
5191 
5192 		/*
5193 		 * Allocate and load RNDIS packet message.
5194 		 */
5195 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5196 		    (void **)&txd->rndis_pkt,
5197 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5198 		    &txd->rndis_pkt_dmap);
5199 		if (error) {
5200 			device_printf(dev,
5201 			    "failed to allocate rndis_packet_msg, %d\n", i);
5202 			return error;
5203 		}
5204 
5205 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5206 		    txd->rndis_pkt_dmap,
5207 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5208 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5209 		    BUS_DMA_NOWAIT);
5210 		if (error) {
5211 			device_printf(dev,
5212 			    "failed to load rndis_packet_msg, %d\n", i);
5213 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5214 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5215 			return error;
5216 		}
5217 
5218 		/* DMA map for TX data. */
5219 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5220 		    &txd->data_dmap);
5221 		if (error) {
5222 			device_printf(dev,
5223 			    "failed to allocate tx data dmamap\n");
5224 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5225 			    txd->rndis_pkt_dmap);
5226 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5227 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5228 			return error;
5229 		}
5230 
5231 		/* All set, put it to list */
5232 		txd->flags |= HN_TXD_FLAG_ONLIST;
5233 #ifndef HN_USE_TXDESC_BUFRING
5234 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5235 #else
5236 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5237 #endif
5238 	}
5239 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5240 
5241 	if (sc->hn_tx_sysctl_tree != NULL) {
5242 		struct sysctl_oid_list *child;
5243 		struct sysctl_ctx_list *ctx;
5244 		char name[16];
5245 
5246 		/*
5247 		 * Create per TX ring sysctl tree:
5248 		 * dev.hn.UNIT.tx.RINGID
5249 		 */
5250 		ctx = device_get_sysctl_ctx(dev);
5251 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5252 
5253 		snprintf(name, sizeof(name), "%d", id);
5254 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5255 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5256 
5257 		if (txr->hn_tx_sysctl_tree != NULL) {
5258 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5259 
5260 #ifdef HN_DEBUG
5261 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5262 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5263 			    "# of available TX descs");
5264 #endif
5265 #ifdef HN_IFSTART_SUPPORT
5266 			if (!hn_use_if_start)
5267 #endif
5268 			{
5269 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5270 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5271 				    "over active");
5272 			}
5273 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5274 			    CTLFLAG_RW, &txr->hn_pkts,
5275 			    "# of packets transmitted");
5276 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5277 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5278 		}
5279 	}
5280 
5281 	return 0;
5282 }
5283 
5284 static void
5285 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5286 {
5287 	struct hn_tx_ring *txr = txd->txr;
5288 
5289 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5290 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5291 
5292 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5293 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5294 	    txd->rndis_pkt_dmap);
5295 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5296 }
5297 
5298 static void
5299 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5300 {
5301 
5302 	KASSERT(txd->refs == 0 || txd->refs == 1,
5303 	    ("invalid txd refs %d", txd->refs));
5304 
5305 	/* Aggregated txds will be freed by their aggregating txd. */
5306 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5307 		int freed;
5308 
5309 		freed = hn_txdesc_put(txr, txd);
5310 		KASSERT(freed, ("can't free txdesc"));
5311 	}
5312 }
5313 
5314 static void
5315 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5316 {
5317 	int i;
5318 
5319 	if (txr->hn_txdesc == NULL)
5320 		return;
5321 
5322 	/*
5323 	 * NOTE:
5324 	 * Because the freeing of aggregated txds will be deferred
5325 	 * to the aggregating txd, two passes are used here:
5326 	 * - The first pass GCes any pending txds.  This GC is necessary,
5327 	 *   since if the channels are revoked, hypervisor will not
5328 	 *   deliver send-done for all pending txds.
5329 	 * - The second pass frees the busdma resources, i.e. after all
5330 	 *   txds have been freed.
5331 	 */
5332 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5333 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5334 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5335 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5336 
5337 	if (txr->hn_tx_data_dtag != NULL)
5338 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5339 	if (txr->hn_tx_rndis_dtag != NULL)
5340 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5341 
5342 #ifdef HN_USE_TXDESC_BUFRING
5343 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5344 #endif
5345 
5346 	free(txr->hn_txdesc, M_DEVBUF);
5347 	txr->hn_txdesc = NULL;
5348 
5349 	if (txr->hn_mbuf_br != NULL)
5350 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5351 
5352 #ifndef HN_USE_TXDESC_BUFRING
5353 	mtx_destroy(&txr->hn_txlist_spin);
5354 #endif
5355 	mtx_destroy(&txr->hn_tx_lock);
5356 }
5357 
5358 static int
5359 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5360 {
5361 	struct sysctl_oid_list *child;
5362 	struct sysctl_ctx_list *ctx;
5363 	int i;
5364 
5365 	/*
5366 	 * Create TXBUF for chimney sending.
5367 	 *
5368 	 * NOTE: It is shared by all channels.
5369 	 */
5370 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5371 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5372 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5373 	if (sc->hn_chim == NULL) {
5374 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5375 		return (ENOMEM);
5376 	}
5377 
5378 	sc->hn_tx_ring_cnt = ring_cnt;
5379 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5380 
5381 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5382 	    M_DEVBUF, M_WAITOK | M_ZERO);
5383 
5384 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5385 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5386 
5387 	/* Create dev.hn.UNIT.tx sysctl tree */
5388 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5389 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5390 
5391 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5392 		int error;
5393 
5394 		error = hn_tx_ring_create(sc, i);
5395 		if (error)
5396 			return error;
5397 	}
5398 
5399 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5400 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5401 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5402 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5403 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5404 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5405 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5406 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5407 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5408 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5409 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5410 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5411 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5412 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5413 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5414 	    hn_tx_stat_ulong_sysctl, "LU",
5415 	    "# of packet transmission aggregation flush failure");
5416 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5417 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5418 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5419 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5420 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5421 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5422 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5423 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5424 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5425 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5426 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5427 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5428 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5429 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5430 	    "# of total TX descs");
5431 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5432 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5433 	    "Chimney send packet size upper boundary");
5434 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5435 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5436 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5437 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5438 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5439 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5440 	    hn_tx_conf_int_sysctl, "I",
5441 	    "Size of the packet for direct transmission");
5442 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5443 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5444 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5445 	    hn_tx_conf_int_sysctl, "I",
5446 	    "Always schedule transmission "
5447 	    "instead of doing direct transmission");
5448 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5449 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5450 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5451 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5452 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5453 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5454 	    "Applied packet transmission aggregation size");
5455 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5456 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5457 	    hn_txagg_pktmax_sysctl, "I",
5458 	    "Applied packet transmission aggregation packets");
5459 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5460 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5461 	    hn_txagg_align_sysctl, "I",
5462 	    "Applied packet transmission aggregation alignment");
5463 
5464 	return 0;
5465 }
5466 
5467 static void
5468 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5469 {
5470 	int i;
5471 
5472 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5473 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5474 }
5475 
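/*
 * Derive the interface's hardware TSO size limit from the NDIS reported
 * bounds: clamp the requested length to at least sgmin * MTU and at most
 * IP_MAXPACKET and the NDIS szmax, subtract the Ethernet/VLAN header, and
 * honor the VF's limit when the transparent VF is ready.
 */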
5476 static void
5477 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5478 {
5479 	struct ifnet *ifp = sc->hn_ifp;
5480 	u_int hw_tsomax;
5481 	int tso_minlen;
5482 
5483 	HN_LOCK_ASSERT(sc);
5484 
5485 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5486 		return;
5487 
5488 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5489 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5490 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5491 
5492 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5493 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5494 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5495 
5496 	if (tso_maxlen < tso_minlen)
5497 		tso_maxlen = tso_minlen;
5498 	else if (tso_maxlen > IP_MAXPACKET)
5499 		tso_maxlen = IP_MAXPACKET;
5500 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5501 		tso_maxlen = sc->hn_ndis_tso_szmax;
5502 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5503 
5504 	if (hn_xpnt_vf_isready(sc)) {
5505 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5506 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5507 	}
5508 	ifp->if_hw_tsomax = hw_tsomax;
5509 	if (bootverbose)
5510 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5511 }
5512 
5513 static void
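/*
 * Propagate negotiated TX parameters to all TX rings: the chimney (send
 * buffer) size, the checksum offload assists derived from the device
 * capabilities, and the HASHVAL pktinfo flag when hash values are
 * supported on the TX path.
 */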
5514 hn_fixup_tx_data(struct hn_softc *sc)
5515 {
5516 	uint64_t csum_assist;
5517 	int i;
5518 
5519 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5520 	if (hn_tx_chimney_size > 0 &&
5521 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5522 		hn_set_chim_size(sc, hn_tx_chimney_size);
5523 
5524 	csum_assist = 0;
5525 	if (sc->hn_caps & HN_CAP_IPCS)
5526 		csum_assist |= CSUM_IP;
5527 	if (sc->hn_caps & HN_CAP_TCP4CS)
5528 		csum_assist |= CSUM_IP_TCP;
5529 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5530 		csum_assist |= CSUM_IP_UDP;
5531 	if (sc->hn_caps & HN_CAP_TCP6CS)
5532 		csum_assist |= CSUM_IP6_TCP;
5533 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5534 		csum_assist |= CSUM_IP6_UDP;
5535 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5536 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5537 
5538 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5539 		/*
5540 		 * Support HASHVAL pktinfo on TX path.
5541 		 */
5542 		if (bootverbose)
5543 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5544 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5545 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5546 	}
5547 }
5548 
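/*
 * Release TX resources: free the chimney sending buffer, unless it is
 * still referenced by the hypervisor, then destroy and free all TX rings.
 */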
5549 static void
5550 hn_destroy_tx_data(struct hn_softc *sc)
5551 {
5552 	int i;
5553 
5554 	if (sc->hn_chim != NULL) {
5555 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5556 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5557 		} else {
5558 			device_printf(sc->hn_dev,
5559 			    "chimney sending buffer is referenced");
5560 		}
5561 		sc->hn_chim = NULL;
5562 	}
5563 
5564 	if (sc->hn_tx_ring_cnt == 0)
5565 		return;
5566 
5567 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5568 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5569 
5570 	free(sc->hn_tx_ring, M_DEVBUF);
5571 	sc->hn_tx_ring = NULL;
5572 
5573 	sc->hn_tx_ring_cnt = 0;
5574 	sc->hn_tx_ring_inuse = 0;
5575 }
5576 
5577 #ifdef HN_IFSTART_SUPPORT
5578 
5579 static void
5580 hn_start_taskfunc(void *xtxr, int pending __unused)
5581 {
5582 	struct hn_tx_ring *txr = xtxr;
5583 
5584 	mtx_lock(&txr->hn_tx_lock);
5585 	hn_start_locked(txr, 0);
5586 	mtx_unlock(&txr->hn_tx_lock);
5587 }
5588 
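/*
 * if_start TX path, used on the first TX ring only.  Dequeue packets from
 * if_snd, fix up TSO/checksum headers, encapsulate and transmit them,
 * possibly aggregated.  Returns non-zero when a packet larger than 'len'
 * is encountered, i.e. the remaining work should be dispatched to the TX
 * taskqueue.
 */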
5589 static int
5590 hn_start_locked(struct hn_tx_ring *txr, int len)
5591 {
5592 	struct hn_softc *sc = txr->hn_sc;
5593 	struct ifnet *ifp = sc->hn_ifp;
5594 	int sched = 0;
5595 
5596 	KASSERT(hn_use_if_start,
5597 	    ("hn_start_locked is called when if_start is disabled"));
5598 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5599 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5600 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5601 
5602 	if (__predict_false(txr->hn_suspended))
5603 		return (0);
5604 
5605 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5606 	    IFF_DRV_RUNNING)
5607 		return (0);
5608 
5609 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5610 		struct hn_txdesc *txd;
5611 		struct mbuf *m_head;
5612 		int error;
5613 
5614 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5615 		if (m_head == NULL)
5616 			break;
5617 
5618 		if (len > 0 && m_head->m_pkthdr.len > len) {
5619 			/*
5620 			 * This send could be time-consuming; let callers
5621 			 * dispatch this packet (and any follow-up packets)
5622 			 * to the TX taskqueue.
5623 			 */
5624 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5625 			sched = 1;
5626 			break;
5627 		}
5628 
5629 #if defined(INET6) || defined(INET)
5630 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5631 			m_head = hn_tso_fixup(m_head);
5632 			if (__predict_false(m_head == NULL)) {
5633 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5634 				continue;
5635 			}
5636 		} else if (m_head->m_pkthdr.csum_flags &
5637 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5638 			m_head = hn_set_hlen(m_head);
5639 			if (__predict_false(m_head == NULL)) {
5640 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5641 				continue;
5642 			}
5643 		}
5644 #endif
5645 
5646 		txd = hn_txdesc_get(txr);
5647 		if (txd == NULL) {
5648 			txr->hn_no_txdescs++;
5649 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5650 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5651 			break;
5652 		}
5653 
5654 		error = hn_encap(ifp, txr, txd, &m_head);
5655 		if (error) {
5656 			/* Both txd and m_head are freed */
5657 			KASSERT(txr->hn_agg_txd == NULL,
5658 			    ("encap failed w/ pending aggregating txdesc"));
5659 			continue;
5660 		}
5661 
5662 		if (txr->hn_agg_pktleft == 0) {
5663 			if (txr->hn_agg_txd != NULL) {
5664 				KASSERT(m_head == NULL,
5665 				    ("pending mbuf for aggregating txdesc"));
5666 				error = hn_flush_txagg(ifp, txr);
5667 				if (__predict_false(error)) {
5668 					atomic_set_int(&ifp->if_drv_flags,
5669 					    IFF_DRV_OACTIVE);
5670 					break;
5671 				}
5672 			} else {
5673 				KASSERT(m_head != NULL, ("mbuf was freed"));
5674 				error = hn_txpkt(ifp, txr, txd);
5675 				if (__predict_false(error)) {
5676 					/* txd is freed, but m_head is not */
5677 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5678 					atomic_set_int(&ifp->if_drv_flags,
5679 					    IFF_DRV_OACTIVE);
5680 					break;
5681 				}
5682 			}
5683 		}
5684 #ifdef INVARIANTS
5685 		else {
5686 			KASSERT(txr->hn_agg_txd != NULL,
5687 			    ("no aggregating txdesc"));
5688 			KASSERT(m_head == NULL,
5689 			    ("pending mbuf for aggregating txdesc"));
5690 		}
5691 #endif
5692 	}
5693 
5694 	/* Flush pending aggregated transmission. */
5695 	if (txr->hn_agg_txd != NULL)
5696 		hn_flush_txagg(ifp, txr);
5697 	return (sched);
5698 }
5699 
5700 static void
5701 hn_start(struct ifnet *ifp)
5702 {
5703 	struct hn_softc *sc = ifp->if_softc;
5704 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5705 
5706 	if (txr->hn_sched_tx)
5707 		goto do_sched;
5708 
5709 	if (mtx_trylock(&txr->hn_tx_lock)) {
5710 		int sched;
5711 
5712 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5713 		mtx_unlock(&txr->hn_tx_lock);
5714 		if (!sched)
5715 			return;
5716 	}
5717 do_sched:
5718 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5719 }
5720 
5721 static void
5722 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5723 {
5724 	struct hn_tx_ring *txr = xtxr;
5725 
5726 	mtx_lock(&txr->hn_tx_lock);
5727 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5728 	hn_start_locked(txr, 0);
5729 	mtx_unlock(&txr->hn_tx_lock);
5730 }
5731 
5732 static void
5733 hn_start_txeof(struct hn_tx_ring *txr)
5734 {
5735 	struct hn_softc *sc = txr->hn_sc;
5736 	struct ifnet *ifp = sc->hn_ifp;
5737 
5738 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5739 
5740 	if (txr->hn_sched_tx)
5741 		goto do_sched;
5742 
5743 	if (mtx_trylock(&txr->hn_tx_lock)) {
5744 		int sched;
5745 
5746 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5747 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5748 		mtx_unlock(&txr->hn_tx_lock);
5749 		if (sched) {
5750 			taskqueue_enqueue(txr->hn_tx_taskq,
5751 			    &txr->hn_tx_task);
5752 		}
5753 	} else {
5754 do_sched:
5755 		/*
5756 		 * Release OACTIVE early, in the hope that others
5757 		 * can catch up.  The task will clear the
5758 		 * flag again with the hn_tx_lock to avoid possible
5759 		 * races.
5760 		 */
5761 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5762 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5763 	}
5764 }
5765 
5766 #endif	/* HN_IFSTART_SUPPORT */
5767 
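/*
 * Multiqueue (if_transmit) TX path.  Drain the per-ring buf_ring,
 * encapsulating and transmitting each packet; stop and mark the ring
 * oactive when descriptors run out or a send fails.  Returns non-zero
 * when the remaining work should be dispatched to the TX taskqueue.
 */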
5768 static int
5769 hn_xmit(struct hn_tx_ring *txr, int len)
5770 {
5771 	struct hn_softc *sc = txr->hn_sc;
5772 	struct ifnet *ifp = sc->hn_ifp;
5773 	struct mbuf *m_head;
5774 	int sched = 0;
5775 
5776 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5777 #ifdef HN_IFSTART_SUPPORT
5778 	KASSERT(hn_use_if_start == 0,
5779 	    ("hn_xmit is called when if_start is enabled"));
5780 #endif
5781 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5782 
5783 	if (__predict_false(txr->hn_suspended))
5784 		return (0);
5785 
5786 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5787 		return (0);
5788 
5789 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5790 		struct hn_txdesc *txd;
5791 		int error;
5792 
5793 		if (len > 0 && m_head->m_pkthdr.len > len) {
5794 			/*
5795 			 * This send could be time-consuming; let callers
5796 			 * dispatch this packet (and any follow-up packets)
5797 			 * to the TX taskqueue.
5798 			 */
5799 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5800 			sched = 1;
5801 			break;
5802 		}
5803 
5804 		txd = hn_txdesc_get(txr);
5805 		if (txd == NULL) {
5806 			txr->hn_no_txdescs++;
5807 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5808 			txr->hn_oactive = 1;
5809 			break;
5810 		}
5811 
5812 		error = hn_encap(ifp, txr, txd, &m_head);
5813 		if (error) {
5814 			/* Both txd and m_head are freed; discard */
5815 			KASSERT(txr->hn_agg_txd == NULL,
5816 			    ("encap failed w/ pending aggregating txdesc"));
5817 			drbr_advance(ifp, txr->hn_mbuf_br);
5818 			continue;
5819 		}
5820 
5821 		if (txr->hn_agg_pktleft == 0) {
5822 			if (txr->hn_agg_txd != NULL) {
5823 				KASSERT(m_head == NULL,
5824 				    ("pending mbuf for aggregating txdesc"));
5825 				error = hn_flush_txagg(ifp, txr);
5826 				if (__predict_false(error)) {
5827 					txr->hn_oactive = 1;
5828 					break;
5829 				}
5830 			} else {
5831 				KASSERT(m_head != NULL, ("mbuf was freed"));
5832 				error = hn_txpkt(ifp, txr, txd);
5833 				if (__predict_false(error)) {
5834 					/* txd is freed, but m_head is not */
5835 					drbr_putback(ifp, txr->hn_mbuf_br,
5836 					    m_head);
5837 					txr->hn_oactive = 1;
5838 					break;
5839 				}
5840 			}
5841 		}
5842 #ifdef INVARIANTS
5843 		else {
5844 			KASSERT(txr->hn_agg_txd != NULL,
5845 			    ("no aggregating txdesc"));
5846 			KASSERT(m_head == NULL,
5847 			    ("pending mbuf for aggregating txdesc"));
5848 		}
5849 #endif
5850 
5851 		/* Sent */
5852 		drbr_advance(ifp, txr->hn_mbuf_br);
5853 	}
5854 
5855 	/* Flush pending aggregated transmission. */
5856 	if (txr->hn_agg_txd != NULL)
5857 		hn_flush_txagg(ifp, txr);
5858 	return (sched);
5859 }
5860 
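/*
 * if_transmit entry point.  If the transparent VF is active, hand the
 * packet to the VF (tapping BPF on this interface as configured) and
 * account the counters here.  Otherwise fix up the packet headers,
 * select a TX ring from the flowid (TCP SYN segments are pinned to
 * ring 0), enqueue the packet and kick hn_xmit().
 */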
5861 static int
5862 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5863 {
5864 	struct hn_softc *sc = ifp->if_softc;
5865 	struct hn_tx_ring *txr;
5866 	int error, idx = 0;
5867 
5868 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5869 		struct rm_priotracker pt;
5870 
5871 		rm_rlock(&sc->hn_vf_lock, &pt);
5872 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5873 			struct mbuf *m_bpf = NULL;
5874 			int obytes, omcast = 0;
5875 
5876 			obytes = m->m_pkthdr.len;
5877 			if (m->m_flags & M_MCAST)
5878 				omcast = 1;
5879 
5880 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5881 				if (bpf_peers_present(ifp->if_bpf)) {
5882 					m_bpf = m_copypacket(m, M_NOWAIT);
5883 					if (m_bpf == NULL) {
5884 						/*
5885 						 * Failed to grab a shallow
5886 						 * copy; tap now.
5887 						 */
5888 						ETHER_BPF_MTAP(ifp, m);
5889 					}
5890 				}
5891 			} else {
5892 				ETHER_BPF_MTAP(ifp, m);
5893 			}
5894 
5895 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5896 			rm_runlock(&sc->hn_vf_lock, &pt);
5897 
5898 			if (m_bpf != NULL) {
5899 				if (!error)
5900 					ETHER_BPF_MTAP(ifp, m_bpf);
5901 				m_freem(m_bpf);
5902 			}
5903 
5904 			if (error == ENOBUFS) {
5905 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5906 			} else if (error) {
5907 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5908 			} else {
5909 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5910 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5911 				if (omcast) {
5912 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5913 					    omcast);
5914 				}
5915 			}
5916 			return (error);
5917 		}
5918 		rm_runlock(&sc->hn_vf_lock, &pt);
5919 	}
5920 
5921 #if defined(INET6) || defined(INET)
5922 	/*
5923 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5924 	 * since packet headers should be cache-hot.
5925 	 */
5926 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5927 		m = hn_tso_fixup(m);
5928 		if (__predict_false(m == NULL)) {
5929 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5930 			return EIO;
5931 		}
5932 	} else if (m->m_pkthdr.csum_flags &
5933 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5934 		m = hn_set_hlen(m);
5935 		if (__predict_false(m == NULL)) {
5936 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5937 			return EIO;
5938 		}
5939 	}
5940 #endif
5941 
5942 	/*
5943 	 * Select the TX ring based on flowid
5944 	 */
5945 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5946 #ifdef RSS
5947 		uint32_t bid;
5948 
5949 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5950 		    &bid) == 0)
5951 			idx = bid % sc->hn_tx_ring_inuse;
5952 		else
5953 #endif
5954 		{
5955 #if defined(INET6) || defined(INET)
5956 			int tcpsyn = 0;
5957 
5958 			if (m->m_pkthdr.len < 128 &&
5959 			    (m->m_pkthdr.csum_flags &
5960 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5961 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5962 				m = hn_check_tcpsyn(m, &tcpsyn);
5963 				if (__predict_false(m == NULL)) {
5964 					if_inc_counter(ifp,
5965 					    IFCOUNTER_OERRORS, 1);
5966 					return (EIO);
5967 				}
5968 			}
5969 #else
5970 			const int tcpsyn = 0;
5971 #endif
5972 			if (tcpsyn)
5973 				idx = 0;
5974 			else
5975 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5976 		}
5977 	}
5978 	txr = &sc->hn_tx_ring[idx];
5979 
5980 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5981 	if (error) {
5982 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5983 		return error;
5984 	}
5985 
5986 	if (txr->hn_oactive)
5987 		return 0;
5988 
5989 	if (txr->hn_sched_tx)
5990 		goto do_sched;
5991 
5992 	if (mtx_trylock(&txr->hn_tx_lock)) {
5993 		int sched;
5994 
5995 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5996 		mtx_unlock(&txr->hn_tx_lock);
5997 		if (!sched)
5998 			return 0;
5999 	}
6000 do_sched:
6001 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6002 	return 0;
6003 }
6004 
6005 static void
6006 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6007 {
6008 	struct mbuf *m;
6009 
6010 	mtx_lock(&txr->hn_tx_lock);
6011 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6012 		m_freem(m);
6013 	mtx_unlock(&txr->hn_tx_lock);
6014 }
6015 
6016 static void
6017 hn_xmit_qflush(struct ifnet *ifp)
6018 {
6019 	struct hn_softc *sc = ifp->if_softc;
6020 	struct rm_priotracker pt;
6021 	int i;
6022 
6023 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6024 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6025 	if_qflush(ifp);
6026 
6027 	rm_rlock(&sc->hn_vf_lock, &pt);
6028 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6029 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6030 	rm_runlock(&sc->hn_vf_lock, &pt);
6031 }
6032 
6033 static void
6034 hn_xmit_txeof(struct hn_tx_ring *txr)
6035 {
6036 
6037 	if (txr->hn_sched_tx)
6038 		goto do_sched;
6039 
6040 	if (mtx_trylock(&txr->hn_tx_lock)) {
6041 		int sched;
6042 
6043 		txr->hn_oactive = 0;
6044 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6045 		mtx_unlock(&txr->hn_tx_lock);
6046 		if (sched) {
6047 			taskqueue_enqueue(txr->hn_tx_taskq,
6048 			    &txr->hn_tx_task);
6049 		}
6050 	} else {
6051 do_sched:
6052 		/*
6053 		 * Release oactive early, in the hope that others
6054 		 * can catch up.  The task will clear the
6055 		 * oactive again with the hn_tx_lock to avoid possible
6056 		 * races.
6057 		 */
6058 		txr->hn_oactive = 0;
6059 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6060 	}
6061 }
6062 
6063 static void
6064 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6065 {
6066 	struct hn_tx_ring *txr = xtxr;
6067 
6068 	mtx_lock(&txr->hn_tx_lock);
6069 	hn_xmit(txr, 0);
6070 	mtx_unlock(&txr->hn_tx_lock);
6071 }
6072 
6073 static void
6074 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6075 {
6076 	struct hn_tx_ring *txr = xtxr;
6077 
6078 	mtx_lock(&txr->hn_tx_lock);
6079 	txr->hn_oactive = 0;
6080 	hn_xmit(txr, 0);
6081 	mtx_unlock(&txr->hn_tx_lock);
6082 }
6083 
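/*
 * Associate a VMBus channel with its RX ring (and TX ring, if the
 * sub-channel index falls within the TX rings in use), bind it to the
 * ring's CPU, and open it on the per-ring bufring with hn_chan_callback()
 * as the interrupt handler.
 */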
6084 static int
6085 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6086 {
6087 	struct vmbus_chan_br cbr;
6088 	struct hn_rx_ring *rxr;
6089 	struct hn_tx_ring *txr = NULL;
6090 	int idx, error;
6091 
6092 	idx = vmbus_chan_subidx(chan);
6093 
6094 	/*
6095 	 * Link this channel to RX/TX ring.
6096 	 */
6097 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6098 	    ("invalid channel index %d, should be >= 0 && < %d",
6099 	     idx, sc->hn_rx_ring_inuse));
6100 	rxr = &sc->hn_rx_ring[idx];
6101 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6102 	    ("RX ring %d already attached", idx));
6103 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6104 	rxr->hn_chan = chan;
6105 
6106 	if (bootverbose) {
6107 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6108 		    idx, vmbus_chan_id(chan));
6109 	}
6110 
6111 	if (idx < sc->hn_tx_ring_inuse) {
6112 		txr = &sc->hn_tx_ring[idx];
6113 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6114 		    ("TX ring %d already attached", idx));
6115 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6116 
6117 		txr->hn_chan = chan;
6118 		if (bootverbose) {
6119 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6120 			    idx, vmbus_chan_id(chan));
6121 		}
6122 	}
6123 
6124 	/* Bind this channel to a proper CPU. */
6125 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6126 
6127 	/*
6128 	 * Open this channel
6129 	 */
6130 	cbr.cbr = rxr->hn_br;
6131 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6132 	cbr.cbr_txsz = HN_TXBR_SIZE;
6133 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6134 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6135 	if (error) {
6136 		if (error == EISCONN) {
6137 			if_printf(sc->hn_ifp, "bufring is connected after "
6138 			    "chan%u open failure\n", vmbus_chan_id(chan));
6139 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6140 		} else {
6141 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6142 			    vmbus_chan_id(chan), error);
6143 		}
6144 	}
6145 	return (error);
6146 }
6147 
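/*
 * Unlink a VMBus channel from its RX/TX ring and close it.  If the
 * bufring is still connected after the close (EISCONN), mark it
 * referenced (HN_RX_FLAG_BR_REF).
 */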
6148 static void
6149 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6150 {
6151 	struct hn_rx_ring *rxr;
6152 	int idx, error;
6153 
6154 	idx = vmbus_chan_subidx(chan);
6155 
6156 	/*
6157 	 * Unlink this channel from the RX/TX ring.
6158 	 */
6159 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6160 	    ("invalid channel index %d, should be >= 0 && < %d",
6161 	     idx, sc->hn_rx_ring_inuse));
6162 	rxr = &sc->hn_rx_ring[idx];
6163 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6164 	    ("RX ring %d is not attached", idx));
6165 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6166 
6167 	if (idx < sc->hn_tx_ring_inuse) {
6168 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6169 
6170 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6171 		    ("TX ring %d is not attached", idx));
6172 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6173 	}
6174 
6175 	/*
6176 	 * Close this channel.
6177 	 *
6178 	 * NOTE:
6179 	 * Channel closing does _not_ destroy the target channel.
6180 	 */
6181 	error = vmbus_chan_close_direct(chan);
6182 	if (error == EISCONN) {
6183 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6184 		    "after being closed\n", vmbus_chan_id(chan));
6185 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6186 	} else if (error) {
6187 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6188 		    vmbus_chan_id(chan), error);
6189 	}
6190 }
6191 
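/*
 * Attach all allocated sub-channels.  Errors are recorded but the loop
 * continues, since all channels will be detached later on failure.
 */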
6192 static int
6193 hn_attach_subchans(struct hn_softc *sc)
6194 {
6195 	struct vmbus_channel **subchans;
6196 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6197 	int i, error = 0;
6198 
6199 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6200 
6201 	/* Attach the sub-channels. */
6202 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6203 	for (i = 0; i < subchan_cnt; ++i) {
6204 		int error1;
6205 
6206 		error1 = hn_chan_attach(sc, subchans[i]);
6207 		if (error1) {
6208 			error = error1;
6209 			/* Move on; all channels will be detached later. */
6210 		}
6211 	}
6212 	vmbus_subchan_rel(subchans, subchan_cnt);
6213 
6214 	if (error) {
6215 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6216 	} else {
6217 		if (bootverbose) {
6218 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6219 			    subchan_cnt);
6220 		}
6221 	}
6222 	return (error);
6223 }
6224 
6225 static void
6226 hn_detach_allchans(struct hn_softc *sc)
6227 {
6228 	struct vmbus_channel **subchans;
6229 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6230 	int i;
6231 
6232 	if (subchan_cnt == 0)
6233 		goto back;
6234 
6235 	/* Detach the sub-channels. */
6236 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6237 	for (i = 0; i < subchan_cnt; ++i)
6238 		hn_chan_detach(sc, subchans[i]);
6239 	vmbus_subchan_rel(subchans, subchan_cnt);
6240 
6241 back:
6242 	/*
6243 	 * Detach the primary channel, _after_ all sub-channels
6244 	 * are detached.
6245 	 */
6246 	hn_chan_detach(sc, sc->hn_prichan);
6247 
6248 	/* Wait for sub-channels to be destroyed, if any. */
6249 	vmbus_subchan_drain(sc->hn_prichan);
6250 
6251 #ifdef INVARIANTS
6252 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6253 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6254 		    HN_RX_FLAG_ATTACHED) == 0,
6255 		    ("%dth RX ring is still attached", i));
6256 	}
6257 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6258 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6259 		    HN_TX_FLAG_ATTACHED) == 0,
6260 		    ("%dth TX ring is still attached", i));
6261 	}
6262 #endif
6263 }
6264 
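/*
 * Decide how many sub-channels to use, based on the requested ring count
 * and the RSS capabilities reported by RNDIS, allocate them through NVS,
 * and wait until they are ready.  On any benign failure *nsubch is set
 * to 0, i.e. only the primary channel will be used.
 */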
6265 static int
6266 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6267 {
6268 	struct vmbus_channel **subchans;
6269 	int nchan, rxr_cnt, error;
6270 
6271 	nchan = *nsubch + 1;
6272 	if (nchan == 1) {
6273 		/*
6274 		 * Multiple RX/TX rings are not requested.
6275 		 */
6276 		*nsubch = 0;
6277 		return (0);
6278 	}
6279 
6280 	/*
6281 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6282 	 * table entries.
6283 	 */
6284 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6285 	if (error) {
6286 		/* No RSS; this is benign. */
6287 		*nsubch = 0;
6288 		return (0);
6289 	}
6290 	if (bootverbose) {
6291 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6292 		    rxr_cnt, nchan);
6293 	}
6294 
6295 	if (nchan > rxr_cnt)
6296 		nchan = rxr_cnt;
6297 	if (nchan == 1) {
6298 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6299 		*nsubch = 0;
6300 		return (0);
6301 	}
6302 
6303 	/*
6304 	 * Allocate sub-channels from NVS.
6305 	 */
6306 	*nsubch = nchan - 1;
6307 	error = hn_nvs_alloc_subchans(sc, nsubch);
6308 	if (error || *nsubch == 0) {
6309 		/* Failed to allocate sub-channels. */
6310 		*nsubch = 0;
6311 		return (0);
6312 	}
6313 
6314 	/*
6315 	 * Wait for all sub-channels to become ready before moving on.
6316 	 */
6317 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6318 	vmbus_subchan_rel(subchans, *nsubch);
6319 	return (0);
6320 }
6321 
6322 static bool
6323 hn_synth_attachable(const struct hn_softc *sc)
6324 {
6325 	int i;
6326 
6327 	if (sc->hn_flags & HN_FLAG_ERRORS)
6328 		return (false);
6329 
6330 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6331 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6332 
6333 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6334 			return (false);
6335 	}
6336 	return (true);
6337 }
6338 
6339 /*
6340  * Make sure that the RX filter is zero after the successful
6341  * RNDIS initialization.
6342  *
6343  * NOTE:
6344  * Under certain conditions on certain versions of Hyper-V,
6345  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6346  * after the successful RNDIS initialization, which breaks
6347  * the assumption of any following code (well, it breaks the
6348  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6349  * explicitly, drain packets sneaking through, and drain the
6350  * interrupt taskqueues scheduled due to the stealth packets.
6351  */
6352 static void
6353 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6354 {
6355 
6356 	hn_disable_rx(sc);
6357 	hn_drain_rxtx(sc, nchan);
6358 }
6359 
6360 static int
6361 hn_synth_attach(struct hn_softc *sc, int mtu)
6362 {
6363 #define ATTACHED_NVS		0x0002
6364 #define ATTACHED_RNDIS		0x0004
6365 
6366 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6367 	int error, nsubch, nchan = 1, i, rndis_inited;
6368 	uint32_t old_caps, attached = 0;
6369 
6370 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6371 	    ("synthetic parts were attached"));
6372 
6373 	if (!hn_synth_attachable(sc))
6374 		return (ENXIO);
6375 
6376 	/* Save capabilities for later verification. */
6377 	old_caps = sc->hn_caps;
6378 	sc->hn_caps = 0;
6379 
6380 	/* Clear RSS stuffs. */
6381 	sc->hn_rss_ind_size = 0;
6382 	sc->hn_rss_hash = 0;
6383 	sc->hn_rss_hcap = 0;
6384 
6385 	/*
6386 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6387 	 */
6388 	error = hn_chan_attach(sc, sc->hn_prichan);
6389 	if (error)
6390 		goto failed;
6391 
6392 	/*
6393 	 * Attach NVS.
6394 	 */
6395 	error = hn_nvs_attach(sc, mtu);
6396 	if (error)
6397 		goto failed;
6398 	attached |= ATTACHED_NVS;
6399 
6400 	/*
6401 	 * Attach RNDIS _after_ NVS is attached.
6402 	 */
6403 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6404 	if (rndis_inited)
6405 		attached |= ATTACHED_RNDIS;
6406 	if (error)
6407 		goto failed;
6408 
6409 	/*
6410 	 * Make sure capabilities are not changed.
6411 	 */
6412 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6413 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6414 		    old_caps, sc->hn_caps);
6415 		error = ENXIO;
6416 		goto failed;
6417 	}
6418 
6419 	/*
6420 	 * Allocate sub-channels for multi-TX/RX rings.
6421 	 *
6422 	 * NOTE:
6423 	 * The # of RX rings that can be used is equivalent to the # of
6424 	 * channels to be requested.
6425 	 */
6426 	nsubch = sc->hn_rx_ring_cnt - 1;
6427 	error = hn_synth_alloc_subchans(sc, &nsubch);
6428 	if (error)
6429 		goto failed;
6430 	/* NOTE: _Full_ synthetic parts detach is required now. */
6431 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6432 
6433 	/*
6434 	 * Set the # of TX/RX rings that could be used according to
6435 	 * the # of channels that NVS offered.
6436 	 */
6437 	nchan = nsubch + 1;
6438 	hn_set_ring_inuse(sc, nchan);
6439 	if (nchan == 1) {
6440 		/* Only the primary channel can be used; done */
6441 		goto back;
6442 	}
6443 
6444 	/*
6445 	 * Attach the sub-channels.
6446 	 *
6447 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6448 	 */
6449 	error = hn_attach_subchans(sc);
6450 	if (error)
6451 		goto failed;
6452 
6453 	/*
6454 	 * Configure RSS key and indirect table _after_ all sub-channels
6455 	 * are attached.
6456 	 */
6457 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6458 		/*
6459 		 * RSS key is not set yet; set it to the default RSS key.
6460 		 */
6461 		if (bootverbose)
6462 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6463 #ifdef RSS
6464 		rss_getkey(rss->rss_key);
6465 #else
6466 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6467 #endif
6468 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6469 	}
6470 
6471 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6472 		/*
6473 		 * RSS indirect table is not set yet; set it up in round-
6474 		 * robin fashion.
6475 		 */
6476 		if (bootverbose) {
6477 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6478 			    "table\n");
6479 		}
6480 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6481 			uint32_t subidx;
6482 
6483 #ifdef RSS
6484 			subidx = rss_get_indirection_to_bucket(i);
6485 #else
6486 			subidx = i;
6487 #endif
6488 			rss->rss_ind[i] = subidx % nchan;
6489 		}
6490 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6491 	} else {
6492 		/*
6493 		 * # of usable channels may be changed, so we have to
6494 		 * make sure that all entries in RSS indirect table
6495 		 * are valid.
6496 		 *
6497 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6498 		 */
6499 		hn_rss_ind_fixup(sc);
6500 	}
6501 
6502 	sc->hn_rss_hash = sc->hn_rss_hcap;
6503 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6504 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6505 		/* NOTE: Don't reconfigure RSS here; it is done right below. */
6506 		hn_vf_rss_fixup(sc, false);
6507 	}
6508 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6509 	if (error)
6510 		goto failed;
6511 back:
6512 	/*
6513 	 * Fixup transmission aggregation setup.
6514 	 */
6515 	hn_set_txagg(sc);
6516 	hn_rndis_init_fixat(sc, nchan);
6517 	return (0);
6518 
6519 failed:
6520 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6521 		hn_rndis_init_fixat(sc, nchan);
6522 		hn_synth_detach(sc);
6523 	} else {
6524 		if (attached & ATTACHED_RNDIS) {
6525 			hn_rndis_init_fixat(sc, nchan);
6526 			hn_rndis_detach(sc);
6527 		}
6528 		if (attached & ATTACHED_NVS)
6529 			hn_nvs_detach(sc);
6530 		hn_chan_detach(sc, sc->hn_prichan);
6531 		/* Restore old capabilities. */
6532 		sc->hn_caps = old_caps;
6533 	}
6534 	return (error);
6535 
6536 #undef ATTACHED_RNDIS
6537 #undef ATTACHED_NVS
6538 }
6539 
6540 /*
6541  * NOTE:
6542  * The interface must have been suspended through hn_suspend(), before
6543  * this function gets called.
6544  */
6545 static void
6546 hn_synth_detach(struct hn_softc *sc)
6547 {
6548 
6549 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6550 	    ("synthetic parts were not attached"));
6551 
6552 	/* Detach the RNDIS first. */
6553 	hn_rndis_detach(sc);
6554 
6555 	/* Detach NVS. */
6556 	hn_nvs_detach(sc);
6557 
6558 	/* Detach all of the channels. */
6559 	hn_detach_allchans(sc);
6560 
6561 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6562 }
6563 
6564 static void
6565 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6566 {
6567 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6568 	    ("invalid ring count %d", ring_cnt));
6569 
6570 	if (sc->hn_tx_ring_cnt > ring_cnt)
6571 		sc->hn_tx_ring_inuse = ring_cnt;
6572 	else
6573 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6574 	sc->hn_rx_ring_inuse = ring_cnt;
6575 
6576 #ifdef RSS
6577 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6578 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6579 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6580 		    rss_getnumbuckets());
6581 	}
6582 #endif
6583 
6584 	if (bootverbose) {
6585 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6586 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6587 	}
6588 }
6589 
6590 static void
6591 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6592 {
6593 
6594 	/*
6595 	 * NOTE:
6596 	 * The TX bufring will not be drained by the hypervisor
6597 	 * if the primary channel is revoked.
6598 	 */
6599 	while (!vmbus_chan_rx_empty(chan) ||
6600 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6601 	     !vmbus_chan_tx_empty(chan)))
6602 		pause("waitch", 1);
6603 	vmbus_chan_intr_drain(chan);
6604 }
6605 
6606 static void
6607 hn_disable_rx(struct hn_softc *sc)
6608 {
6609 
6610 	/*
6611 	 * Disable RX by clearing RX filter forcefully.
6612 	 */
6613 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6614 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6615 
6616 	/*
6617 	 * Give RNDIS enough time to flush all pending data packets.
6618 	 */
6619 	pause("waitrx", (200 * hz) / 1000);
6620 }
6621 
6622 /*
6623  * NOTE:
6624  * RX/TX _must_ have been suspended/disabled, before this function
6625  * is called.
6626  */
6627 static void
6628 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6629 {
6630 	struct vmbus_channel **subch = NULL;
6631 	int nsubch;
6632 
6633 	/*
6634 	 * Drain RX/TX bufrings and interrupts.
6635 	 */
6636 	nsubch = nchan - 1;
6637 	if (nsubch > 0)
6638 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6639 
6640 	if (subch != NULL) {
6641 		int i;
6642 
6643 		for (i = 0; i < nsubch; ++i)
6644 			hn_chan_drain(sc, subch[i]);
6645 	}
6646 	hn_chan_drain(sc, sc->hn_prichan);
6647 
6648 	if (subch != NULL)
6649 		vmbus_subchan_rel(subch, nsubch);
6650 }
6651 
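/*
 * Quiesce the data path: mark all in-use TX rings suspended, wait for
 * pending sends to complete, clear the RX filter, drain the RX/TX
 * bufrings and interrupts, and finally drain the TX tasks.
 */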
6652 static void
6653 hn_suspend_data(struct hn_softc *sc)
6654 {
6655 	struct hn_tx_ring *txr;
6656 	int i;
6657 
6658 	HN_LOCK_ASSERT(sc);
6659 
6660 	/*
6661 	 * Suspend TX.
6662 	 */
6663 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6664 		txr = &sc->hn_tx_ring[i];
6665 
6666 		mtx_lock(&txr->hn_tx_lock);
6667 		txr->hn_suspended = 1;
6668 		mtx_unlock(&txr->hn_tx_lock);
6669 		/* No one is able to send more packets now. */
6670 
6671 		/*
6672 		 * Wait for all pending sends to finish.
6673 		 *
6674 		 * NOTE:
6675 		 * We will _not_ receive all pending send-dones if the
6676 		 * primary channel is revoked.
6677 		 */
6678 		while (hn_tx_ring_pending(txr) &&
6679 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6680 			pause("hnwtx", 1 /* 1 tick */);
6681 	}
6682 
6683 	/*
6684 	 * Disable RX.
6685 	 */
6686 	hn_disable_rx(sc);
6687 
6688 	/*
6689 	 * Drain RX/TX.
6690 	 */
6691 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6692 
6693 	/*
6694 	 * Drain any pending TX tasks.
6695 	 *
6696 	 * NOTE:
6697 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6698 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6699 	 */
6700 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6701 		txr = &sc->hn_tx_ring[i];
6702 
6703 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6704 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6705 	}
6706 }
6707 
6708 static void
6709 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6710 {
6711 
6712 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6713 }
6714 
6715 static void
6716 hn_suspend_mgmt(struct hn_softc *sc)
6717 {
6718 	struct task task;
6719 
6720 	HN_LOCK_ASSERT(sc);
6721 
6722 	/*
6723 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6724 	 * through hn_mgmt_taskq.
6725 	 */
6726 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6727 	vmbus_chan_run_task(sc->hn_prichan, &task);
6728 
6729 	/*
6730 	 * Make sure that all pending management tasks are completed.
6731 	 */
6732 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6733 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6734 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6735 }
6736 
6737 static void
6738 hn_suspend(struct hn_softc *sc)
6739 {
6740 
6741 	/* Disable polling. */
6742 	hn_polling(sc, 0);
6743 
6744 	/*
6745 	 * If the non-transparent mode VF is activated, the synthetic
6746 	 * device is receiving packets, so the data path of the
6747 	 * synthetic device must be suspended.
6748 	 */
6749 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6750 	    (sc->hn_flags & HN_FLAG_RXVF))
6751 		hn_suspend_data(sc);
6752 	hn_suspend_mgmt(sc);
6753 }
6754 
6755 static void
6756 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6757 {
6758 	int i;
6759 
6760 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6761 	    ("invalid TX ring count %d", tx_ring_cnt));
6762 
6763 	for (i = 0; i < tx_ring_cnt; ++i) {
6764 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6765 
6766 		mtx_lock(&txr->hn_tx_lock);
6767 		txr->hn_suspended = 0;
6768 		mtx_unlock(&txr->hn_tx_lock);
6769 	}
6770 }
6771 
6772 static void
6773 hn_resume_data(struct hn_softc *sc)
6774 {
6775 	int i;
6776 
6777 	HN_LOCK_ASSERT(sc);
6778 
6779 	/*
6780 	 * Re-enable RX.
6781 	 */
6782 	hn_rxfilter_config(sc);
6783 
6784 	/*
6785 	 * Make sure to clear suspend status on "all" TX rings,
6786 	 * since hn_tx_ring_inuse can be changed after
6787 	 * hn_suspend_data().
6788 	 */
6789 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6790 
6791 #ifdef HN_IFSTART_SUPPORT
6792 	if (!hn_use_if_start)
6793 #endif
6794 	{
6795 		/*
6796 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6797 		 * reduced.
6798 		 */
6799 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6800 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6801 	}
6802 
6803 	/*
6804 	 * Kick start TX.
6805 	 */
6806 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6807 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6808 
6809 		/*
6810 		 * Use txeof task, so that any pending oactive can be
6811 		 * cleared properly.
6812 		 */
6813 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6814 	}
6815 }
6816 
6817 static void
6818 hn_resume_mgmt(struct hn_softc *sc)
6819 {
6820 
6821 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6822 
6823 	/*
6824 	 * Kick off network change detection, if it was pending.
6825 	 * If no network change was pending, start link status
6826 	 * checks, which is more lightweight than network change
6827 	 * detection.
6828 	 */
6829 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6830 		hn_change_network(sc);
6831 	else
6832 		hn_update_link_status(sc);
6833 }
6834 
6835 static void
6836 hn_resume(struct hn_softc *sc)
6837 {
6838 
6839 	/*
6840 	 * If the non-transparent mode VF is activated, the synthetic
6841 	 * device has to receive packets, so the data path of the
6842 	 * synthetic device must be resumed.
6843 	 */
6844 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6845 	    (sc->hn_flags & HN_FLAG_RXVF))
6846 		hn_resume_data(sc);
6847 
6848 	/*
6849 	 * Don't resume link status change if VF is attached/activated.
6850 	 * - In the non-transparent VF mode, the synthetic device marks
6851 	 *   link down until the VF is deactivated; i.e. VF is down.
6852 	 * - In transparent VF mode, VF's media status is used until
6853 	 *   the VF is detached.
6854 	 */
6855 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6856 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6857 		hn_resume_mgmt(sc);
6858 
6859 	/*
6860 	 * Re-enable polling if this interface is running and
6861 	 * the polling is requested.
6862 	 */
6863 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6864 		hn_polling(sc, sc->hn_pollhz);
6865 }
6866 
6867 static void
6868 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6869 {
6870 	const struct rndis_status_msg *msg;
6871 	int ofs;
6872 
6873 	if (dlen < sizeof(*msg)) {
6874 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6875 		return;
6876 	}
6877 	msg = data;
6878 
6879 	switch (msg->rm_status) {
6880 	case RNDIS_STATUS_MEDIA_CONNECT:
6881 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6882 		hn_update_link_status(sc);
6883 		break;
6884 
6885 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6886 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6887 		/* Not really useful; ignore. */
6888 		break;
6889 
6890 	case RNDIS_STATUS_NETWORK_CHANGE:
6891 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6892 		if (dlen < ofs + msg->rm_stbuflen ||
6893 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6894 			if_printf(sc->hn_ifp, "network changed\n");
6895 		} else {
6896 			uint32_t change;
6897 
6898 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6899 			    sizeof(change));
6900 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6901 			    change);
6902 		}
6903 		hn_change_network(sc);
6904 		break;
6905 
6906 	default:
6907 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6908 		    msg->rm_status);
6909 		break;
6910 	}
6911 }
6912 
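/*
 * Walk the RNDIS per-packet-info elements and extract VLAN, checksum and
 * hash information into 'info', validating the size of each element
 * along the way.
 */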
6913 static int
6914 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6915 {
6916 	const struct rndis_pktinfo *pi = info_data;
6917 	uint32_t mask = 0;
6918 
6919 	while (info_dlen != 0) {
6920 		const void *data;
6921 		uint32_t dlen;
6922 
6923 		if (__predict_false(info_dlen < sizeof(*pi)))
6924 			return (EINVAL);
6925 		if (__predict_false(info_dlen < pi->rm_size))
6926 			return (EINVAL);
6927 		info_dlen -= pi->rm_size;
6928 
6929 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6930 			return (EINVAL);
6931 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6932 			return (EINVAL);
6933 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6934 		data = pi->rm_data;
6935 
6936 		switch (pi->rm_type) {
6937 		case NDIS_PKTINFO_TYPE_VLAN:
6938 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6939 				return (EINVAL);
6940 			info->vlan_info = *((const uint32_t *)data);
6941 			mask |= HN_RXINFO_VLAN;
6942 			break;
6943 
6944 		case NDIS_PKTINFO_TYPE_CSUM:
6945 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6946 				return (EINVAL);
6947 			info->csum_info = *((const uint32_t *)data);
6948 			mask |= HN_RXINFO_CSUM;
6949 			break;
6950 
6951 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6952 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6953 				return (EINVAL);
6954 			info->hash_value = *((const uint32_t *)data);
6955 			mask |= HN_RXINFO_HASHVAL;
6956 			break;
6957 
6958 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6959 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6960 				return (EINVAL);
6961 			info->hash_info = *((const uint32_t *)data);
6962 			mask |= HN_RXINFO_HASHINF;
6963 			break;
6964 
6965 		default:
6966 			goto next;
6967 		}
6968 
6969 		if (mask == HN_RXINFO_ALL) {
6970 			/* All found; done */
6971 			break;
6972 		}
6973 next:
6974 		pi = (const struct rndis_pktinfo *)
6975 		    ((const uint8_t *)pi + pi->rm_size);
6976 	}
6977 
6978 	/*
6979 	 * Final fixup.
6980 	 * - If there is no hash value, invalidate the hash info.
6981 	 */
6982 	if ((mask & HN_RXINFO_HASHVAL) == 0)
6983 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6984 	return (0);
6985 }
6986 
6987 static __inline bool
6988 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6989 {
6990 
6991 	if (off < check_off) {
6992 		if (__predict_true(off + len <= check_off))
6993 			return (false);
6994 	} else if (off > check_off) {
6995 		if (__predict_true(check_off + check_len <= off))
6996 			return (false);
6997 	}
6998 	return (true);
6999 }
7000 
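/*
 * Validate an RNDIS data message: check the message length, the
 * data/OOB/pktinfo offsets and their mutual overlap, extract the useful
 * per-packet-info, then hand the payload to hn_rxpkt().
 */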
7001 static void
7002 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7003 {
7004 	const struct rndis_packet_msg *pkt;
7005 	struct hn_rxinfo info;
7006 	int data_off, pktinfo_off, data_len, pktinfo_len;
7007 
7008 	/*
7009 	 * Check length.
7010 	 */
7011 	if (__predict_false(dlen < sizeof(*pkt))) {
7012 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7013 		return;
7014 	}
7015 	pkt = data;
7016 
7017 	if (__predict_false(dlen < pkt->rm_len)) {
7018 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7019 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7020 		return;
7021 	}
7022 	if (__predict_false(pkt->rm_len <
7023 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7024 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7025 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7026 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7027 		    pkt->rm_pktinfolen);
7028 		return;
7029 	}
7030 	if (__predict_false(pkt->rm_datalen == 0)) {
7031 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7032 		return;
7033 	}
7034 
7035 	/*
7036 	 * Check offsets.
7037 	 */
7038 #define IS_OFFSET_INVALID(ofs)			\
7039 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7040 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7041 
7042 	/* XXX Hyper-V does not meet data offset alignment requirement */
7043 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7044 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7045 		    "data offset %u\n", pkt->rm_dataoffset);
7046 		return;
7047 	}
7048 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7049 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7050 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7051 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7052 		return;
7053 	}
7054 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7055 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7056 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7057 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7058 		return;
7059 	}
7060 
7061 #undef IS_OFFSET_INVALID
7062 
7063 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7064 	data_len = pkt->rm_datalen;
7065 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7066 	pktinfo_len = pkt->rm_pktinfolen;
7067 
7068 	/*
7069 	 * Check OOB coverage.
7070 	 */
7071 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7072 		int oob_off, oob_len;
7073 
7074 		if_printf(rxr->hn_ifp, "got oobdata\n");
7075 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7076 		oob_len = pkt->rm_oobdatalen;
7077 
7078 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7079 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7080 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7081 			    pkt->rm_len, oob_off, oob_len);
7082 			return;
7083 		}
7084 
7085 		/*
7086 		 * Check against data.
7087 		 */
7088 		if (hn_rndis_check_overlap(oob_off, oob_len,
7089 		    data_off, data_len)) {
7090 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7091 			    "oob overlaps data, oob abs %d len %d, "
7092 			    "data abs %d len %d\n",
7093 			    oob_off, oob_len, data_off, data_len);
7094 			return;
7095 		}
7096 
7097 		/*
7098 		 * Check against pktinfo.
7099 		 */
7100 		if (pktinfo_len != 0 &&
7101 		    hn_rndis_check_overlap(oob_off, oob_len,
7102 		    pktinfo_off, pktinfo_len)) {
7103 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7104 			    "oob overlaps pktinfo, oob abs %d len %d, "
7105 			    "pktinfo abs %d len %d\n",
7106 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7107 			return;
7108 		}
7109 	}
7110 
7111 	/*
7112 	 * Check per-packet-info coverage and find useful per-packet-info.
7113 	 */
7114 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7115 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7116 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7117 	if (__predict_true(pktinfo_len != 0)) {
7118 		bool overlap;
7119 		int error;
7120 
7121 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7122 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7123 			    "pktinfo overflow, msglen %u, "
7124 			    "pktinfo abs %d len %d\n",
7125 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7126 			return;
7127 		}
7128 
7129 		/*
7130 		 * Check packet info coverage.
7131 		 */
7132 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7133 		    data_off, data_len);
7134 		if (__predict_false(overlap)) {
7135 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7136 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7137 			    "data abs %d len %d\n",
7138 			    pktinfo_off, pktinfo_len, data_off, data_len);
7139 			return;
7140 		}
7141 
7142 		/*
7143 		 * Find useful per-packet-info.
7144 		 */
7145 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7146 		    pktinfo_len, &info);
7147 		if (__predict_false(error)) {
7148 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7149 			    "pktinfo\n");
7150 			return;
7151 		}
7152 	}
7153 
7154 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7155 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7156 		    "data overflow, msglen %u, data abs %d len %d\n",
7157 		    pkt->rm_len, data_off, data_len);
7158 		return;
7159 	}
7160 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7161 }
7162 
7163 static __inline void
7164 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7165 {
7166 	const struct rndis_msghdr *hdr;
7167 
7168 	if (__predict_false(dlen < sizeof(*hdr))) {
7169 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7170 		return;
7171 	}
7172 	hdr = data;
7173 
7174 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7175 		/* Hot data path. */
7176 		hn_rndis_rx_data(rxr, data, dlen);
7177 		/* Done! */
7178 		return;
7179 	}
7180 
7181 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7182 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7183 	else
7184 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7185 }
7186 
7187 static void
7188 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7189 {
7190 	const struct hn_nvs_hdr *hdr;
7191 
7192 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7193 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7194 		return;
7195 	}
7196 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7197 
7198 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7199 		/* Useless; ignore */
7200 		return;
7201 	}
7202 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7203 }
7204 
7205 static void
7206 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7207     const struct vmbus_chanpkt_hdr *pkt)
7208 {
7209 	struct hn_nvs_sendctx *sndc;
7210 
7211 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7212 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7213 	    VMBUS_CHANPKT_DATALEN(pkt));
7214 	/*
7215 	 * NOTE:
7216 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7217 	 * its callback.
7218 	 */
7219 }
7220 
7221 static void
7222 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7223     const struct vmbus_chanpkt_hdr *pkthdr)
7224 {
7225 	const struct vmbus_chanpkt_rxbuf *pkt;
7226 	const struct hn_nvs_hdr *nvs_hdr;
7227 	int count, i, hlen;
7228 
7229 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7230 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7231 		return;
7232 	}
7233 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7234 
7235 	/* Make sure that this is a RNDIS message. */
7236 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7237 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7238 		    nvs_hdr->nvs_type);
7239 		return;
7240 	}
7241 
7242 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7243 	if (__predict_false(hlen < sizeof(*pkt))) {
7244 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7245 		return;
7246 	}
7247 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7248 
7249 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7250 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7251 		    pkt->cp_rxbuf_id);
7252 		return;
7253 	}
7254 
7255 	count = pkt->cp_rxbuf_cnt;
7256 	if (__predict_false(hlen <
7257 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7258 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7259 		return;
7260 	}
7261 
7262 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7263 	for (i = 0; i < count; ++i) {
7264 		int ofs, len;
7265 
7266 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7267 		len = pkt->cp_rxbuf[i].rb_len;
7268 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7269 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7270 			    "ofs %d, len %d\n", i, ofs, len);
7271 			continue;
7272 		}
7273 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7274 	}
7275 
7276 	/*
7277 	 * Ack the consumed RXBUF associated w/ this channel packet,
7278 	 * so that this RXBUF can be recycled by the hypervisor.
7279 	 */
7280 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7281 }
7282 
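/*
 * Acknowledge a consumed RXBUF to the host so that it can be recycled,
 * retrying a few times if the TX bufring is transiently full; the RXBUF
 * leaks if the ack never goes through.
 */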
7283 static void
7284 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7285     uint64_t tid)
7286 {
7287 	struct hn_nvs_rndis_ack ack;
7288 	int retries, error;
7289 
7290 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7291 	ack.nvs_status = HN_NVS_STATUS_OK;
7292 
7293 	retries = 0;
7294 again:
7295 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7296 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7297 	if (__predict_false(error == EAGAIN)) {
7298 		/*
7299 		 * NOTE:
7300 		 * This should _not_ happen in the real world, since the
7301 		 * consumption of the TX bufring from the TX path is
7302 		 * controlled.
7303 		 */
7304 		if (rxr->hn_ack_failed == 0)
7305 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7306 		rxr->hn_ack_failed++;
7307 		retries++;
7308 		if (retries < 10) {
7309 			DELAY(100);
7310 			goto again;
7311 		}
7312 		/* RXBUF leaks! */
7313 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7314 	}
7315 }
7316 
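/*
 * Per-channel interrupt handler.  Repeatedly receive channel packets,
 * growing the packet buffer on ENOBUFS, and dispatch them by type:
 * NVS completions, RXBUF (RNDIS data) and inband notifies.
 */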
7317 static void
7318 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7319 {
7320 	struct hn_rx_ring *rxr = xrxr;
7321 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7322 
7323 	for (;;) {
7324 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7325 		int error, pktlen;
7326 
7327 		pktlen = rxr->hn_pktbuf_len;
7328 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7329 		if (__predict_false(error == ENOBUFS)) {
7330 			void *nbuf;
7331 			int nlen;
7332 
7333 			/*
7334 			 * Expand channel packet buffer.
7335 			 *
7336 			 * XXX
7337 			 * Use M_WAITOK here, since allocation failure
7338 			 * is fatal.
7339 			 */
7340 			nlen = rxr->hn_pktbuf_len * 2;
7341 			while (nlen < pktlen)
7342 				nlen *= 2;
7343 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7344 
7345 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7346 			    rxr->hn_pktbuf_len, nlen);
7347 
7348 			free(rxr->hn_pktbuf, M_DEVBUF);
7349 			rxr->hn_pktbuf = nbuf;
7350 			rxr->hn_pktbuf_len = nlen;
7351 			/* Retry! */
7352 			continue;
7353 		} else if (__predict_false(error == EAGAIN)) {
7354 			/* No more channel packets; done! */
7355 			break;
7356 		}
7357 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7358 
7359 		switch (pkt->cph_type) {
7360 		case VMBUS_CHANPKT_TYPE_COMP:
7361 			hn_nvs_handle_comp(sc, chan, pkt);
7362 			break;
7363 
7364 		case VMBUS_CHANPKT_TYPE_RXBUF:
7365 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7366 			break;
7367 
7368 		case VMBUS_CHANPKT_TYPE_INBAND:
7369 			hn_nvs_handle_notify(sc, pkt);
7370 			break;
7371 
7372 		default:
7373 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7374 			    pkt->cph_type);
7375 			break;
7376 		}
7377 	}
7378 	hn_chan_rollup(rxr, rxr->hn_txr);
7379 }
7380 
7381 static void
7382 hn_sysinit(void *arg __unused)
7383 {
7384 	int i;
7385 
7386 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7387 
7388 #ifdef HN_IFSTART_SUPPORT
7389 	/*
7390 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7391 	 * mainly due to the IFF_DRV_OACTIVE flag.
7392 	 */
7393 	if (hn_xpnt_vf && hn_use_if_start) {
7394 		hn_use_if_start = 0;
7395 		printf("hn: transparent VF mode, if_transmit will be used, "
7396 		    "instead of if_start\n");
7397 	}
7398 #endif
7399 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7400 		printf("hn: invalid transparent VF attach routing "
7401 		    "wait timeout %d, reset to %d\n",
7402 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7403 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7404 	}
7405 
7406 	/*
7407 	 * Initialize VF map.
7408 	 */
7409 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7410 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7411 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7412 	    M_WAITOK | M_ZERO);
7413 
7414 	/*
7415 	 * Fix the # of TX taskqueues.
7416 	 */
7417 	if (hn_tx_taskq_cnt <= 0)
7418 		hn_tx_taskq_cnt = 1;
7419 	else if (hn_tx_taskq_cnt > mp_ncpus)
7420 		hn_tx_taskq_cnt = mp_ncpus;
7421 
7422 	/*
7423 	 * Fix the TX taskqueue mode.
7424 	 */
7425 	switch (hn_tx_taskq_mode) {
7426 	case HN_TX_TASKQ_M_INDEP:
7427 	case HN_TX_TASKQ_M_GLOBAL:
7428 	case HN_TX_TASKQ_M_EVTTQ:
7429 		break;
7430 	default:
7431 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7432 		break;
7433 	}
7434 
7435 	if (vm_guest != VM_GUEST_HV)
7436 		return;
7437 
7438 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7439 		return;
7440 
7441 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7442 	    M_DEVBUF, M_WAITOK);
7443 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7444 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7445 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7446 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7447 		    "hn tx%d", i);
7448 	}
7449 }
7450 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7451 
7452 static void
7453 hn_sysuninit(void *arg __unused)
7454 {
7455 
7456 	if (hn_tx_taskque != NULL) {
7457 		int i;
7458 
7459 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7460 			taskqueue_free(hn_tx_taskque[i]);
7461 		free(hn_tx_taskque, M_DEVBUF);
7462 	}
7463 
7464 	if (hn_vfmap != NULL)
7465 		free(hn_vfmap, M_DEVBUF);
7466 	rm_destroy(&hn_vfmap_lock);
7467 
7468 	counter_u64_free(hn_udpcs_fixup);
7469 }
7470 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7471