xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 0b37c1590418417c894529d371800dfac71ef887)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
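/*
 * Worst-case RNDIS packet message size: the RNDIS message header plus
 * per-packet-info for hash value, VLAN, LSOv2 and TX checksum.
 */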
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
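/*
 * Acquire the softc lock by polling sx_try_xlock(), busy-waiting
 * roughly 1ms (DELAY(1000)) between attempts.
 */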
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
389 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
390 static int			hn_rxfilter_config(struct hn_softc *);
391 static int			hn_rss_reconfig(struct hn_softc *);
392 static void			hn_rss_ind_fixup(struct hn_softc *);
393 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
394 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
395 				    int, const struct hn_rxinfo *);
396 static uint32_t			hn_rss_type_fromndis(uint32_t);
397 static uint32_t			hn_rss_type_tondis(uint32_t);
398 
399 static int			hn_tx_ring_create(struct hn_softc *, int);
400 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
401 static int			hn_create_tx_data(struct hn_softc *, int);
402 static void			hn_fixup_tx_data(struct hn_softc *);
403 static void			hn_fixup_rx_data(struct hn_softc *);
404 static void			hn_destroy_tx_data(struct hn_softc *);
405 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void			hn_txdesc_gc(struct hn_tx_ring *,
407 				    struct hn_txdesc *);
408 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *, struct mbuf **);
410 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 				    struct hn_txdesc *);
412 static void			hn_set_chim_size(struct hn_softc *, int);
413 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
415 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void			hn_resume_tx(struct hn_softc *, int);
417 static void			hn_set_txagg(struct hn_softc *);
418 static void			*hn_try_txagg(struct ifnet *,
419 				    struct hn_tx_ring *, struct hn_txdesc *,
420 				    int);
421 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
423 				    struct hn_softc *, struct vmbus_channel *,
424 				    const void *, int);
425 static int			hn_txpkt_sglist(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_txpkt_chim(struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static int			hn_xmit(struct hn_tx_ring *, int);
430 static void			hn_xmit_taskfunc(void *, int);
431 static void			hn_xmit_txeof(struct hn_tx_ring *);
432 static void			hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int			hn_start_locked(struct hn_tx_ring *, int);
435 static void			hn_start_taskfunc(void *, int);
436 static void			hn_start_txeof(struct hn_tx_ring *);
437 static void			hn_start_txeof_taskfunc(void *, int);
438 #endif
439 
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441     "Hyper-V network interface");
442 
443 /* Trust tcp segment verification on host side. */
444 static int			hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446     &hn_trust_hosttcp, 0,
447     "Trust tcp segement verification on host side, "
448     "when csum info is missing (global setting)");
449 
450 /* Trust udp datagram verification on host side. */
451 static int			hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453     &hn_trust_hostudp, 0,
454     "Trust udp datagram verification on host side, "
455     "when csum info is missing (global setting)");
456 
457 /* Trust ip packet verification on host side. */
458 static int			hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460     &hn_trust_hostip, 0,
461     "Trust ip packet verification on host side, "
462     "when csum info is missing (global setting)");
463 
464 /*
465  * Offload UDP/IPv4 checksum.
466  */
467 static int			hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 
471 /*
472  * Offload UDP/IPv6 checksum.
473  */
474 static int			hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 
478 /* Stats. */
479 static counter_u64_t		hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481     &hn_udpcs_fixup, "# of UDP checksum fixup");
482 
483 /*
484  * See hn_set_hlen().
485  *
486  * This value is for Azure.  For Hyper-V, set this above
487  * 65536 to disable UDP datagram checksum fixup.
488  */
489 static int			hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
492 
493 /* Limit TSO burst size */
494 static int			hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496     &hn_tso_maxlen, 0, "TSO burst limit");
497 
498 /* Limit chimney send size */
499 static int			hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502 
503 /* Limit the size of packet for direct transmission */
504 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507 
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513     &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516 
517 static int			hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520 
521 #define HN_TX_TASKQ_M_INDEP	0
522 #define HN_TX_TASKQ_M_GLOBAL	1
523 #define HN_TX_TASKQ_M_EVTTQ	2
524 
525 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529 
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int			hn_use_txdesc_bufring = 0;
532 #else
533 static int			hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537 
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int			hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542     &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544 
545 /* # of channels to use */
546 static int			hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548     &hn_chan_cnt, 0,
549     "# of channels to use; each channel has one RX ring and one TX ring");
550 
551 /* # of transmit rings to use */
552 static int			hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554     &hn_tx_ring_cnt, 0, "# of TX rings to use");
555 
556 /* Software TX ring depth */
557 static int			hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560 
561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int			hn_lro_mbufq_depth = 0;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567 
568 /* Packet transmission aggregation size limit */
569 static int			hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572 
573 /* Packet transmission aggregation count limit */
574 static int			hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 
578 /* VF list */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
580     0, 0, hn_vflist_sysctl, "A", "VF list");
581 
582 /* VF mapping */
583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
584     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585 
586 /* Transparent VF */
587 static int			hn_xpnt_vf = 1;
588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
589     &hn_xpnt_vf, 0, "Transparent VF mode");
590 
591 /* Accurate BPF support for Transparent VF */
592 static int			hn_xpnt_vf_accbpf = 0;
593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
594     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
595 
596 /* Extra wait for the transparent VF attach routine; unit: seconds. */
597 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
599     &hn_xpnt_vf_attwait, 0,
600     "Extra wait for transparent VF attach routing; unit: seconds");
601 
602 static u_int			hn_cpu_index;	/* next CPU for channel */
603 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
604 
605 static struct rmlock		hn_vfmap_lock;
606 static int			hn_vfmap_size;
607 static struct ifnet		**hn_vfmap;
608 
609 #ifndef RSS
610 static const uint8_t
611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
612 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
613 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
614 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
615 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
616 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
617 };
618 #endif	/* !RSS */
619 
620 static const struct hyperv_guid	hn_guid = {
621 	.hv_guid = {
622 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
623 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
624 };
625 
626 static device_method_t hn_methods[] = {
627 	/* Device interface */
628 	DEVMETHOD(device_probe,		hn_probe),
629 	DEVMETHOD(device_attach,	hn_attach),
630 	DEVMETHOD(device_detach,	hn_detach),
631 	DEVMETHOD(device_shutdown,	hn_shutdown),
632 	DEVMETHOD_END
633 };
634 
635 static driver_t hn_driver = {
636 	"hn",
637 	hn_methods,
638 	sizeof(struct hn_softc)
639 };
640 
641 static devclass_t hn_devclass;
642 
643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
647 #if __FreeBSD_version >= 1100099
648 static void
649 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
650 {
651 	int i;
652 
653 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
654 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
655 }
656 #endif
657 
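/*
 * Send an RNDIS data packet described by the TX ring's GPA (guest
 * physical address) list, i.e. the non-chimney data path.
 */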
658 static int
659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661 
662 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
664 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667 
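/*
 * Send an RNDIS data packet that has been copied into a chimney
 * sending buffer slot; only the slot index and size go to the host.
 */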
668 static int
669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 	struct hn_nvs_rndis rndis;
672 
673 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size > 0, ("invalid rndis chim txd"));
675 
676 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 	rndis.nvs_chim_idx = txd->chim_index;
679 	rndis.nvs_chim_sz = txd->chim_size;
680 
681 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 	    &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684 
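/*
 * Allocate a chimney sending buffer slot: scan the allocation bitmap
 * for a clear bit and claim it with an atomic test-and-set.  Returns
 * HN_NVS_CHIM_IDX_INVALID if no slot is available.
 */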
685 static __inline uint32_t
686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 	u_long *bmap = sc->hn_chim_bmap;
690 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691 
692 	for (i = 0; i < bmap_cnt; ++i) {
693 		int idx;
694 
695 		idx = ffsl(~bmap[i]);
696 		if (idx == 0)
697 			continue;
698 
699 		--idx; /* ffsl is 1-based */
700 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 		    ("invalid i %d and idx %d", i, idx));
702 
703 		if (atomic_testandset_long(&bmap[i], idx))
704 			continue;
705 
706 		ret = i * LONG_BIT + idx;
707 		break;
708 	}
709 	return (ret);
710 }
711 
712 static __inline void
713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 	u_long mask;
716 	uint32_t idx;
717 
718 	idx = chim_idx / LONG_BIT;
719 	KASSERT(idx < sc->hn_chim_bmap_cnt,
720 	    ("invalid chimney index 0x%x", chim_idx));
721 
722 	mask = 1UL << (chim_idx % LONG_BIT);
723 	KASSERT(sc->hn_chim_bmap[idx] & mask,
724 	    ("index bitmap 0x%lx, chimney index %u, "
725 	     "bitmap idx %d, bitmask 0x%lx",
726 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727 
728 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730 
731 #if defined(INET6) || defined(INET)
732 
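/*
 * Make sure the first 'len' bytes of the mbuf chain are contiguous.
 * m_pullup() frees the chain on failure, in which case this macro
 * returns NULL from the calling function.
 */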
733 #define PULLUP_HDR(m, len)				\
734 do {							\
735 	if (__predict_false((m)->m_len < (len))) {	\
736 		(m) = m_pullup((m), (len));		\
737 		if ((m) == NULL)			\
738 			return (NULL);			\
739 	}						\
740 } while (0)
741 
742 /*
743  * NOTE: If this function failed, the m_head would be freed.
744  */
745 static __inline struct mbuf *
746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 	struct ether_vlan_header *evl;
749 	struct tcphdr *th;
750 	int ehlen;
751 
752 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753 
754 	PULLUP_HDR(m_head, sizeof(*evl));
755 	evl = mtod(m_head, struct ether_vlan_header *);
756 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 	else
759 		ehlen = ETHER_HDR_LEN;
760 	m_head->m_pkthdr.l2hlen = ehlen;
761 
762 #ifdef INET
763 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 		struct ip *ip;
765 		int iphlen;
766 
767 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 		ip = mtodo(m_head, ehlen);
769 		iphlen = ip->ip_hl << 2;
770 		m_head->m_pkthdr.l3hlen = iphlen;
771 
772 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 		th = mtodo(m_head, ehlen + iphlen);
774 
775 		ip->ip_len = 0;
776 		ip->ip_sum = 0;
777 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 	}
780 #endif
781 #if defined(INET6) && defined(INET)
782 	else
783 #endif
784 #ifdef INET6
785 	{
786 		struct ip6_hdr *ip6;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 		ip6 = mtodo(m_head, ehlen);
790 		if (ip6->ip6_nxt != IPPROTO_TCP) {
791 			m_freem(m_head);
792 			return (NULL);
793 		}
794 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795 
796 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 		th = mtodo(m_head, ehlen + sizeof(*ip6));
798 
799 		ip6->ip6_plen = 0;
800 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 	}
802 #endif
803 	return (m_head);
804 }
805 
806 /*
807  * NOTE: If this function failed, the m_head would be freed.
808  */
809 static __inline struct mbuf *
810 hn_set_hlen(struct mbuf *m_head)
811 {
812 	const struct ether_vlan_header *evl;
813 	int ehlen;
814 
815 	PULLUP_HDR(m_head, sizeof(*evl));
816 	evl = mtod(m_head, const struct ether_vlan_header *);
817 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
818 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
819 	else
820 		ehlen = ETHER_HDR_LEN;
821 	m_head->m_pkthdr.l2hlen = ehlen;
822 
823 #ifdef INET
824 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
825 		const struct ip *ip;
826 		int iphlen;
827 
828 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
829 		ip = mtodo(m_head, ehlen);
830 		iphlen = ip->ip_hl << 2;
831 		m_head->m_pkthdr.l3hlen = iphlen;
832 
833 		/*
834 		 * UDP checksum offload does not work in Azure if the
835 		 * following conditions are met:
836 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
837 		 * - IP_DF is not set in the IP hdr.
838 		 *
839 		 * Fall back to software checksum for these UDP datagrams.
840 		 */
841 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
842 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
843 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
844 			uint16_t off = ehlen + iphlen;
845 
846 			counter_u64_add(hn_udpcs_fixup, 1);
847 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
848 			*(uint16_t *)(m_head->m_data + off +
849                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
850 			    m_head, m_head->m_pkthdr.len, off);
851 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
852 		}
853 	}
854 #endif
855 #if defined(INET6) && defined(INET)
856 	else
857 #endif
858 #ifdef INET6
859 	{
860 		const struct ip6_hdr *ip6;
861 
862 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
863 		ip6 = mtodo(m_head, ehlen);
864 		if (ip6->ip6_nxt != IPPROTO_TCP &&
865 		    ip6->ip6_nxt != IPPROTO_UDP) {
866 			m_freem(m_head);
867 			return (NULL);
868 		}
869 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
870 	}
871 #endif
872 	return (m_head);
873 }
874 
875 /*
876  * NOTE: If this function failed, the m_head would be freed.
877  */
878 static __inline struct mbuf *
879 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
880 {
881 	const struct tcphdr *th;
882 	int ehlen, iphlen;
883 
884 	*tcpsyn = 0;
885 	ehlen = m_head->m_pkthdr.l2hlen;
886 	iphlen = m_head->m_pkthdr.l3hlen;
887 
888 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
889 	th = mtodo(m_head, ehlen + iphlen);
890 	if (th->th_flags & TH_SYN)
891 		*tcpsyn = 1;
892 	return (m_head);
893 }
894 
895 #undef PULLUP_HDR
896 
897 #endif	/* INET6 || INET */
898 
899 static int
900 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
901 {
902 	int error = 0;
903 
904 	HN_LOCK_ASSERT(sc);
905 
906 	if (sc->hn_rx_filter != filter) {
907 		error = hn_rndis_set_rxfilter(sc, filter);
908 		if (!error)
909 			sc->hn_rx_filter = filter;
910 	}
911 	return (error);
912 }
913 
914 static int
915 hn_rxfilter_config(struct hn_softc *sc)
916 {
917 	struct ifnet *ifp = sc->hn_ifp;
918 	uint32_t filter;
919 
920 	HN_LOCK_ASSERT(sc);
921 
922 	/*
923 	 * If the non-transparent mode VF is activated, we don't know how
924 	 * its RX filter is configured, so stick the synthetic device in
925 	 * promiscuous mode.
926 	 */
927 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
928 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
929 	} else {
930 		filter = NDIS_PACKET_TYPE_DIRECTED;
931 		if (ifp->if_flags & IFF_BROADCAST)
932 			filter |= NDIS_PACKET_TYPE_BROADCAST;
933 		/* TODO: support multicast list */
934 		if ((ifp->if_flags & IFF_ALLMULTI) ||
935 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
936 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
937 	}
938 	return (hn_set_rxfilter(sc, filter));
939 }
940 
941 static void
942 hn_set_txagg(struct hn_softc *sc)
943 {
944 	uint32_t size, pkts;
945 	int i;
946 
947 	/*
948 	 * Setup aggregation size.
949 	 */
950 	if (sc->hn_agg_size < 0)
951 		size = UINT32_MAX;
952 	else
953 		size = sc->hn_agg_size;
954 
955 	if (sc->hn_rndis_agg_size < size)
956 		size = sc->hn_rndis_agg_size;
957 
958 	/* NOTE: We only aggregate packets using chimney sending buffers. */
959 	if (size > (uint32_t)sc->hn_chim_szmax)
960 		size = sc->hn_chim_szmax;
961 
962 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
963 		/* Disable */
964 		size = 0;
965 		pkts = 0;
966 		goto done;
967 	}
968 
969 	/* NOTE: Type of the per TX ring setting is 'int'. */
970 	if (size > INT_MAX)
971 		size = INT_MAX;
972 
973 	/*
974 	 * Setup aggregation packet count.
975 	 */
976 	if (sc->hn_agg_pkts < 0)
977 		pkts = UINT32_MAX;
978 	else
979 		pkts = sc->hn_agg_pkts;
980 
981 	if (sc->hn_rndis_agg_pkts < pkts)
982 		pkts = sc->hn_rndis_agg_pkts;
983 
984 	if (pkts <= 1) {
985 		/* Disable */
986 		size = 0;
987 		pkts = 0;
988 		goto done;
989 	}
990 
991 	/* NOTE: Type of the per TX ring setting is 'short'. */
992 	if (pkts > SHRT_MAX)
993 		pkts = SHRT_MAX;
994 
995 done:
996 	/* NOTE: Type of the per TX ring setting is 'short'. */
997 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
998 		/* Disable */
999 		size = 0;
1000 		pkts = 0;
1001 	}
1002 
1003 	if (bootverbose) {
1004 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1005 		    size, pkts, sc->hn_rndis_agg_align);
1006 	}
1007 
1008 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1009 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1010 
1011 		mtx_lock(&txr->hn_tx_lock);
1012 		txr->hn_agg_szmax = size;
1013 		txr->hn_agg_pktmax = pkts;
1014 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1015 		mtx_unlock(&txr->hn_tx_lock);
1016 	}
1017 }
1018 
1019 static int
1020 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1021 {
1022 
1023 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1024 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1025 		return txr->hn_txdesc_cnt;
1026 	return hn_tx_swq_depth;
1027 }
1028 
1029 static int
1030 hn_rss_reconfig(struct hn_softc *sc)
1031 {
1032 	int error;
1033 
1034 	HN_LOCK_ASSERT(sc);
1035 
1036 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1037 		return (ENXIO);
1038 
1039 	/*
1040 	 * Disable RSS first.
1041 	 *
1042 	 * NOTE:
1043 	 * Direct reconfiguration by setting the UNCHG flags does
1044 	 * _not_ work properly.
1045 	 */
1046 	if (bootverbose)
1047 		if_printf(sc->hn_ifp, "disable RSS\n");
1048 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1049 	if (error) {
1050 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1051 		return (error);
1052 	}
1053 
1054 	/*
1055 	 * Reenable the RSS w/ the updated RSS key or indirect
1056 	 * table.
1057 	 */
1058 	if (bootverbose)
1059 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1060 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1061 	if (error) {
1062 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1063 		return (error);
1064 	}
1065 	return (0);
1066 }
1067 
1068 static void
1069 hn_rss_ind_fixup(struct hn_softc *sc)
1070 {
1071 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1072 	int i, nchan;
1073 
1074 	nchan = sc->hn_rx_ring_inuse;
1075 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1076 
1077 	/*
1078 	 * Check indirect table to make sure that all channels in it
1079 	 * can be used.
1080 	 */
1081 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1082 		if (rss->rss_ind[i] >= nchan) {
1083 			if_printf(sc->hn_ifp,
1084 			    "RSS indirect table %d fixup: %u -> %d\n",
1085 			    i, rss->rss_ind[i], nchan - 1);
1086 			rss->rss_ind[i] = nchan - 1;
1087 		}
1088 	}
1089 }
1090 
1091 static int
1092 hn_ifmedia_upd(struct ifnet *ifp __unused)
1093 {
1094 
1095 	return EOPNOTSUPP;
1096 }
1097 
1098 static void
1099 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1100 {
1101 	struct hn_softc *sc = ifp->if_softc;
1102 
1103 	ifmr->ifm_status = IFM_AVALID;
1104 	ifmr->ifm_active = IFM_ETHER;
1105 
1106 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1107 		ifmr->ifm_active |= IFM_NONE;
1108 		return;
1109 	}
1110 	ifmr->ifm_status |= IFM_ACTIVE;
1111 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1112 }
1113 
1114 static void
1115 hn_rxvf_set_task(void *xarg, int pending __unused)
1116 {
1117 	struct hn_rxvf_setarg *arg = xarg;
1118 
1119 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1120 }
1121 
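/*
 * Point every RX ring at the given VF ifnet (or NULL).  For rings
 * currently in use, the update runs as a task on the ring's channel,
 * so it is serialized with RX processing.
 */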
1122 static void
1123 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1124 {
1125 	struct hn_rx_ring *rxr;
1126 	struct hn_rxvf_setarg arg;
1127 	struct task task;
1128 	int i;
1129 
1130 	HN_LOCK_ASSERT(sc);
1131 
1132 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1133 
1134 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1135 		rxr = &sc->hn_rx_ring[i];
1136 
1137 		if (i < sc->hn_rx_ring_inuse) {
1138 			arg.rxr = rxr;
1139 			arg.vf_ifp = vf_ifp;
1140 			vmbus_chan_run_task(rxr->hn_chan, &task);
1141 		} else {
1142 			rxr->hn_rxvf_ifp = vf_ifp;
1143 		}
1144 	}
1145 }
1146 
1147 static bool
1148 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1149 {
1150 	const struct ifnet *hn_ifp;
1151 
1152 	hn_ifp = sc->hn_ifp;
1153 
1154 	if (ifp == hn_ifp)
1155 		return (false);
1156 
1157 	if (ifp->if_alloctype != IFT_ETHER)
1158 		return (false);
1159 
1160 	/* Ignore lagg/vlan interfaces */
1161 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1162 	    strcmp(ifp->if_dname, "vlan") == 0)
1163 		return (false);
1164 
1165 	/*
1166 	 * During detach events ifp->if_addr might be NULL.
1167 	 * Make sure the bcmp() below doesn't panic on that:
1168 	 */
1169 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1170 		return (false);
1171 
1172 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1173 		return (false);
1174 
1175 	return (true);
1176 }
1177 
1178 static void
1179 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1180 {
1181 	struct ifnet *hn_ifp;
1182 
1183 	HN_LOCK(sc);
1184 
1185 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1186 		goto out;
1187 
1188 	if (!hn_ismyvf(sc, ifp))
1189 		goto out;
1190 	hn_ifp = sc->hn_ifp;
1191 
1192 	if (rxvf) {
1193 		if (sc->hn_flags & HN_FLAG_RXVF)
1194 			goto out;
1195 
1196 		sc->hn_flags |= HN_FLAG_RXVF;
1197 		hn_rxfilter_config(sc);
1198 	} else {
1199 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1200 			goto out;
1201 
1202 		sc->hn_flags &= ~HN_FLAG_RXVF;
1203 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1204 			hn_rxfilter_config(sc);
1205 		else
1206 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1207 	}
1208 
1209 	hn_nvs_set_datapath(sc,
1210 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1211 
1212 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1213 
1214 	if (rxvf) {
1215 		hn_vf_rss_fixup(sc, true);
1216 		hn_suspend_mgmt(sc);
1217 		sc->hn_link_flags &=
1218 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1219 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1220 	} else {
1221 		hn_vf_rss_restore(sc);
1222 		hn_resume_mgmt(sc);
1223 	}
1224 
1225 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1226 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1227 
1228 	if (bootverbose) {
1229 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1230 		    rxvf ? "to" : "from", ifp->if_xname);
1231 	}
1232 out:
1233 	HN_UNLOCK(sc);
1234 }
1235 
1236 static void
1237 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1238 {
1239 
1240 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1241 		return;
1242 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1243 }
1244 
1245 static void
1246 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1247 {
1248 
1249 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1250 }
1251 
1252 static int
1253 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1254 {
1255 	struct ifnet *ifp, *vf_ifp;
1256 	uint64_t tmp;
1257 	int error;
1258 
1259 	HN_LOCK_ASSERT(sc);
1260 	ifp = sc->hn_ifp;
1261 	vf_ifp = sc->hn_vf_ifp;
1262 
1263 	/*
1264 	 * Fix up requested capabilities w/ supported capabilities,
1265 	 * since the supported capabilities could have been changed.
1266 	 */
1267 	ifr->ifr_reqcap &= ifp->if_capabilities;
1268 	/* Pass SIOCSIFCAP to VF. */
1269 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1270 
1271 	/*
1272 	 * NOTE:
1273 	 * The error will be propagated to the callers, however, it
1274 	 * is _not_ useful here.
1275 	 */
1276 
1277 	/*
1278 	 * Merge VF's enabled capabilities.
1279 	 */
1280 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1281 
1282 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1283 	if (ifp->if_capenable & IFCAP_TXCSUM)
1284 		ifp->if_hwassist |= tmp;
1285 	else
1286 		ifp->if_hwassist &= ~tmp;
1287 
1288 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1289 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1290 		ifp->if_hwassist |= tmp;
1291 	else
1292 		ifp->if_hwassist &= ~tmp;
1293 
1294 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1295 	if (ifp->if_capenable & IFCAP_TSO4)
1296 		ifp->if_hwassist |= tmp;
1297 	else
1298 		ifp->if_hwassist &= ~tmp;
1299 
1300 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1301 	if (ifp->if_capenable & IFCAP_TSO6)
1302 		ifp->if_hwassist |= tmp;
1303 	else
1304 		ifp->if_hwassist &= ~tmp;
1305 
1306 	return (error);
1307 }
1308 
1309 static int
1310 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1311 {
1312 	struct ifnet *vf_ifp;
1313 	struct ifreq ifr;
1314 
1315 	HN_LOCK_ASSERT(sc);
1316 	vf_ifp = sc->hn_vf_ifp;
1317 
1318 	memset(&ifr, 0, sizeof(ifr));
1319 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1320 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1321 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1322 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1323 }
1324 
1325 static void
1326 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1327 {
1328 	struct ifnet *ifp = sc->hn_ifp;
1329 	int allmulti = 0;
1330 
1331 	HN_LOCK_ASSERT(sc);
1332 
1333 	/* XXX vlan(4) style mcast addr maintenance */
1334 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1335 		allmulti = IFF_ALLMULTI;
1336 
1337 	/* Always set the VF's if_flags */
1338 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1339 }
1340 
1341 static void
1342 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1343 {
1344 	struct rm_priotracker pt;
1345 	struct ifnet *hn_ifp = NULL;
1346 	struct mbuf *mn;
1347 
1348 	/*
1349 	 * XXX racy, if hn(4) is ever detached.
1350 	 */
1351 	rm_rlock(&hn_vfmap_lock, &pt);
1352 	if (vf_ifp->if_index < hn_vfmap_size)
1353 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1354 	rm_runlock(&hn_vfmap_lock, &pt);
1355 
1356 	if (hn_ifp != NULL) {
1357 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1358 			/*
1359 			 * Allow tapping on the VF.
1360 			 */
1361 			ETHER_BPF_MTAP(vf_ifp, mn);
1362 
1363 			/*
1364 			 * Update VF stats.
1365 			 */
1366 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1367 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1368 				    mn->m_pkthdr.len);
1369 			}
1370 			/*
1371 			 * XXX IFCOUNTER_IMCAST
1372 			 * This stat updating is kinda invasive, since it
1373 			 * requires two checks on the mbuf: the length check
1374 			 * and the ethernet header check.  As of this writing,
1375 			 * all multicast packets go directly to hn(4), which
1376 			 * makes imcast stat updating in the VF a wasted effort.
1377 			 */
1378 
1379 			/*
1380 			 * Fix up rcvif and increase hn(4)'s ipackets.
1381 			 */
1382 			mn->m_pkthdr.rcvif = hn_ifp;
1383 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1384 		}
1385 		/*
1386 		 * Go through hn(4)'s if_input.
1387 		 */
1388 		hn_ifp->if_input(hn_ifp, m);
1389 	} else {
1390 		/*
1391 		 * In the middle of the transition; free this
1392 		 * mbuf chain.
1393 		 */
1394 		while (m != NULL) {
1395 			mn = m->m_nextpkt;
1396 			m->m_nextpkt = NULL;
1397 			m_freem(m);
1398 			m = mn;
1399 		}
1400 	}
1401 }
1402 
1403 static void
1404 hn_mtu_change_fixup(struct hn_softc *sc)
1405 {
1406 	struct ifnet *ifp;
1407 
1408 	HN_LOCK_ASSERT(sc);
1409 	ifp = sc->hn_ifp;
1410 
1411 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1412 #if __FreeBSD_version >= 1100099
1413 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1414 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1415 #endif
1416 }
1417 
1418 static uint32_t
1419 hn_rss_type_fromndis(uint32_t rss_hash)
1420 {
1421 	uint32_t types = 0;
1422 
1423 	if (rss_hash & NDIS_HASH_IPV4)
1424 		types |= RSS_TYPE_IPV4;
1425 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1426 		types |= RSS_TYPE_TCP_IPV4;
1427 	if (rss_hash & NDIS_HASH_IPV6)
1428 		types |= RSS_TYPE_IPV6;
1429 	if (rss_hash & NDIS_HASH_IPV6_EX)
1430 		types |= RSS_TYPE_IPV6_EX;
1431 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1432 		types |= RSS_TYPE_TCP_IPV6;
1433 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1434 		types |= RSS_TYPE_TCP_IPV6_EX;
1435 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1436 		types |= RSS_TYPE_UDP_IPV4;
1437 	return (types);
1438 }
1439 
1440 static uint32_t
1441 hn_rss_type_tondis(uint32_t types)
1442 {
1443 	uint32_t rss_hash = 0;
1444 
1445 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1446 	    ("UDP6 and UDP6EX are not supported"));
1447 
1448 	if (types & RSS_TYPE_IPV4)
1449 		rss_hash |= NDIS_HASH_IPV4;
1450 	if (types & RSS_TYPE_TCP_IPV4)
1451 		rss_hash |= NDIS_HASH_TCP_IPV4;
1452 	if (types & RSS_TYPE_IPV6)
1453 		rss_hash |= NDIS_HASH_IPV6;
1454 	if (types & RSS_TYPE_IPV6_EX)
1455 		rss_hash |= NDIS_HASH_IPV6_EX;
1456 	if (types & RSS_TYPE_TCP_IPV6)
1457 		rss_hash |= NDIS_HASH_TCP_IPV6;
1458 	if (types & RSS_TYPE_TCP_IPV6_EX)
1459 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1460 	if (types & RSS_TYPE_UDP_IPV4)
1461 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1462 	return (rss_hash);
1463 }
1464 
1465 static void
1466 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1467 {
1468 	int i;
1469 
1470 	HN_LOCK_ASSERT(sc);
1471 
1472 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1473 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1474 }
1475 
1476 static void
1477 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1478 {
1479 	struct ifnet *ifp, *vf_ifp;
1480 	struct ifrsshash ifrh;
1481 	struct ifrsskey ifrk;
1482 	int error;
1483 	uint32_t my_types, diff_types, mbuf_types = 0;
1484 
1485 	HN_LOCK_ASSERT(sc);
1486 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1487 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1488 
1489 	if (sc->hn_rx_ring_inuse == 1) {
1490 		/* No RSS on synthetic parts; done. */
1491 		return;
1492 	}
1493 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1494 		/* Synthetic parts do not support Toeplitz; done. */
1495 		return;
1496 	}
1497 
1498 	ifp = sc->hn_ifp;
1499 	vf_ifp = sc->hn_vf_ifp;
1500 
1501 	/*
1502 	 * Extract the VF's RSS key.  Only the 40-byte Toeplitz key is
1503 	 * supported.
1504 	 */
1505 	memset(&ifrk, 0, sizeof(ifrk));
1506 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1507 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1508 	if (error) {
1509 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1510 		    vf_ifp->if_xname, error);
1511 		goto done;
1512 	}
1513 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1514 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1515 		    vf_ifp->if_xname, ifrk.ifrk_func);
1516 		goto done;
1517 	}
1518 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1519 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1520 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1521 		goto done;
1522 	}
1523 
1524 	/*
1525 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1526 	 */
1527 	memset(&ifrh, 0, sizeof(ifrh));
1528 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1529 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1530 	if (error) {
1531 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1532 		    vf_ifp->if_xname, error);
1533 		goto done;
1534 	}
1535 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1536 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1537 		    vf_ifp->if_xname, ifrh.ifrh_func);
1538 		goto done;
1539 	}
1540 
1541 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1542 	if ((ifrh.ifrh_types & my_types) == 0) {
1543 		/* This would effectively disable RSS; ignore it. */
1544 		if_printf(ifp, "%s intersection of RSS types failed.  "
1545 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1546 		    ifrh.ifrh_types, my_types);
1547 		goto done;
1548 	}
1549 
1550 	diff_types = my_types ^ ifrh.ifrh_types;
1551 	my_types &= ifrh.ifrh_types;
1552 	mbuf_types = my_types;
1553 
1554 	/*
1555 	 * Detect RSS hash value/type conflicts.
1556 	 *
1557 	 * NOTE:
1558 	 * We don't disable the hash type, but stop delivering the hash
1559 	 * value/type through mbufs on the RX path.
1560 	 *
1561 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1562 	 * hash is delivered with the TCP_IPV4 hash type.  This means if
1563 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1564 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1565 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1566 	 * here.
1567 	 */
1568 	if ((my_types & RSS_TYPE_IPV4) &&
1569 	    (diff_types & ifrh.ifrh_types &
1570 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1571 		/* Conflict; disable IPV4 hash type/value delivery. */
1572 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1573 		mbuf_types &= ~RSS_TYPE_IPV4;
1574 	}
1575 	if ((my_types & RSS_TYPE_IPV6) &&
1576 	    (diff_types & ifrh.ifrh_types &
1577 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1578 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1579 	      RSS_TYPE_IPV6_EX))) {
1580 		/* Conflict; disable IPV6 hash type/value delivery. */
1581 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1582 		mbuf_types &= ~RSS_TYPE_IPV6;
1583 	}
1584 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1585 	    (diff_types & ifrh.ifrh_types &
1586 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 	      RSS_TYPE_IPV6))) {
1589 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1590 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1591 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1592 	}
1593 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1594 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1595 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1596 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1597 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1598 	}
1599 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1600 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1601 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1602 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1603 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1604 	}
1605 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1606 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1607 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1608 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1609 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1610 	}
1611 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1612 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1613 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1614 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1615 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1616 	}
1617 
1618 	/*
1619 	 * Indirect table does not matter.
1620 	 */
1621 
1622 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1623 	    hn_rss_type_tondis(my_types);
1624 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1625 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1626 
1627 	if (reconf) {
1628 		error = hn_rss_reconfig(sc);
1629 		if (error) {
1630 			/* XXX roll-back? */
1631 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1632 			/* XXX keep going. */
1633 		}
1634 	}
1635 done:
1636 	/* Hash deliverability for mbufs. */
1637 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1638 }
1639 
1640 static void
1641 hn_vf_rss_restore(struct hn_softc *sc)
1642 {
1643 
1644 	HN_LOCK_ASSERT(sc);
1645 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1646 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1647 
1648 	if (sc->hn_rx_ring_inuse == 1)
1649 		goto done;
1650 
1651 	/*
1652 	 * Restore hash types.  Key does _not_ matter.
1653 	 */
1654 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1655 		int error;
1656 
1657 		sc->hn_rss_hash = sc->hn_rss_hcap;
1658 		error = hn_rss_reconfig(sc);
1659 		if (error) {
1660 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1661 			    error);
1662 			/* XXX keep going. */
1663 		}
1664 	}
1665 done:
1666 	/* Hash deliverability for mbufs. */
1667 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1668 }
1669 
1670 static void
1671 hn_xpnt_vf_setready(struct hn_softc *sc)
1672 {
1673 	struct ifnet *ifp, *vf_ifp;
1674 	struct ifreq ifr;
1675 
1676 	HN_LOCK_ASSERT(sc);
1677 	ifp = sc->hn_ifp;
1678 	vf_ifp = sc->hn_vf_ifp;
1679 
1680 	/*
1681 	 * Mark the VF ready.
1682 	 */
1683 	sc->hn_vf_rdytick = 0;
1684 
1685 	/*
1686 	 * Save information for restoration.
1687 	 */
1688 	sc->hn_saved_caps = ifp->if_capabilities;
1689 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1690 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1691 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1692 
1693 	/*
1694 	 * Intersect supported/enabled capabilities.
1695 	 *
1696 	 * NOTE:
1697 	 * if_hwassist is not changed here.
1698 	 */
1699 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1700 	ifp->if_capenable &= ifp->if_capabilities;
1701 
1702 	/*
1703 	 * Fix TSO settings.
1704 	 */
1705 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1706 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1707 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1708 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1709 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1710 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1711 
1712 	/*
1713 	 * Change VF's enabled capabilities.
1714 	 */
1715 	memset(&ifr, 0, sizeof(ifr));
1716 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1717 	ifr.ifr_reqcap = ifp->if_capenable;
1718 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1719 
1720 	if (ifp->if_mtu != ETHERMTU) {
1721 		int error;
1722 
1723 		/*
1724 		 * Change VF's MTU.
1725 		 */
1726 		memset(&ifr, 0, sizeof(ifr));
1727 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1728 		ifr.ifr_mtu = ifp->if_mtu;
1729 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1730 		if (error) {
1731 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1732 			    vf_ifp->if_xname, ifp->if_mtu);
1733 			if (ifp->if_mtu > ETHERMTU) {
1734 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1735 
1736 				/*
1737 				 * XXX
1738 				 * No need to adjust the synthetic parts' MTU;
1739 				 * failure of the adjustment will cause us
1740 				 * infinite headache.
1741 				 */
1742 				ifp->if_mtu = ETHERMTU;
1743 				hn_mtu_change_fixup(sc);
1744 			}
1745 		}
1746 	}
1747 }
1748 
1749 static bool
1750 hn_xpnt_vf_isready(struct hn_softc *sc)
1751 {
1752 
1753 	HN_LOCK_ASSERT(sc);
1754 
1755 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1756 		return (false);
1757 
1758 	if (sc->hn_vf_rdytick == 0)
1759 		return (true);
1760 
1761 	if (sc->hn_vf_rdytick > ticks)
1762 		return (false);
1763 
1764 	/* Mark VF as ready. */
1765 	hn_xpnt_vf_setready(sc);
1766 	return (true);
1767 }
1768 
1769 static void
1770 hn_xpnt_vf_setenable(struct hn_softc *sc)
1771 {
1772 	int i;
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1777 	rm_wlock(&sc->hn_vf_lock);
1778 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1779 	rm_wunlock(&sc->hn_vf_lock);
1780 
1781 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1782 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1783 }
1784 
1785 static void
1786 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1787 {
1788 	int i;
1789 
1790 	HN_LOCK_ASSERT(sc);
1791 
1792 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1793 	rm_wlock(&sc->hn_vf_lock);
1794 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1795 	if (clear_vf)
1796 		sc->hn_vf_ifp = NULL;
1797 	rm_wunlock(&sc->hn_vf_lock);
1798 
1799 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1800 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1801 }
1802 
1803 static void
1804 hn_xpnt_vf_init(struct hn_softc *sc)
1805 {
1806 	int error;
1807 
1808 	HN_LOCK_ASSERT(sc);
1809 
1810 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1811 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1812 
1813 	if (bootverbose) {
1814 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1815 		    sc->hn_vf_ifp->if_xname);
1816 	}
1817 
1818 	/*
1819 	 * Bring the VF up.
1820 	 */
1821 	hn_xpnt_vf_saveifflags(sc);
1822 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1823 	error = hn_xpnt_vf_iocsetflags(sc);
1824 	if (error) {
1825 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1826 		    sc->hn_vf_ifp->if_xname, error);
1827 		return;
1828 	}
1829 
1830 	/*
1831 	 * NOTE:
1832 	 * Datapath setting must happen _after_ bringing the VF up.
1833 	 */
1834 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1835 
1836 	/*
1837 	 * NOTE:
1838 	 * Fixup RSS related bits _after_ the VF is brought up, since
1839 	 * many VFs generate their RSS key during initialization.
1840 	 */
1841 	hn_vf_rss_fixup(sc, true);
1842 
1843 	/* Mark transparent mode VF as enabled. */
1844 	hn_xpnt_vf_setenable(sc);
1845 }
1846 
1847 static void
1848 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1849 {
1850 	struct hn_softc *sc = xsc;
1851 
1852 	HN_LOCK(sc);
1853 
1854 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1855 		goto done;
1856 	if (sc->hn_vf_ifp == NULL)
1857 		goto done;
1858 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1859 		goto done;
1860 
1861 	if (sc->hn_vf_rdytick != 0) {
1862 		/* Mark VF as ready. */
1863 		hn_xpnt_vf_setready(sc);
1864 	}
1865 
1866 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1867 		/*
1868 		 * Delayed VF initialization.
1869 		 */
1870 		if (bootverbose) {
1871 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1872 			    sc->hn_vf_ifp->if_xname);
1873 		}
1874 		hn_xpnt_vf_init(sc);
1875 	}
1876 done:
1877 	HN_UNLOCK(sc);
1878 }
1879 
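/*
 * ether_ifattach event handler.  If the newly attached ifnet is this
 * device's VF, record the VF ifindex -> hn(4) ifnet mapping.  In
 * transparent VF mode, additionally hook the VF's if_input, suspend
 * hn(4) link status management and schedule the delayed VF
 * initialization.
 */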
1880 static void
1881 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1882 {
1883 	struct hn_softc *sc = xsc;
1884 
1885 	HN_LOCK(sc);
1886 
1887 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1888 		goto done;
1889 
1890 	if (!hn_ismyvf(sc, ifp))
1891 		goto done;
1892 
1893 	if (sc->hn_vf_ifp != NULL) {
1894 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1895 		    sc->hn_vf_ifp->if_xname);
1896 		goto done;
1897 	}
1898 
1899 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1900 		/*
1901 		 * ifnet.if_start is _not_ supported by transparent
1902 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1903 		 */
1904 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1905 		    "in transparent VF mode.\n", ifp->if_xname);
1906 		goto done;
1907 	}
1908 
1909 	rm_wlock(&hn_vfmap_lock);
1910 
1911 	if (ifp->if_index >= hn_vfmap_size) {
1912 		struct ifnet **newmap;
1913 		int newsize;
1914 
1915 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1916 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1917 		    M_WAITOK | M_ZERO);
1918 
1919 		memcpy(newmap, hn_vfmap,
1920 		    sizeof(struct ifnet *) * hn_vfmap_size);
1921 		free(hn_vfmap, M_DEVBUF);
1922 		hn_vfmap = newmap;
1923 		hn_vfmap_size = newsize;
1924 	}
1925 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1926 	    ("%s: ifindex %d was mapped to %s",
1927 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1928 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1929 
1930 	rm_wunlock(&hn_vfmap_lock);
1931 
1932 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1933 	rm_wlock(&sc->hn_vf_lock);
1934 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1935 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1936 	sc->hn_vf_ifp = ifp;
1937 	rm_wunlock(&sc->hn_vf_lock);
1938 
1939 	if (hn_xpnt_vf) {
1940 		int wait_ticks;
1941 
1942 		/*
1943 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1944 		 * Save vf_ifp's current if_input for later restoration.
1945 		 */
1946 		sc->hn_vf_input = ifp->if_input;
1947 		ifp->if_input = hn_xpnt_vf_input;
1948 
1949 		/*
1950 		 * Stop link status management; use the VF's.
1951 		 */
1952 		hn_suspend_mgmt(sc);
1953 
1954 		/*
1955 		 * Give the VF some time to complete its attach routine.
1956 		 */
1957 		wait_ticks = hn_xpnt_vf_attwait * hz;
1958 		sc->hn_vf_rdytick = ticks + wait_ticks;
1959 
1960 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1961 		    wait_ticks);
1962 	}
1963 done:
1964 	HN_UNLOCK(sc);
1965 }
1966 
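/*
 * ifnet departure event handler; undoes hn_ifnet_attevent() when the
 * VF detaches.  In transparent VF mode this drains the delayed VF
 * initialization, restores the VF's if_input, switches the datapath
 * back to the synthetic device, restores the saved capabilities/TSO
 * limits and RSS settings, and resumes link status management.
 * Finally the VF is marked disabled and removed from the VF map.
 */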
1967 static void
1968 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1969 {
1970 	struct hn_softc *sc = xsc;
1971 
1972 	HN_LOCK(sc);
1973 
1974 	if (sc->hn_vf_ifp == NULL)
1975 		goto done;
1976 
1977 	if (!hn_ismyvf(sc, ifp))
1978 		goto done;
1979 
1980 	if (hn_xpnt_vf) {
1981 		/*
1982 		 * Make sure that the delayed initialization is not running.
1983 		 *
1984 		 * NOTE:
1985 		 * - This lock _must_ be released, since the hn_vf_init task
1986 		 *   will try holding this lock.
1987 		 * - It is safe to release this lock here, since the
1988 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1989 		 *
1990 		 * XXX racy, if hn(4) ever detached.
1991 		 */
1992 		HN_UNLOCK(sc);
1993 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1994 		HN_LOCK(sc);
1995 
1996 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1997 		    sc->hn_ifp->if_xname));
1998 		ifp->if_input = sc->hn_vf_input;
1999 		sc->hn_vf_input = NULL;
2000 
2001 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2002 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2003 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2004 
2005 		if (sc->hn_vf_rdytick == 0) {
2006 			/*
2007 			 * The VF was ready; restore some settings.
2008 			 */
2009 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2010 			/*
2011 			 * NOTE:
2012 			 * There is _no_ need to fixup if_capenable and
2013 			 * if_hwassist, since the if_capabilities before
2014 			 * restoration was an intersection of the VF's
2015 			 * if_capabilities and the synthetic device's
2016 			 * if_capabilities.
2017 			 */
2018 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2019 			sc->hn_ifp->if_hw_tsomaxsegcount =
2020 			    sc->hn_saved_tsosegcnt;
2021 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2022 		}
2023 
2024 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2025 			/*
2026 			 * Restore RSS settings.
2027 			 */
2028 			hn_vf_rss_restore(sc);
2029 
2030 			/*
2031 			 * Resume link status management, which was suspended
2032 			 * by hn_ifnet_attevent().
2033 			 */
2034 			hn_resume_mgmt(sc);
2035 		}
2036 	}
2037 
2038 	/* Mark transparent mode VF as disabled. */
2039 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2040 
2041 	rm_wlock(&hn_vfmap_lock);
2042 
2043 	KASSERT(ifp->if_index < hn_vfmap_size,
2044 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2045 	if (hn_vfmap[ifp->if_index] != NULL) {
2046 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2047 		    ("%s: ifindex %d was mapped to %s",
2048 		     ifp->if_xname, ifp->if_index,
2049 		     hn_vfmap[ifp->if_index]->if_xname));
2050 		hn_vfmap[ifp->if_index] = NULL;
2051 	}
2052 
2053 	rm_wunlock(&hn_vfmap_lock);
2054 done:
2055 	HN_UNLOCK(sc);
2056 }
2057 
2058 static void
2059 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2060 {
2061 	struct hn_softc *sc = xsc;
2062 
2063 	if (sc->hn_vf_ifp == ifp)
2064 		if_link_state_change(sc->hn_ifp, link_state);
2065 }
2066 
2067 static int
2068 hn_probe(device_t dev)
2069 {
2070 
2071 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2072 		device_set_desc(dev, "Hyper-V Network Interface");
2073 		return BUS_PROBE_DEFAULT;
2074 	}
2075 	return ENXIO;
2076 }
2077 
2078 static int
2079 hn_attach(device_t dev)
2080 {
2081 	struct hn_softc *sc = device_get_softc(dev);
2082 	struct sysctl_oid_list *child;
2083 	struct sysctl_ctx_list *ctx;
2084 	uint8_t eaddr[ETHER_ADDR_LEN];
2085 	struct ifnet *ifp = NULL;
2086 	int error, ring_cnt, tx_ring_cnt;
2087 	uint32_t mtu;
2088 
2089 	sc->hn_dev = dev;
2090 	sc->hn_prichan = vmbus_get_channel(dev);
2091 	HN_LOCK_INIT(sc);
2092 	rm_init(&sc->hn_vf_lock, "hnvf");
2093 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2094 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2095 
2096 	/*
2097 	 * Initialize these tunables once.
2098 	 */
2099 	sc->hn_agg_size = hn_tx_agg_size;
2100 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2101 
2102 	/*
2103 	 * Setup taskqueue for transmission.
2104 	 */
2105 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2106 		int i;
2107 
2108 		sc->hn_tx_taskqs =
2109 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2110 		    M_DEVBUF, M_WAITOK);
2111 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2112 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2113 			    M_WAITOK, taskqueue_thread_enqueue,
2114 			    &sc->hn_tx_taskqs[i]);
2115 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2116 			    "%s tx%d", device_get_nameunit(dev), i);
2117 		}
2118 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2119 		sc->hn_tx_taskqs = hn_tx_taskque;
2120 	}
2121 
2122 	/*
2123 	 * Setup taskqueue for management tasks, e.g. link status.
2124 	 */
2125 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2126 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2127 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2128 	    device_get_nameunit(dev));
2129 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2130 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2131 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2132 	    hn_netchg_status_taskfunc, sc);
2133 
2134 	if (hn_xpnt_vf) {
2135 		/*
2136 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2137 		 */
2138 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2139 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2140 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2141 		    device_get_nameunit(dev));
2142 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2143 		    hn_xpnt_vf_init_taskfunc, sc);
2144 	}
2145 
2146 	/*
2147 	 * Allocate ifnet and setup its name earlier, so that if_printf
2148 	 * can be used by functions which will be called after
2149 	 * ether_ifattach().
2150 	 */
2151 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2152 	ifp->if_softc = sc;
2153 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2154 
2155 	/*
2156 	 * Initialize ifmedia earlier so that it can be unconditionally
2157 	 * destroyed if an error happens later on.
2158 	 */
2159 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2160 
2161 	/*
2162 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2163 	 * to use (tx_ring_cnt).
2164 	 *
2165 	 * NOTE:
2166 	 * The # of RX rings to use is the same as the # of channels to use.
2167 	 */
2168 	ring_cnt = hn_chan_cnt;
2169 	if (ring_cnt <= 0) {
2170 		/* Default */
2171 		ring_cnt = mp_ncpus;
2172 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2173 			ring_cnt = HN_RING_CNT_DEF_MAX;
2174 	} else if (ring_cnt > mp_ncpus) {
2175 		ring_cnt = mp_ncpus;
2176 	}
2177 #ifdef RSS
2178 	if (ring_cnt > rss_getnumbuckets())
2179 		ring_cnt = rss_getnumbuckets();
2180 #endif
2181 
2182 	tx_ring_cnt = hn_tx_ring_cnt;
2183 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2184 		tx_ring_cnt = ring_cnt;
2185 #ifdef HN_IFSTART_SUPPORT
2186 	if (hn_use_if_start) {
2187 		/* ifnet.if_start only needs one TX ring. */
2188 		tx_ring_cnt = 1;
2189 	}
2190 #endif
2191 
2192 	/*
2193 	 * Set the leader CPU for channels.
2194 	 */
2195 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2196 
2197 	/*
2198 	 * Create enough TX/RX rings, even if only a limited number of
2199 	 * channels can be allocated.
2200 	 */
2201 	error = hn_create_tx_data(sc, tx_ring_cnt);
2202 	if (error)
2203 		goto failed;
2204 	error = hn_create_rx_data(sc, ring_cnt);
2205 	if (error)
2206 		goto failed;
2207 
2208 	/*
2209 	 * Create transaction context for NVS and RNDIS transactions.
2210 	 */
2211 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2212 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2213 	if (sc->hn_xact == NULL) {
2214 		error = ENXIO;
2215 		goto failed;
2216 	}
2217 
2218 	/*
2219 	 * Install orphan handler for the revocation of this device's
2220 	 * primary channel.
2221 	 *
2222 	 * NOTE:
2223 	 * The processing order is critical here:
2224 	 * Install the orphan handler, _before_ testing whether this
2225 	 * device's primary channel has been revoked or not.
2226 	 */
2227 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2228 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2229 		error = ENXIO;
2230 		goto failed;
2231 	}
2232 
2233 	/*
2234 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2235 	 */
2236 	error = hn_synth_attach(sc, ETHERMTU);
2237 	if (error)
2238 		goto failed;
2239 
2240 	error = hn_rndis_get_eaddr(sc, eaddr);
2241 	if (error)
2242 		goto failed;
2243 
2244 	error = hn_rndis_get_mtu(sc, &mtu);
2245 	if (error)
2246 		mtu = ETHERMTU;
2247 	else if (bootverbose)
2248 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2249 
2250 #if __FreeBSD_version >= 1100099
2251 	if (sc->hn_rx_ring_inuse > 1) {
2252 		/*
2253 		 * Reduce TCP segment aggregation limit for multiple
2254 		 * RX rings to increase ACK timeliness.
2255 		 */
2256 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2257 	}
2258 #endif
2259 
2260 	/*
2261 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2262 	 */
2263 	hn_fixup_tx_data(sc);
2264 	hn_fixup_rx_data(sc);
2265 
2266 	ctx = device_get_sysctl_ctx(dev);
2267 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2268 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2269 	    &sc->hn_nvs_ver, 0, "NVS version");
2270 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2271 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2272 	    hn_ndis_version_sysctl, "A", "NDIS version");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2274 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2275 	    hn_caps_sysctl, "A", "capabilities");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2277 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2278 	    hn_hwassist_sysctl, "A", "hwassist");
2279 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2280 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2281 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2282 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2283 	    "max # of TSO segments");
2284 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2285 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2286 	    "max size of TSO segment");
2287 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2288 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2289 	    hn_rxfilter_sysctl, "A", "rxfilter");
2290 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2291 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2292 	    hn_rss_hash_sysctl, "A", "RSS hash");
2293 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2294 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2295 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2299 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2300 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2301 #ifndef RSS
2302 	/*
2303 	 * Don't allow RSS key/indirect table changes if RSS is defined.
2304 	 */
2305 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2306 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2307 	    hn_rss_key_sysctl, "IU", "RSS key");
2308 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2309 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2310 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2311 #endif
2312 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2313 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2314 	    "RNDIS offered packet transmission aggregation size limit");
2315 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2316 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2317 	    "RNDIS offered packet transmission aggregation count limit");
2318 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2319 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2320 	    "RNDIS packet transmission aggregation alignment");
2321 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2322 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2323 	    hn_txagg_size_sysctl, "I",
2324 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2326 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_txagg_pkts_sysctl, "I",
2328 	    "Packet transmission aggregation packets, "
2329 	    "0 -- disable, -1 -- auto");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2331 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_polling_sysctl, "I",
2333 	    "Polling frequency: [100,1000000], 0 disable polling");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2335 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_vf_sysctl, "A", "Virtual Function's name");
2337 	if (!hn_xpnt_vf) {
2338 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2339 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2340 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2341 	} else {
2342 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2343 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2344 		    hn_xpnt_vf_enabled_sysctl, "I",
2345 		    "Transparent VF enabled");
2346 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2347 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2348 		    hn_xpnt_vf_accbpf_sysctl, "I",
2349 		    "Accurate BPF for transparent VF");
2350 	}
2351 
2352 	/*
2353 	 * Setup the ifmedia, which has been initialized earlier.
2354 	 */
2355 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2356 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2357 	/* XXX ifmedia_set really should do this for us */
2358 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2359 
2360 	/*
2361 	 * Setup the ifnet for this interface.
2362 	 */
2363 
2364 	ifp->if_baudrate = IF_Gbps(10);
2365 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST |
2366 	    IFF_NEEDSEPOCH;
2367 	ifp->if_ioctl = hn_ioctl;
2368 	ifp->if_init = hn_init;
2369 #ifdef HN_IFSTART_SUPPORT
2370 	if (hn_use_if_start) {
2371 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2372 
2373 		ifp->if_start = hn_start;
2374 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2375 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2376 		IFQ_SET_READY(&ifp->if_snd);
2377 	} else
2378 #endif
2379 	{
2380 		ifp->if_transmit = hn_transmit;
2381 		ifp->if_qflush = hn_xmit_qflush;
2382 	}
2383 
2384 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2385 #ifdef foo
2386 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2387 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2388 #endif
2389 	if (sc->hn_caps & HN_CAP_VLAN) {
2390 		/* XXX not sure about VLAN_MTU. */
2391 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2392 	}
2393 
2394 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2395 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2396 		ifp->if_capabilities |= IFCAP_TXCSUM;
2397 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2398 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2399 	if (sc->hn_caps & HN_CAP_TSO4) {
2400 		ifp->if_capabilities |= IFCAP_TSO4;
2401 		ifp->if_hwassist |= CSUM_IP_TSO;
2402 	}
2403 	if (sc->hn_caps & HN_CAP_TSO6) {
2404 		ifp->if_capabilities |= IFCAP_TSO6;
2405 		ifp->if_hwassist |= CSUM_IP6_TSO;
2406 	}
2407 
2408 	/* Enable all available capabilities by default. */
2409 	ifp->if_capenable = ifp->if_capabilities;
2410 
2411 	/*
2412 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2413 	 * be enabled through SIOCSIFCAP.
2414 	 */
2415 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2416 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2417 
2418 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2419 		/*
2420 		 * Lock hn_set_tso_maxsize() to simplify its
2421 		 * internal logic.
2422 		 */
2423 		HN_LOCK(sc);
2424 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2425 		HN_UNLOCK(sc);
2426 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2427 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2428 	}
2429 
2430 	ether_ifattach(ifp, eaddr);
2431 
2432 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2433 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2434 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2435 	}
2436 	if (mtu < ETHERMTU) {
2437 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2438 		ifp->if_mtu = mtu;
2439 	}
2440 
2441 	/* Inform the upper layer about the long frame support. */
2442 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2443 
2444 	/*
2445 	 * Kick off link status check.
2446 	 */
2447 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2448 	hn_update_link_status(sc);
2449 
2450 	if (!hn_xpnt_vf) {
2451 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2452 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2453 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2454 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2455 	} else {
2456 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2457 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2458 	}
2459 
2460 	/*
2461 	 * NOTE:
2462 	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2463 	 * event, since the interface's LLADDR is needed; the LLADDR is not
2464 	 * yet available when the ifnet_arrival event is triggered.
2465 	 */
2466 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2467 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2468 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2469 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2470 
2471 	return (0);
2472 failed:
2473 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2474 		hn_synth_detach(sc);
2475 	hn_detach(dev);
2476 	return (error);
2477 }
2478 
2479 static int
2480 hn_detach(device_t dev)
2481 {
2482 	struct hn_softc *sc = device_get_softc(dev);
2483 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2484 
2485 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2486 		/*
2487 		 * In case the vmbus missed the orphan handler
2488 		 * installation.
2489 		 */
2490 		vmbus_xact_ctx_orphan(sc->hn_xact);
2491 	}
2492 
2493 	if (sc->hn_ifaddr_evthand != NULL)
2494 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2495 	if (sc->hn_ifnet_evthand != NULL)
2496 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2497 	if (sc->hn_ifnet_atthand != NULL) {
2498 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2499 		    sc->hn_ifnet_atthand);
2500 	}
2501 	if (sc->hn_ifnet_dethand != NULL) {
2502 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2503 		    sc->hn_ifnet_dethand);
2504 	}
2505 	if (sc->hn_ifnet_lnkhand != NULL)
2506 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2507 
2508 	vf_ifp = sc->hn_vf_ifp;
2509 	__compiler_membar();
2510 	if (vf_ifp != NULL)
2511 		hn_ifnet_detevent(sc, vf_ifp);
2512 
2513 	if (device_is_attached(dev)) {
2514 		HN_LOCK(sc);
2515 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2516 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2517 				hn_stop(sc, true);
2518 			/*
2519 			 * NOTE:
2520 			 * hn_stop() only suspends data, so management
2521 			 * stuff has to be suspended manually here.
2522 			 */
2523 			hn_suspend_mgmt(sc);
2524 			hn_synth_detach(sc);
2525 		}
2526 		HN_UNLOCK(sc);
2527 		ether_ifdetach(ifp);
2528 	}
2529 
2530 	ifmedia_removeall(&sc->hn_media);
2531 	hn_destroy_rx_data(sc);
2532 	hn_destroy_tx_data(sc);
2533 
2534 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2535 		int i;
2536 
2537 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2538 			taskqueue_free(sc->hn_tx_taskqs[i]);
2539 		free(sc->hn_tx_taskqs, M_DEVBUF);
2540 	}
2541 	taskqueue_free(sc->hn_mgmt_taskq0);
2542 	if (sc->hn_vf_taskq != NULL)
2543 		taskqueue_free(sc->hn_vf_taskq);
2544 
2545 	if (sc->hn_xact != NULL) {
2546 		/*
2547 		 * Uninstall the orphan handler _before_ the xact is
2548 		 * destructed.
2549 		 */
2550 		vmbus_chan_unset_orphan(sc->hn_prichan);
2551 		vmbus_xact_ctx_destroy(sc->hn_xact);
2552 	}
2553 
2554 	if_free(ifp);
2555 
2556 	HN_LOCK_DESTROY(sc);
2557 	rm_destroy(&sc->hn_vf_lock);
2558 	return (0);
2559 }
2560 
2561 static int
2562 hn_shutdown(device_t dev)
2563 {
2564 
2565 	return (0);
2566 }
2567 
2568 static void
2569 hn_link_status(struct hn_softc *sc)
2570 {
2571 	uint32_t link_status;
2572 	int error;
2573 
2574 	error = hn_rndis_get_linkstatus(sc, &link_status);
2575 	if (error) {
2576 		/* XXX what to do? */
2577 		return;
2578 	}
2579 
2580 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2581 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2582 	else
2583 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2584 	if_link_state_change(sc->hn_ifp,
2585 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2586 	    LINK_STATE_UP : LINK_STATE_DOWN);
2587 }
2588 
2589 static void
2590 hn_link_taskfunc(void *xsc, int pending __unused)
2591 {
2592 	struct hn_softc *sc = xsc;
2593 
2594 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2595 		return;
2596 	hn_link_status(sc);
2597 }
2598 
2599 static void
2600 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2601 {
2602 	struct hn_softc *sc = xsc;
2603 
2604 	/* Prevent any link status checks from running. */
2605 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2606 
2607 	/*
2608 	 * Fake up a [link down --> link up] state change; a 5 second
2609 	 * delay is used, which closely simulates the miibus reaction
2610 	 * upon a link down event.
2611 	 */
2612 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2613 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2614 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2615 	    &sc->hn_netchg_status, 5 * hz);
2616 }
2617 
2618 static void
2619 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2620 {
2621 	struct hn_softc *sc = xsc;
2622 
2623 	/* Re-allow link status checks. */
2624 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2625 	hn_link_status(sc);
2626 }
2627 
2628 static void
2629 hn_update_link_status(struct hn_softc *sc)
2630 {
2631 
2632 	if (sc->hn_mgmt_taskq != NULL)
2633 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2634 }
2635 
2636 static void
2637 hn_change_network(struct hn_softc *sc)
2638 {
2639 
2640 	if (sc->hn_mgmt_taskq != NULL)
2641 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2642 }
2643 
2644 static __inline int
2645 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2646     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2647 {
2648 	struct mbuf *m = *m_head;
2649 	int error;
2650 
2651 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2652 
2653 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2654 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2655 	if (error == EFBIG) {
2656 		struct mbuf *m_new;
2657 
2658 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2659 		if (m_new == NULL)
2660 			return ENOBUFS;
2661 		else
2662 			*m_head = m = m_new;
2663 		txr->hn_tx_collapsed++;
2664 
2665 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2666 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2667 	}
2668 	if (!error) {
2669 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2670 		    BUS_DMASYNC_PREWRITE);
2671 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2672 	}
2673 	return error;
2674 }
2675 
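/*
 * Drop a reference on the txdesc; the last reference frees it:
 * aggregated child txdescs are released first, then the chimney
 * sending buffer is freed or the DMA map is unloaded, the mbuf is
 * freed, and the txdesc is returned to the free list.  Returns 1 if
 * the txdesc was actually freed, 0 otherwise.
 */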
2676 static __inline int
2677 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2678 {
2679 
2680 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2681 	    ("put an onlist txd %#x", txd->flags));
2682 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2683 	    ("put an onagg txd %#x", txd->flags));
2684 
2685 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2686 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2687 		return 0;
2688 
2689 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2690 		struct hn_txdesc *tmp_txd;
2691 
2692 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2693 			int freed;
2694 
2695 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2696 			    ("recursive aggregation on aggregated txdesc"));
2697 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2698 			    ("not aggregated txdesc"));
2699 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2700 			    ("aggregated txdesc uses dmamap"));
2701 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2702 			    ("aggregated txdesc consumes "
2703 			     "chimney sending buffer"));
2704 			KASSERT(tmp_txd->chim_size == 0,
2705 			    ("aggregated txdesc has non-zero "
2706 			     "chimney sending size"));
2707 
2708 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2709 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2710 			freed = hn_txdesc_put(txr, tmp_txd);
2711 			KASSERT(freed, ("failed to free aggregated txdesc"));
2712 		}
2713 	}
2714 
2715 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2716 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2717 		    ("chim txd uses dmamap"));
2718 		hn_chim_free(txr->hn_sc, txd->chim_index);
2719 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2720 		txd->chim_size = 0;
2721 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2722 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2723 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2724 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2725 		    txd->data_dmap);
2726 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2727 	}
2728 
2729 	if (txd->m != NULL) {
2730 		m_freem(txd->m);
2731 		txd->m = NULL;
2732 	}
2733 
2734 	txd->flags |= HN_TXD_FLAG_ONLIST;
2735 #ifndef HN_USE_TXDESC_BUFRING
2736 	mtx_lock_spin(&txr->hn_txlist_spin);
2737 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2738 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2739 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2740 	txr->hn_txdesc_avail++;
2741 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2742 	mtx_unlock_spin(&txr->hn_txlist_spin);
2743 #else	/* HN_USE_TXDESC_BUFRING */
2744 #ifdef HN_DEBUG
2745 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2746 #endif
2747 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2748 #endif	/* !HN_USE_TXDESC_BUFRING */
2749 
2750 	return 1;
2751 }
2752 
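/*
 * Grab a txdesc from the TX ring's free list (or buf_ring) and
 * initialize its reference count to 1; returns NULL if no txdesc is
 * available.
 */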
2753 static __inline struct hn_txdesc *
2754 hn_txdesc_get(struct hn_tx_ring *txr)
2755 {
2756 	struct hn_txdesc *txd;
2757 
2758 #ifndef HN_USE_TXDESC_BUFRING
2759 	mtx_lock_spin(&txr->hn_txlist_spin);
2760 	txd = SLIST_FIRST(&txr->hn_txlist);
2761 	if (txd != NULL) {
2762 		KASSERT(txr->hn_txdesc_avail > 0,
2763 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2764 		txr->hn_txdesc_avail--;
2765 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2766 	}
2767 	mtx_unlock_spin(&txr->hn_txlist_spin);
2768 #else
2769 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2770 #endif
2771 
2772 	if (txd != NULL) {
2773 #ifdef HN_USE_TXDESC_BUFRING
2774 #ifdef HN_DEBUG
2775 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2776 #endif
2777 #endif	/* HN_USE_TXDESC_BUFRING */
2778 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2779 		    STAILQ_EMPTY(&txd->agg_list) &&
2780 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2781 		    txd->chim_size == 0 &&
2782 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2783 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2784 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2785 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2786 		txd->refs = 1;
2787 	}
2788 	return txd;
2789 }
2790 
2791 static __inline void
2792 hn_txdesc_hold(struct hn_txdesc *txd)
2793 {
2794 
2795 	/* 0->1 transition will never work */
2796 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2797 	atomic_add_int(&txd->refs, 1);
2798 }
2799 
2800 static __inline void
2801 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2802 {
2803 
2804 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2805 	    ("recursive aggregation on aggregating txdesc"));
2806 
2807 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2808 	    ("already aggregated"));
2809 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2810 	    ("recursive aggregation on to-be-aggregated txdesc"));
2811 
2812 	txd->flags |= HN_TXD_FLAG_ONAGG;
2813 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2814 }
2815 
2816 static bool
2817 hn_tx_ring_pending(struct hn_tx_ring *txr)
2818 {
2819 	bool pending = false;
2820 
2821 #ifndef HN_USE_TXDESC_BUFRING
2822 	mtx_lock_spin(&txr->hn_txlist_spin);
2823 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2824 		pending = true;
2825 	mtx_unlock_spin(&txr->hn_txlist_spin);
2826 #else
2827 	if (!buf_ring_full(txr->hn_txdesc_br))
2828 		pending = true;
2829 #endif
2830 	return (pending);
2831 }
2832 
2833 static __inline void
2834 hn_txeof(struct hn_tx_ring *txr)
2835 {
2836 	txr->hn_has_txeof = 0;
2837 	txr->hn_txeof(txr);
2838 }
2839 
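/*
 * NVS send completion callback: release the txdesc and, once enough
 * completions have accumulated on an oactive TX ring, run the
 * transmit-done processing to unclog the ring.
 */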
2840 static void
2841 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2842     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2843 {
2844 	struct hn_txdesc *txd = sndc->hn_cbarg;
2845 	struct hn_tx_ring *txr;
2846 
2847 	txr = txd->txr;
2848 	KASSERT(txr->hn_chan == chan,
2849 	    ("channel mismatch, on chan%u, should be chan%u",
2850 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2851 
2852 	txr->hn_has_txeof = 1;
2853 	hn_txdesc_put(txr, txd);
2854 
2855 	++txr->hn_txdone_cnt;
2856 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2857 		txr->hn_txdone_cnt = 0;
2858 		if (txr->hn_oactive)
2859 			hn_txeof(txr);
2860 	}
2861 }
2862 
2863 static void
2864 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2865 {
2866 #if defined(INET) || defined(INET6)
2867 	tcp_lro_flush_all(&rxr->hn_lro);
2868 #endif
2869 
2870 	/*
2871 	 * NOTE:
2872 	 * 'txr' could be NULL, if multiple channels and the
2873 	 * ifnet.if_start method are enabled.
2874 	 */
2875 	if (txr == NULL || !txr->hn_has_txeof)
2876 		return;
2877 
2878 	txr->hn_txdone_cnt = 0;
2879 	hn_txeof(txr);
2880 }
2881 
2882 static __inline uint32_t
2883 hn_rndis_pktmsg_offset(uint32_t ofs)
2884 {
2885 
2886 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2887 	    ("invalid RNDIS packet msg offset %u", ofs));
2888 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2889 }
2890 
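/*
 * Append a per-packet-info record to the RNDIS packet message and
 * return a pointer to its data area.  Only rm_pktinfolen grows;
 * previously appended per-packet-info records stay in place.
 */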
2891 static __inline void *
2892 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2893     size_t pi_dlen, uint32_t pi_type)
2894 {
2895 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2896 	struct rndis_pktinfo *pi;
2897 
2898 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2899 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2900 
2901 	/*
2902 	 * Per-packet-info does not move; it only grows.
2903 	 *
2904 	 * NOTE:
2905 	 * rm_pktinfooffset in this phase counts from the beginning
2906 	 * of rndis_packet_msg.
2907 	 */
2908 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2909 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2910 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2911 	    pkt->rm_pktinfolen);
2912 	pkt->rm_pktinfolen += pi_size;
2913 
2914 	pi->rm_size = pi_size;
2915 	pi->rm_type = pi_type;
2916 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2917 
2918 	return (pi->rm_data);
2919 }
2920 
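/*
 * Transmit the currently aggregating txdesc and reset all aggregation
 * state on the TX ring.  On failure the saved mbuf is freed here and
 * oerrors is bumped by the number of aggregated packets.
 */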
2921 static __inline int
2922 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2923 {
2924 	struct hn_txdesc *txd;
2925 	struct mbuf *m;
2926 	int error, pkts;
2927 
2928 	txd = txr->hn_agg_txd;
2929 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2930 
2931 	/*
2932 	 * Since hn_txpkt() will reset this temporary stat, save
2933 	 * it now so that oerrors can be updated properly if
2934 	 * hn_txpkt() ever fails.
2935 	 */
2936 	pkts = txr->hn_stat_pkts;
2937 
2938 	/*
2939 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2940 	 * failure, save it now so that it can be freed if hn_txpkt() ever
2941 	 * fails.
2942 	 */
2943 	m = txd->m;
2944 	error = hn_txpkt(ifp, txr, txd);
2945 	if (__predict_false(error)) {
2946 		/* txd is freed, but m is not. */
2947 		m_freem(m);
2948 
2949 		txr->hn_flush_failed++;
2950 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2951 	}
2952 
2953 	/* Reset all aggregation states. */
2954 	txr->hn_agg_txd = NULL;
2955 	txr->hn_agg_szleft = 0;
2956 	txr->hn_agg_pktleft = 0;
2957 	txr->hn_agg_prevpkt = NULL;
2958 
2959 	return (error);
2960 }
2961 
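/*
 * Try to reserve chimney sending buffer space for this packet: either
 * append it to the currently aggregating txdesc, or flush that txdesc
 * and allocate a fresh chimney buffer, possibly starting a new
 * aggregation.  Returns NULL if no chimney buffer is available, in
 * which case the caller falls back to sglist transmission.
 */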
2962 static void *
2963 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2964     int pktsize)
2965 {
2966 	void *chim;
2967 
2968 	if (txr->hn_agg_txd != NULL) {
2969 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2970 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2971 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2972 			int olen;
2973 
2974 			/*
2975 			 * Update the previous RNDIS packet's total length;
2976 			 * it can be increased due to the mandatory alignment
2977 			 * padding for this RNDIS packet.  And update the
2978 			 * aggregating txdesc's chimney sending buffer size
2979 			 * accordingly.
2980 			 *
2981 			 * XXX
2982 			 * Zero-out the padding, as required by the RNDIS spec.
2983 			 */
2984 			olen = pkt->rm_len;
2985 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2986 			agg_txd->chim_size += pkt->rm_len - olen;
2987 
2988 			/* Link this txdesc to the parent. */
2989 			hn_txdesc_agg(agg_txd, txd);
2990 
2991 			chim = (uint8_t *)pkt + pkt->rm_len;
2992 			/* Save the current packet for later fixup. */
2993 			txr->hn_agg_prevpkt = chim;
2994 
2995 			txr->hn_agg_pktleft--;
2996 			txr->hn_agg_szleft -= pktsize;
2997 			if (txr->hn_agg_szleft <=
2998 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2999 				/*
3000 				 * Probably can't aggregate more packets;
3001 				 * flush this aggregating txdesc proactively.
3002 				 */
3003 				txr->hn_agg_pktleft = 0;
3004 			}
3005 			/* Done! */
3006 			return (chim);
3007 		}
3008 		hn_flush_txagg(ifp, txr);
3009 	}
3010 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3011 
3012 	txr->hn_tx_chimney_tried++;
3013 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3014 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3015 		return (NULL);
3016 	txr->hn_tx_chimney++;
3017 
3018 	chim = txr->hn_sc->hn_chim +
3019 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3020 
3021 	if (txr->hn_agg_pktmax > 1 &&
3022 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3023 		txr->hn_agg_txd = txd;
3024 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3025 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3026 		txr->hn_agg_prevpkt = chim;
3027 	}
3028 	return (chim);
3029 }
3030 
3031 /*
3032  * NOTE:
3033  * If this function fails, then both txd and m_head0 will be freed.
3034  */
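/*
 * Build the RNDIS packet message, including per-packet-info for the
 * hash value, VLAN tag, TSO and checksum offloads, then either copy
 * the packet into a chimney sending buffer or load it into the
 * scatter/gather page buffer list for transmission.
 */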
3035 static int
3036 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3037     struct mbuf **m_head0)
3038 {
3039 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3040 	int error, nsegs, i;
3041 	struct mbuf *m_head = *m_head0;
3042 	struct rndis_packet_msg *pkt;
3043 	uint32_t *pi_data;
3044 	void *chim = NULL;
3045 	int pkt_hlen, pkt_size;
3046 
3047 	pkt = txd->rndis_pkt;
3048 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3049 	if (pkt_size < txr->hn_chim_size) {
3050 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3051 		if (chim != NULL)
3052 			pkt = chim;
3053 	} else {
3054 		if (txr->hn_agg_txd != NULL)
3055 			hn_flush_txagg(ifp, txr);
3056 	}
3057 
3058 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3059 	pkt->rm_len = m_head->m_pkthdr.len;
3060 	pkt->rm_dataoffset = 0;
3061 	pkt->rm_datalen = m_head->m_pkthdr.len;
3062 	pkt->rm_oobdataoffset = 0;
3063 	pkt->rm_oobdatalen = 0;
3064 	pkt->rm_oobdataelements = 0;
3065 	pkt->rm_pktinfooffset = sizeof(*pkt);
3066 	pkt->rm_pktinfolen = 0;
3067 	pkt->rm_vchandle = 0;
3068 	pkt->rm_reserved = 0;
3069 
3070 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3071 		/*
3072 		 * Set the hash value for this packet, so that the host could
3073 		 * dispatch the TX done event for this packet back to this TX
3074 		 * ring's channel.
3075 		 */
3076 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3077 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3078 		*pi_data = txr->hn_tx_idx;
3079 	}
3080 
3081 	if (m_head->m_flags & M_VLANTAG) {
3082 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3083 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3084 		*pi_data = NDIS_VLAN_INFO_MAKE(
3085 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3086 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3087 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3088 	}
3089 
3090 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3091 #if defined(INET6) || defined(INET)
3092 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3093 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3094 #ifdef INET
3095 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3096 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3097 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3098 			    m_head->m_pkthdr.tso_segsz);
3099 		}
3100 #endif
3101 #if defined(INET6) && defined(INET)
3102 		else
3103 #endif
3104 #ifdef INET6
3105 		{
3106 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3107 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3108 			    m_head->m_pkthdr.tso_segsz);
3109 		}
3110 #endif
3111 #endif	/* INET6 || INET */
3112 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3113 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3114 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3115 		if (m_head->m_pkthdr.csum_flags &
3116 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3117 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3118 		} else {
3119 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3120 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3121 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3122 		}
3123 
3124 		if (m_head->m_pkthdr.csum_flags &
3125 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3126 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3127 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3128 		} else if (m_head->m_pkthdr.csum_flags &
3129 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3130 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3131 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3132 		}
3133 	}
3134 
3135 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3136 	/* Fixup RNDIS packet message total length */
3137 	pkt->rm_len += pkt_hlen;
3138 	/* Convert RNDIS packet message offsets */
3139 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3140 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3141 
3142 	/*
3143 	 * Fast path: Chimney sending.
3144 	 */
3145 	if (chim != NULL) {
3146 		struct hn_txdesc *tgt_txd = txd;
3147 
3148 		if (txr->hn_agg_txd != NULL) {
3149 			tgt_txd = txr->hn_agg_txd;
3150 #ifdef INVARIANTS
3151 			*m_head0 = NULL;
3152 #endif
3153 		}
3154 
3155 		KASSERT(pkt == chim,
3156 		    ("RNDIS pkt not in chimney sending buffer"));
3157 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3158 		    ("chimney sending buffer is not used"));
3159 		tgt_txd->chim_size += pkt->rm_len;
3160 
3161 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3162 		    ((uint8_t *)chim) + pkt_hlen);
3163 
3164 		txr->hn_gpa_cnt = 0;
3165 		txr->hn_sendpkt = hn_txpkt_chim;
3166 		goto done;
3167 	}
3168 
3169 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3170 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3171 	    ("chimney buffer is used"));
3172 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3173 
3174 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3175 	if (__predict_false(error)) {
3176 		int freed;
3177 
3178 		/*
3179 		 * This mbuf is not linked w/ the txd yet, so free it now.
3180 		 */
3181 		m_freem(m_head);
3182 		*m_head0 = NULL;
3183 
3184 		freed = hn_txdesc_put(txr, txd);
3185 		KASSERT(freed != 0,
3186 		    ("fail to free txd upon txdma error"));
3187 
3188 		txr->hn_txdma_failed++;
3189 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3190 		return error;
3191 	}
3192 	*m_head0 = m_head;
3193 
3194 	/* +1 RNDIS packet message */
3195 	txr->hn_gpa_cnt = nsegs + 1;
3196 
3197 	/* send packet with page buffer */
3198 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3199 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3200 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3201 
3202 	/*
3203 	 * Fill the page buffers with mbuf info after the page
3204 	 * buffer for RNDIS packet message.
3205 	 */
3206 	for (i = 0; i < nsegs; ++i) {
3207 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3208 
3209 		gpa->gpa_page = atop(segs[i].ds_addr);
3210 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3211 		gpa->gpa_len = segs[i].ds_len;
3212 	}
3213 
3214 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3215 	txd->chim_size = 0;
3216 	txr->hn_sendpkt = hn_txpkt_sglist;
3217 done:
3218 	txd->m = m_head;
3219 
3220 	/* Set the completion routine */
3221 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3222 
3223 	/* Update temporary stats for later use. */
3224 	txr->hn_stat_pkts++;
3225 	txr->hn_stat_size += m_head->m_pkthdr.len;
3226 	if (m_head->m_flags & M_MCAST)
3227 		txr->hn_stat_mcasts++;
3228 
3229 	return 0;
3230 }
3231 
3232 /*
3233  * NOTE:
3234  * If this function fails, then txd will be freed, but the mbuf
3235  * associated w/ the txd will _not_ be freed.
3236  */
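/*
 * Hand the txdesc to the channel.  While BPF peers are present, the
 * txdesc is held across the send so that the mbufs can still be
 * tapped afterwards.  A failed send is retried once after setting
 * hn_has_txeof, in case the last channel rollup was missed.
 */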
3237 static int
3238 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3239 {
3240 	int error, send_failed = 0, has_bpf;
3241 
3242 again:
3243 	has_bpf = bpf_peers_present(ifp->if_bpf);
3244 	if (has_bpf) {
3245 		/*
3246 		 * Make sure that this txd and any aggregated txds are not
3247 		 * freed before ETHER_BPF_MTAP.
3248 		 */
3249 		hn_txdesc_hold(txd);
3250 	}
3251 	error = txr->hn_sendpkt(txr, txd);
3252 	if (!error) {
3253 		if (has_bpf) {
3254 			const struct hn_txdesc *tmp_txd;
3255 
3256 			ETHER_BPF_MTAP(ifp, txd->m);
3257 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3258 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3259 		}
3260 
3261 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3262 #ifdef HN_IFSTART_SUPPORT
3263 		if (!hn_use_if_start)
3264 #endif
3265 		{
3266 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3267 			    txr->hn_stat_size);
3268 			if (txr->hn_stat_mcasts != 0) {
3269 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3270 				    txr->hn_stat_mcasts);
3271 			}
3272 		}
3273 		txr->hn_pkts += txr->hn_stat_pkts;
3274 		txr->hn_sends++;
3275 	}
3276 	if (has_bpf)
3277 		hn_txdesc_put(txr, txd);
3278 
3279 	if (__predict_false(error)) {
3280 		int freed;
3281 
3282 		/*
3283 		 * This should "really rarely" happen.
3284 		 *
3285 		 * XXX Too many RX to be acked or too many sideband
3286 		 * commands to run?  Ask netvsc_channel_rollup()
3287 		 * to kick start later.
3288 		 */
3289 		txr->hn_has_txeof = 1;
3290 		if (!send_failed) {
3291 			txr->hn_send_failed++;
3292 			send_failed = 1;
3293 			/*
3294 			 * Try sending again after setting hn_has_txeof,
3295 			 * in case we missed the last
3296 			 * netvsc_channel_rollup().
3297 			 */
3298 			goto again;
3299 		}
3300 		if_printf(ifp, "send failed\n");
3301 
3302 		/*
3303 		 * Caller will perform further processing on the
3304 		 * associated mbuf, so don't free it in hn_txdesc_put();
3305 		 * only unload it from the DMA map in hn_txdesc_put(),
3306 		 * if it was loaded.
3307 		 */
3308 		txd->m = NULL;
3309 		freed = hn_txdesc_put(txr, txd);
3310 		KASSERT(freed != 0,
3311 		    ("fail to free txd upon send error"));
3312 
3313 		txr->hn_send_failed++;
3314 	}
3315 
3316 	/* Reset temporary stats, after this sending is done. */
3317 	txr->hn_stat_size = 0;
3318 	txr->hn_stat_pkts = 0;
3319 	txr->hn_stat_mcasts = 0;
3320 
3321 	return (error);
3322 }
3323 
3324 /*
3325  * Append the specified data to the indicated mbuf chain.
3326  * Extend the mbuf chain if the new data does not fit in
3327  * existing space.
3328  *
3329  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3330  * There should be an equivalent in the kernel mbuf code,
3331  * but there does not appear to be one yet.
3332  *
3333  * Differs from m_append() in that additional mbufs are
3334  * allocated with cluster size MJUMPAGESIZE, and filled
3335  * accordingly.
3336  *
3337  * Return 1 if able to complete the job; otherwise 0.
3338  */
3339 static int
3340 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3341 {
3342 	struct mbuf *m, *n;
3343 	int remainder, space;
3344 
3345 	for (m = m0; m->m_next != NULL; m = m->m_next)
3346 		;
3347 	remainder = len;
3348 	space = M_TRAILINGSPACE(m);
3349 	if (space > 0) {
3350 		/*
3351 		 * Copy into available space.
3352 		 */
3353 		if (space > remainder)
3354 			space = remainder;
3355 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3356 		m->m_len += space;
3357 		cp += space;
3358 		remainder -= space;
3359 	}
3360 	while (remainder > 0) {
3361 		/*
3362 		 * Allocate a new mbuf; could check space
3363 		 * and allocate a cluster instead.
3364 		 */
3365 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3366 		if (n == NULL)
3367 			break;
3368 		n->m_len = min(MJUMPAGESIZE, remainder);
3369 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3370 		cp += n->m_len;
3371 		remainder -= n->m_len;
3372 		m->m_next = n;
3373 		m = n;
3374 	}
3375 	if (m0->m_flags & M_PKTHDR)
3376 		m0->m_pkthdr.len += len - remainder;
3377 
3378 	return (remainder == 0);
3379 }
3380 
3381 #if defined(INET) || defined(INET6)
3382 static __inline int
3383 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3384 {
3385 #if __FreeBSD_version >= 1100095
3386 	if (hn_lro_mbufq_depth) {
3387 		tcp_lro_queue_mbuf(lc, m);
3388 		return 0;
3389 	}
3390 #endif
3391 	return tcp_lro_rx(lc, m, 0);
3392 }
3393 #endif
3394 
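/*
 * Receive one RNDIS data packet: copy it into an mbuf (small packets
 * into an mbuf header, larger ones into a 2K or 4K cluster), apply
 * the host supplied checksum, VLAN and RSS hash information, and
 * decide whether the packet may be fed to LRO.  When a
 * non-transparent VF is active, the packet is accounted to the VF
 * ifnet instead of hn(4).
 */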
3395 static int
3396 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3397     const struct hn_rxinfo *info)
3398 {
3399 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3400 	struct mbuf *m_new;
3401 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3402 	int hash_type = M_HASHTYPE_NONE;
3403 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3404 
3405 	ifp = hn_ifp;
3406 	if (rxr->hn_rxvf_ifp != NULL) {
3407 		/*
3408 		 * Non-transparent mode VF; pretend this packet is from
3409 		 * the VF.
3410 		 */
3411 		ifp = rxr->hn_rxvf_ifp;
3412 		is_vf = 1;
3413 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3414 		/* Transparent mode VF. */
3415 		is_vf = 1;
3416 	}
3417 
3418 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3419 		/*
3420 		 * NOTE:
3421 		 * See the NOTE of hn_rndis_init_fixat().  This
3422 		 * function can be reached immediately after the
3423 		 * RNDIS is initialized but before the ifnet is
3424 		 * set up on the hn_attach() path; drop the unexpected
3425 		 * packets.
3426 		 */
3427 		return (0);
3428 	}
3429 
3430 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3431 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3432 		return (0);
3433 	}
3434 
3435 	if (dlen <= MHLEN) {
3436 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3437 		if (m_new == NULL) {
3438 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3439 			return (0);
3440 		}
3441 		memcpy(mtod(m_new, void *), data, dlen);
3442 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3443 		rxr->hn_small_pkts++;
3444 	} else {
3445 		/*
3446 		 * Get an mbuf with a cluster.  For packets 2K or less,
3447 		 * get a standard 2K cluster.  For anything larger, get a
3448 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3449 		 * if looped around to the Hyper-V TX channel, so avoid them.
3450 		 */
3451 		size = MCLBYTES;
3452 		if (dlen > MCLBYTES) {
3453 			/* 4096 */
3454 			size = MJUMPAGESIZE;
3455 		}
3456 
3457 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3458 		if (m_new == NULL) {
3459 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3460 			return (0);
3461 		}
3462 
3463 		hv_m_append(m_new, dlen, data);
3464 	}
3465 	m_new->m_pkthdr.rcvif = ifp;
3466 
3467 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3468 		do_csum = 0;
3469 
3470 	/* receive side checksum offload */
3471 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3472 		/* IP csum offload */
3473 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3474 			m_new->m_pkthdr.csum_flags |=
3475 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3476 			rxr->hn_csum_ip++;
3477 		}
3478 
3479 		/* TCP/UDP csum offload */
3480 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3481 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3482 			m_new->m_pkthdr.csum_flags |=
3483 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3484 			m_new->m_pkthdr.csum_data = 0xffff;
3485 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3486 				rxr->hn_csum_tcp++;
3487 			else
3488 				rxr->hn_csum_udp++;
3489 		}
3490 
3491 		/*
3492 		 * XXX
3493 		 * As of this writing (Oct 28th, 2016), the host side will turn
3494 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3495 		 * the do_lro setting here is actually _not_ accurate.  We
3496 		 * depend on the RSS hash type check to reset do_lro.
3497 		 */
3498 		if ((info->csum_info &
3499 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3500 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3501 			do_lro = 1;
3502 	} else {
3503 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3504 		if (l3proto == ETHERTYPE_IP) {
3505 			if (l4proto == IPPROTO_TCP) {
3506 				if (do_csum &&
3507 				    (rxr->hn_trust_hcsum &
3508 				     HN_TRUST_HCSUM_TCP)) {
3509 					rxr->hn_csum_trusted++;
3510 					m_new->m_pkthdr.csum_flags |=
3511 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3512 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3513 					m_new->m_pkthdr.csum_data = 0xffff;
3514 				}
3515 				do_lro = 1;
3516 			} else if (l4proto == IPPROTO_UDP) {
3517 				if (do_csum &&
3518 				    (rxr->hn_trust_hcsum &
3519 				     HN_TRUST_HCSUM_UDP)) {
3520 					rxr->hn_csum_trusted++;
3521 					m_new->m_pkthdr.csum_flags |=
3522 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3523 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3524 					m_new->m_pkthdr.csum_data = 0xffff;
3525 				}
3526 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3527 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3528 				rxr->hn_csum_trusted++;
3529 				m_new->m_pkthdr.csum_flags |=
3530 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3531 			}
3532 		}
3533 	}
3534 
3535 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3536 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3537 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3538 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3539 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3540 		m_new->m_flags |= M_VLANTAG;
3541 	}
3542 
3543 	/*
3544 	 * If VF is activated (transparent/non-transparent mode does not
3545 	 * matter here).
3546 	 *
3547 	 * - Disable LRO
3548 	 *
3549 	 *   hn(4) will only receive broadcast packets, multicast packets,
3550 	 *   and TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3551 	 *   packet types.
3552 	 *
3553 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3554 	 *   all, since the LRO flush will use hn(4) as the receiving
3555 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3556 	 */
3557 	if (is_vf)
3558 		do_lro = 0;
3559 
3560 	/*
3561 	 * If VF is activated (transparent/non-transparent mode does not
3562 	 * matter here), do _not_ mess with unsupported hash types or
3563 	 * functions.
3564 	 */
3565 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3566 		rxr->hn_rss_pkts++;
3567 		m_new->m_pkthdr.flowid = info->hash_value;
3568 		if (!is_vf)
3569 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3570 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3571 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
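			/*
			 * Mask the host-reported hash type with the hash
			 * types this RX ring allows to be set on the mbuf
			 * (hn_mbuf_hash).
			 */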
3572 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3573 			    rxr->hn_mbuf_hash);
3574 
3575 			/*
3576 			 * NOTE:
3577 			 * do_lro is reset, if the hash types are not TCP
3578 			 * related.  See the comment in the above csum_flags
3579 			 * setup section.
3580 			 */
3581 			switch (type) {
3582 			case NDIS_HASH_IPV4:
3583 				hash_type = M_HASHTYPE_RSS_IPV4;
3584 				do_lro = 0;
3585 				break;
3586 
3587 			case NDIS_HASH_TCP_IPV4:
3588 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3589 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3590 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3591 
3592 					if (is_vf)
3593 						def_htype = M_HASHTYPE_NONE;
3594 
3595 					/*
3596 					 * UDP 4-tuple hash is delivered as
3597 					 * TCP 4-tuple hash.
3598 					 */
3599 					if (l3proto == ETHERTYPE_MAX) {
3600 						hn_rxpkt_proto(m_new,
3601 						    &l3proto, &l4proto);
3602 					}
3603 					if (l3proto == ETHERTYPE_IP) {
3604 						if (l4proto == IPPROTO_UDP &&
3605 						    (rxr->hn_mbuf_hash &
3606 						     NDIS_HASH_UDP_IPV4_X)) {
3607 							hash_type =
3608 							M_HASHTYPE_RSS_UDP_IPV4;
3609 							do_lro = 0;
3610 						} else if (l4proto !=
3611 						    IPPROTO_TCP) {
3612 							hash_type = def_htype;
3613 							do_lro = 0;
3614 						}
3615 					} else {
3616 						hash_type = def_htype;
3617 						do_lro = 0;
3618 					}
3619 				}
3620 				break;
3621 
3622 			case NDIS_HASH_IPV6:
3623 				hash_type = M_HASHTYPE_RSS_IPV6;
3624 				do_lro = 0;
3625 				break;
3626 
3627 			case NDIS_HASH_IPV6_EX:
3628 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3629 				do_lro = 0;
3630 				break;
3631 
3632 			case NDIS_HASH_TCP_IPV6:
3633 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3634 				break;
3635 
3636 			case NDIS_HASH_TCP_IPV6_EX:
3637 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3638 				break;
3639 			}
3640 		}
3641 	} else if (!is_vf) {
3642 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3643 		hash_type = M_HASHTYPE_OPAQUE;
3644 	}
3645 	M_HASHTYPE_SET(m_new, hash_type);
3646 
3647 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3648 	if (hn_ifp != ifp) {
3649 		const struct ether_header *eh;
3650 
3651 		/*
3652 		 * Non-transparent mode VF is activated.
3653 		 */
3654 
3655 		/*
3656 		 * Allow tapping on hn(4).
3657 		 */
3658 		ETHER_BPF_MTAP(hn_ifp, m_new);
3659 
3660 		/*
3661 		 * Update hn(4)'s stats.
3662 		 */
3663 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3664 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3665 		/* Checked at the beginning of this function. */
3666 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3667 		eh = mtod(m_new, struct ether_header *);
3668 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3669 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3670 	}
3671 	rxr->hn_pkts++;
3672 
3673 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3674 #if defined(INET) || defined(INET6)
3675 		struct lro_ctrl *lro = &rxr->hn_lro;
3676 
3677 		if (lro->lro_cnt) {
3678 			rxr->hn_lro_tried++;
3679 			if (hn_lro_rx(lro, m_new) == 0) {
3680 				/* DONE! */
3681 				return 0;
3682 			}
3683 		}
3684 #endif
3685 	}
3686 	ifp->if_input(ifp, m_new);
3687 
3688 	return (0);
3689 }
3690 
3691 static int
3692 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3693 {
3694 	struct hn_softc *sc = ifp->if_softc;
3695 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3696 	struct ifnet *vf_ifp;
3697 	int mask, error = 0;
3698 	struct ifrsskey *ifrk;
3699 	struct ifrsshash *ifrh;
3700 	uint32_t mtu;
3701 
3702 	switch (cmd) {
3703 	case SIOCSIFMTU:
3704 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3705 			error = EINVAL;
3706 			break;
3707 		}
3708 
3709 		HN_LOCK(sc);
3710 
3711 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3712 			HN_UNLOCK(sc);
3713 			break;
3714 		}
3715 
3716 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3717 			/* Can't change MTU */
3718 			HN_UNLOCK(sc);
3719 			error = EOPNOTSUPP;
3720 			break;
3721 		}
3722 
3723 		if (ifp->if_mtu == ifr->ifr_mtu) {
3724 			HN_UNLOCK(sc);
3725 			break;
3726 		}
3727 
3728 		if (hn_xpnt_vf_isready(sc)) {
3729 			vf_ifp = sc->hn_vf_ifp;
3730 			ifr_vf = *ifr;
3731 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3732 			    sizeof(ifr_vf.ifr_name));
3733 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3734 			    (caddr_t)&ifr_vf);
3735 			if (error) {
3736 				HN_UNLOCK(sc);
3737 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3738 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3739 				break;
3740 			}
3741 		}
3742 
3743 		/*
3744 		 * Suspend this interface before the synthetic parts
3745 		 * are torn down.
3746 		 */
3747 		hn_suspend(sc);
3748 
3749 		/*
3750 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3751 		 */
3752 		hn_synth_detach(sc);
3753 
3754 		/*
3755 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3756 		 * with the new MTU setting.
3757 		 */
3758 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3759 		if (error) {
3760 			HN_UNLOCK(sc);
3761 			break;
3762 		}
3763 
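		/*
		 * Query the MTU reported by RNDIS; fall back to the
		 * requested MTU if the query fails.
		 */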
3764 		error = hn_rndis_get_mtu(sc, &mtu);
3765 		if (error)
3766 			mtu = ifr->ifr_mtu;
3767 		else if (bootverbose)
3768 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3769 
3770 		/*
3771 		 * Commit the requested MTU, after the synthetic parts
3772 		 * have been successfully attached.
3773 		 */
3774 		if (mtu >= ifr->ifr_mtu) {
3775 			mtu = ifr->ifr_mtu;
3776 		} else {
3777 			if_printf(ifp, "fixup mtu %d -> %u\n",
3778 			    ifr->ifr_mtu, mtu);
3779 		}
3780 		ifp->if_mtu = mtu;
3781 
3782 		/*
3783 		 * Synthetic parts' reattach may change the chimney
3784 		 * sending size; update it.
3785 		 */
3786 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3787 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3788 
3789 		/*
3790 		 * Make sure that various parameters based on MTU are
3791 		 * still valid, after the MTU change.
3792 		 */
3793 		hn_mtu_change_fixup(sc);
3794 
3795 		/*
3796 		 * All done!  Resume the interface now.
3797 		 */
3798 		hn_resume(sc);
3799 
3800 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3801 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3802 			/*
3803 			 * Since we have reattached the NVS part,
3804 			 * change the datapath to VF again, in case the
3805 			 * setting was lost when the NVS was detached.
3806 			 */
3807 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3808 		}
3809 
3810 		HN_UNLOCK(sc);
3811 		break;
3812 
3813 	case SIOCSIFFLAGS:
3814 		HN_LOCK(sc);
3815 
3816 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3817 			HN_UNLOCK(sc);
3818 			break;
3819 		}
3820 
3821 		if (hn_xpnt_vf_isready(sc))
3822 			hn_xpnt_vf_saveifflags(sc);
3823 
3824 		if (ifp->if_flags & IFF_UP) {
3825 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3826 				/*
3827 				 * Caller might hold a mutex, e.g.
3828 				 * bpf; use busy-wait for the RNDIS
3829 				 * reply.
3830 				 */
3831 				HN_NO_SLEEPING(sc);
3832 				hn_rxfilter_config(sc);
3833 				HN_SLEEPING_OK(sc);
3834 
3835 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3836 					error = hn_xpnt_vf_iocsetflags(sc);
3837 			} else {
3838 				hn_init_locked(sc);
3839 			}
3840 		} else {
3841 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3842 				hn_stop(sc, false);
3843 		}
3844 		sc->hn_if_flags = ifp->if_flags;
3845 
3846 		HN_UNLOCK(sc);
3847 		break;
3848 
3849 	case SIOCSIFCAP:
3850 		HN_LOCK(sc);
3851 
3852 		if (hn_xpnt_vf_isready(sc)) {
3853 			ifr_vf = *ifr;
3854 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3855 			    sizeof(ifr_vf.ifr_name));
3856 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3857 			HN_UNLOCK(sc);
3858 			break;
3859 		}
3860 
3861 		/*
3862 		 * Fix up requested capabilities w/ supported capabilities,
3863 		 * since the supported capabilities could have been changed.
3864 		 */
3865 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3866 		    ifp->if_capenable;
3867 
3868 		if (mask & IFCAP_TXCSUM) {
3869 			ifp->if_capenable ^= IFCAP_TXCSUM;
3870 			if (ifp->if_capenable & IFCAP_TXCSUM)
3871 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3872 			else
3873 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3874 		}
3875 		if (mask & IFCAP_TXCSUM_IPV6) {
3876 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3877 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3878 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3879 			else
3880 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3881 		}
3882 
3883 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3884 		if (mask & IFCAP_RXCSUM)
3885 			ifp->if_capenable ^= IFCAP_RXCSUM;
3886 #ifdef foo
3887 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3888 		if (mask & IFCAP_RXCSUM_IPV6)
3889 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3890 #endif
3891 
3892 		if (mask & IFCAP_LRO)
3893 			ifp->if_capenable ^= IFCAP_LRO;
3894 
3895 		if (mask & IFCAP_TSO4) {
3896 			ifp->if_capenable ^= IFCAP_TSO4;
3897 			if (ifp->if_capenable & IFCAP_TSO4)
3898 				ifp->if_hwassist |= CSUM_IP_TSO;
3899 			else
3900 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3901 		}
3902 		if (mask & IFCAP_TSO6) {
3903 			ifp->if_capenable ^= IFCAP_TSO6;
3904 			if (ifp->if_capenable & IFCAP_TSO6)
3905 				ifp->if_hwassist |= CSUM_IP6_TSO;
3906 			else
3907 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3908 		}
3909 
3910 		HN_UNLOCK(sc);
3911 		break;
3912 
3913 	case SIOCADDMULTI:
3914 	case SIOCDELMULTI:
3915 		HN_LOCK(sc);
3916 
3917 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3918 			HN_UNLOCK(sc);
3919 			break;
3920 		}
3921 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3922 			/*
3923 			 * The multicast update path may hold a mutex;
3924 			 * use busy-wait for the RNDIS reply.
3925 			 */
3926 			HN_NO_SLEEPING(sc);
3927 			hn_rxfilter_config(sc);
3928 			HN_SLEEPING_OK(sc);
3929 		}
3930 
3931 		/* XXX vlan(4) style mcast addr maintenance */
3932 		if (hn_xpnt_vf_isready(sc)) {
3933 			int old_if_flags;
3934 
3935 			old_if_flags = sc->hn_vf_ifp->if_flags;
3936 			hn_xpnt_vf_saveifflags(sc);
3937 
3938 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3939 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3940 			     IFF_ALLMULTI))
3941 				error = hn_xpnt_vf_iocsetflags(sc);
3942 		}
3943 
3944 		HN_UNLOCK(sc);
3945 		break;
3946 
3947 	case SIOCSIFMEDIA:
3948 	case SIOCGIFMEDIA:
3949 		HN_LOCK(sc);
3950 		if (hn_xpnt_vf_isready(sc)) {
3951 			/*
3952 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3953 			 * create and pass ifr_vf to the VF here; just
3954 			 * replace the ifr_name.
3955 			 */
3956 			vf_ifp = sc->hn_vf_ifp;
3957 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3958 			    sizeof(ifr->ifr_name));
3959 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3960 			/* Restore the ifr_name. */
3961 			strlcpy(ifr->ifr_name, ifp->if_xname,
3962 			    sizeof(ifr->ifr_name));
3963 			HN_UNLOCK(sc);
3964 			break;
3965 		}
3966 		HN_UNLOCK(sc);
3967 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3968 		break;
3969 
3970 	case SIOCGIFRSSHASH:
3971 		ifrh = (struct ifrsshash *)data;
3972 		HN_LOCK(sc);
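		/*
		 * Single RX ring, i.e. not RSS capable: report no hash
		 * function and no hash types.
		 */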
3973 		if (sc->hn_rx_ring_inuse == 1) {
3974 			HN_UNLOCK(sc);
3975 			ifrh->ifrh_func = RSS_FUNC_NONE;
3976 			ifrh->ifrh_types = 0;
3977 			break;
3978 		}
3979 
3980 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3981 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3982 		else
3983 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3984 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3985 		HN_UNLOCK(sc);
3986 		break;
3987 
3988 	case SIOCGIFRSSKEY:
3989 		ifrk = (struct ifrsskey *)data;
3990 		HN_LOCK(sc);
3991 		if (sc->hn_rx_ring_inuse == 1) {
3992 			HN_UNLOCK(sc);
3993 			ifrk->ifrk_func = RSS_FUNC_NONE;
3994 			ifrk->ifrk_keylen = 0;
3995 			break;
3996 		}
3997 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3998 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3999 		else
4000 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4001 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4002 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4003 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4004 		HN_UNLOCK(sc);
4005 		break;
4006 
4007 	default:
4008 		error = ether_ioctl(ifp, cmd, data);
4009 		break;
4010 	}
4011 	return (error);
4012 }
4013 
4014 static void
4015 hn_stop(struct hn_softc *sc, bool detaching)
4016 {
4017 	struct ifnet *ifp = sc->hn_ifp;
4018 	int i;
4019 
4020 	HN_LOCK_ASSERT(sc);
4021 
4022 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4023 	    ("synthetic parts were not attached"));
4024 
4025 	/* Clear RUNNING bit ASAP. */
4026 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4027 
4028 	/* Disable polling. */
4029 	hn_polling(sc, 0);
4030 
4031 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4032 		KASSERT(sc->hn_vf_ifp != NULL,
4033 		    ("%s: VF is not attached", ifp->if_xname));
4034 
4035 		/* Mark transparent mode VF as disabled. */
4036 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4037 
4038 		/*
4039 		 * NOTE:
4040 		 * Datapath setting must happen _before_ bringing
4041 		 * the VF down.
4042 		 */
4043 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4044 
4045 		/*
4046 		 * Bring the VF down.
4047 		 */
4048 		hn_xpnt_vf_saveifflags(sc);
4049 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4050 		hn_xpnt_vf_iocsetflags(sc);
4051 	}
4052 
4053 	/* Suspend data transfers. */
4054 	hn_suspend_data(sc);
4055 
4056 	/* Clear OACTIVE bit. */
4057 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4058 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4059 		sc->hn_tx_ring[i].hn_oactive = 0;
4060 
4061 	/*
4062 	 * If the non-transparent mode VF is active, make sure
4063 	 * that the RX filter still allows packet reception.
4064 	 */
4065 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4066 		hn_rxfilter_config(sc);
4067 }
4068 
4069 static void
4070 hn_init_locked(struct hn_softc *sc)
4071 {
4072 	struct ifnet *ifp = sc->hn_ifp;
4073 	int i;
4074 
4075 	HN_LOCK_ASSERT(sc);
4076 
4077 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4078 		return;
4079 
4080 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4081 		return;
4082 
4083 	/* Configure RX filter */
4084 	hn_rxfilter_config(sc);
4085 
4086 	/* Clear OACTIVE bit. */
4087 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4088 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4089 		sc->hn_tx_ring[i].hn_oactive = 0;
4090 
4091 	/* Clear TX 'suspended' bit. */
4092 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4093 
4094 	if (hn_xpnt_vf_isready(sc)) {
4095 		/* Initialize transparent VF. */
4096 		hn_xpnt_vf_init(sc);
4097 	}
4098 
4099 	/* Everything is ready; unleash! */
4100 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4101 
4102 	/* Re-enable polling if requested. */
4103 	if (sc->hn_pollhz > 0)
4104 		hn_polling(sc, sc->hn_pollhz);
4105 }
4106 
4107 static void
4108 hn_init(void *xsc)
4109 {
4110 	struct hn_softc *sc = xsc;
4111 
4112 	HN_LOCK(sc);
4113 	hn_init_locked(sc);
4114 	HN_UNLOCK(sc);
4115 }
4116 
4117 #if __FreeBSD_version >= 1100099
4118 
4119 static int
4120 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4121 {
4122 	struct hn_softc *sc = arg1;
4123 	unsigned int lenlim;
4124 	int error;
4125 
4126 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4127 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4128 	if (error || req->newptr == NULL)
4129 		return error;
4130 
4131 	HN_LOCK(sc);
4132 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4133 	    lenlim > TCP_LRO_LENGTH_MAX) {
4134 		HN_UNLOCK(sc);
4135 		return EINVAL;
4136 	}
4137 	hn_set_lro_lenlim(sc, lenlim);
4138 	HN_UNLOCK(sc);
4139 
4140 	return 0;
4141 }
4142 
4143 static int
4144 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4145 {
4146 	struct hn_softc *sc = arg1;
4147 	int ackcnt, error, i;
4148 
4149 	/*
4150 	 * lro_ackcnt_lim is the append count limit;
4151 	 * +1 turns it into the aggregation limit.
4152 	 */
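	/* E.g. lro_ackcnt_lim == 1 is reported as an aggregation limit of 2. */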
4153 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4154 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4155 	if (error || req->newptr == NULL)
4156 		return error;
4157 
4158 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4159 		return EINVAL;
4160 
4161 	/*
4162 	 * Convert aggregation limit back to append
4163 	 * count limit.
4164 	 */
4165 	--ackcnt;
4166 	HN_LOCK(sc);
4167 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4168 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4169 	HN_UNLOCK(sc);
4170 	return 0;
4171 }
4172 
4173 #endif
4174 
4175 static int
4176 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4177 {
4178 	struct hn_softc *sc = arg1;
4179 	int hcsum = arg2;
4180 	int on, error, i;
4181 
4182 	on = 0;
4183 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4184 		on = 1;
4185 
4186 	error = sysctl_handle_int(oidp, &on, 0, req);
4187 	if (error || req->newptr == NULL)
4188 		return error;
4189 
4190 	HN_LOCK(sc);
4191 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4192 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4193 
4194 		if (on)
4195 			rxr->hn_trust_hcsum |= hcsum;
4196 		else
4197 			rxr->hn_trust_hcsum &= ~hcsum;
4198 	}
4199 	HN_UNLOCK(sc);
4200 	return 0;
4201 }
4202 
4203 static int
4204 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4205 {
4206 	struct hn_softc *sc = arg1;
4207 	int chim_size, error;
4208 
4209 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4210 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4211 	if (error || req->newptr == NULL)
4212 		return error;
4213 
4214 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4215 		return EINVAL;
4216 
4217 	HN_LOCK(sc);
4218 	hn_set_chim_size(sc, chim_size);
4219 	HN_UNLOCK(sc);
4220 	return 0;
4221 }
4222 
4223 #if __FreeBSD_version < 1100095
4224 static int
4225 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4226 {
4227 	struct hn_softc *sc = arg1;
4228 	int ofs = arg2, i, error;
4229 	struct hn_rx_ring *rxr;
4230 	uint64_t stat;
4231 
4232 	stat = 0;
4233 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4234 		rxr = &sc->hn_rx_ring[i];
4235 		stat += *((int *)((uint8_t *)rxr + ofs));
4236 	}
4237 
4238 	error = sysctl_handle_64(oidp, &stat, 0, req);
4239 	if (error || req->newptr == NULL)
4240 		return error;
4241 
4242 	/* Zero out this stat. */
4243 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4244 		rxr = &sc->hn_rx_ring[i];
4245 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4246 	}
4247 	return 0;
4248 }
4249 #else
4250 static int
4251 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4252 {
4253 	struct hn_softc *sc = arg1;
4254 	int ofs = arg2, i, error;
4255 	struct hn_rx_ring *rxr;
4256 	uint64_t stat;
4257 
4258 	stat = 0;
4259 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4260 		rxr = &sc->hn_rx_ring[i];
4261 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4262 	}
4263 
4264 	error = sysctl_handle_64(oidp, &stat, 0, req);
4265 	if (error || req->newptr == NULL)
4266 		return error;
4267 
4268 	/* Zero out this stat. */
4269 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4270 		rxr = &sc->hn_rx_ring[i];
4271 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4272 	}
4273 	return 0;
4274 }
4275 
4276 #endif
4277 
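/*
 * Report the sum of a per-RX-ring u_long statistic over all RX rings;
 * writing to the sysctl clears the per-ring counters.
 */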
4278 static int
4279 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4280 {
4281 	struct hn_softc *sc = arg1;
4282 	int ofs = arg2, i, error;
4283 	struct hn_rx_ring *rxr;
4284 	u_long stat;
4285 
4286 	stat = 0;
4287 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4288 		rxr = &sc->hn_rx_ring[i];
4289 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4290 	}
4291 
4292 	error = sysctl_handle_long(oidp, &stat, 0, req);
4293 	if (error || req->newptr == NULL)
4294 		return error;
4295 
4296 	/* Zero out this stat. */
4297 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4298 		rxr = &sc->hn_rx_ring[i];
4299 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4300 	}
4301 	return 0;
4302 }
4303 
4304 static int
4305 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4306 {
4307 	struct hn_softc *sc = arg1;
4308 	int ofs = arg2, i, error;
4309 	struct hn_tx_ring *txr;
4310 	u_long stat;
4311 
4312 	stat = 0;
4313 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4314 		txr = &sc->hn_tx_ring[i];
4315 		stat += *((u_long *)((uint8_t *)txr + ofs));
4316 	}
4317 
4318 	error = sysctl_handle_long(oidp, &stat, 0, req);
4319 	if (error || req->newptr == NULL)
4320 		return error;
4321 
4322 	/* Zero out this stat. */
4323 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4324 		txr = &sc->hn_tx_ring[i];
4325 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4326 	}
4327 	return 0;
4328 }
4329 
4330 static int
4331 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4332 {
4333 	struct hn_softc *sc = arg1;
4334 	int ofs = arg2, i, error, conf;
4335 	struct hn_tx_ring *txr;
4336 
4337 	txr = &sc->hn_tx_ring[0];
4338 	conf = *((int *)((uint8_t *)txr + ofs));
4339 
4340 	error = sysctl_handle_int(oidp, &conf, 0, req);
4341 	if (error || req->newptr == NULL)
4342 		return error;
4343 
4344 	HN_LOCK(sc);
4345 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4346 		txr = &sc->hn_tx_ring[i];
4347 		*((int *)((uint8_t *)txr + ofs)) = conf;
4348 	}
4349 	HN_UNLOCK(sc);
4350 
4351 	return 0;
4352 }
4353 
4354 static int
4355 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4356 {
4357 	struct hn_softc *sc = arg1;
4358 	int error, size;
4359 
4360 	size = sc->hn_agg_size;
4361 	error = sysctl_handle_int(oidp, &size, 0, req);
4362 	if (error || req->newptr == NULL)
4363 		return (error);
4364 
4365 	HN_LOCK(sc);
4366 	sc->hn_agg_size = size;
4367 	hn_set_txagg(sc);
4368 	HN_UNLOCK(sc);
4369 
4370 	return (0);
4371 }
4372 
4373 static int
4374 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4375 {
4376 	struct hn_softc *sc = arg1;
4377 	int error, pkts;
4378 
4379 	pkts = sc->hn_agg_pkts;
4380 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4381 	if (error || req->newptr == NULL)
4382 		return (error);
4383 
4384 	HN_LOCK(sc);
4385 	sc->hn_agg_pkts = pkts;
4386 	hn_set_txagg(sc);
4387 	HN_UNLOCK(sc);
4388 
4389 	return (0);
4390 }
4391 
4392 static int
4393 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4394 {
4395 	struct hn_softc *sc = arg1;
4396 	int pkts;
4397 
4398 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4399 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4400 }
4401 
4402 static int
4403 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4404 {
4405 	struct hn_softc *sc = arg1;
4406 	int align;
4407 
4408 	align = sc->hn_tx_ring[0].hn_agg_align;
4409 	return (sysctl_handle_int(oidp, &align, 0, req));
4410 }
4411 
4412 static void
4413 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4414 {
4415 	if (pollhz == 0)
4416 		vmbus_chan_poll_disable(chan);
4417 	else
4418 		vmbus_chan_poll_enable(chan, pollhz);
4419 }
4420 
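/*
 * Apply the polling rate (0 disables polling) to the primary channel
 * and all sub-channels currently in use.
 */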
4421 static void
4422 hn_polling(struct hn_softc *sc, u_int pollhz)
4423 {
4424 	int nsubch = sc->hn_rx_ring_inuse - 1;
4425 
4426 	HN_LOCK_ASSERT(sc);
4427 
4428 	if (nsubch > 0) {
4429 		struct vmbus_channel **subch;
4430 		int i;
4431 
4432 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4433 		for (i = 0; i < nsubch; ++i)
4434 			hn_chan_polling(subch[i], pollhz);
4435 		vmbus_subchan_rel(subch, nsubch);
4436 	}
4437 	hn_chan_polling(sc->hn_prichan, pollhz);
4438 }
4439 
4440 static int
4441 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 	struct hn_softc *sc = arg1;
4444 	int pollhz, error;
4445 
4446 	pollhz = sc->hn_pollhz;
4447 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4448 	if (error || req->newptr == NULL)
4449 		return (error);
4450 
4451 	if (pollhz != 0 &&
4452 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4453 		return (EINVAL);
4454 
4455 	HN_LOCK(sc);
4456 	if (sc->hn_pollhz != pollhz) {
4457 		sc->hn_pollhz = pollhz;
4458 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4459 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4460 			hn_polling(sc, sc->hn_pollhz);
4461 	}
4462 	HN_UNLOCK(sc);
4463 
4464 	return (0);
4465 }
4466 
4467 static int
4468 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4469 {
4470 	struct hn_softc *sc = arg1;
4471 	char verstr[16];
4472 
4473 	snprintf(verstr, sizeof(verstr), "%u.%u",
4474 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4475 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4476 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4477 }
4478 
4479 static int
4480 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4481 {
4482 	struct hn_softc *sc = arg1;
4483 	char caps_str[128];
4484 	uint32_t caps;
4485 
4486 	HN_LOCK(sc);
4487 	caps = sc->hn_caps;
4488 	HN_UNLOCK(sc);
4489 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4490 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4491 }
4492 
4493 static int
4494 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4495 {
4496 	struct hn_softc *sc = arg1;
4497 	char assist_str[128];
4498 	uint32_t hwassist;
4499 
4500 	HN_LOCK(sc);
4501 	hwassist = sc->hn_ifp->if_hwassist;
4502 	HN_UNLOCK(sc);
4503 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4504 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4505 }
4506 
4507 static int
4508 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4509 {
4510 	struct hn_softc *sc = arg1;
4511 	char filter_str[128];
4512 	uint32_t filter;
4513 
4514 	HN_LOCK(sc);
4515 	filter = sc->hn_rx_filter;
4516 	HN_UNLOCK(sc);
4517 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4518 	    NDIS_PACKET_TYPES);
4519 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4520 }
4521 
4522 #ifndef RSS
4523 
4524 static int
4525 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4526 {
4527 	struct hn_softc *sc = arg1;
4528 	int error;
4529 
4530 	HN_LOCK(sc);
4531 
4532 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4533 	if (error || req->newptr == NULL)
4534 		goto back;
4535 
4536 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4537 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4538 		/*
4539 		 * RSS key is synchronized w/ the VF's; don't allow users
4540 		 * to change it.
4541 		 */
4542 		error = EBUSY;
4543 		goto back;
4544 	}
4545 
4546 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4547 	if (error)
4548 		goto back;
4549 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4550 
4551 	if (sc->hn_rx_ring_inuse > 1) {
4552 		error = hn_rss_reconfig(sc);
4553 	} else {
4554 		/* Not RSS capable, at least for now; just save the RSS key. */
4555 		error = 0;
4556 	}
4557 back:
4558 	HN_UNLOCK(sc);
4559 	return (error);
4560 }
4561 
4562 static int
4563 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4564 {
4565 	struct hn_softc *sc = arg1;
4566 	int error;
4567 
4568 	HN_LOCK(sc);
4569 
4570 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4571 	if (error || req->newptr == NULL)
4572 		goto back;
4573 
4574 	/*
4575 	 * Don't allow the RSS indirect table to be changed, if this
4576 	 * interface is not currently RSS capable.
4577 	 */
4578 	if (sc->hn_rx_ring_inuse == 1) {
4579 		error = EOPNOTSUPP;
4580 		goto back;
4581 	}
4582 
4583 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4584 	if (error)
4585 		goto back;
4586 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4587 
4588 	hn_rss_ind_fixup(sc);
4589 	error = hn_rss_reconfig(sc);
4590 back:
4591 	HN_UNLOCK(sc);
4592 	return (error);
4593 }
4594 
4595 #endif	/* !RSS */
4596 
4597 static int
4598 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4599 {
4600 	struct hn_softc *sc = arg1;
4601 	char hash_str[128];
4602 	uint32_t hash;
4603 
4604 	HN_LOCK(sc);
4605 	hash = sc->hn_rss_hash;
4606 	HN_UNLOCK(sc);
4607 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4608 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4609 }
4610 
4611 static int
4612 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4613 {
4614 	struct hn_softc *sc = arg1;
4615 	char hash_str[128];
4616 	uint32_t hash;
4617 
4618 	HN_LOCK(sc);
4619 	hash = sc->hn_rss_hcap;
4620 	HN_UNLOCK(sc);
4621 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4622 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4623 }
4624 
4625 static int
4626 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4627 {
4628 	struct hn_softc *sc = arg1;
4629 	char hash_str[128];
4630 	uint32_t hash;
4631 
4632 	HN_LOCK(sc);
4633 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4634 	HN_UNLOCK(sc);
4635 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4636 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4637 }
4638 
4639 static int
4640 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4641 {
4642 	struct hn_softc *sc = arg1;
4643 	char vf_name[IFNAMSIZ + 1];
4644 	struct ifnet *vf_ifp;
4645 
4646 	HN_LOCK(sc);
4647 	vf_name[0] = '\0';
4648 	vf_ifp = sc->hn_vf_ifp;
4649 	if (vf_ifp != NULL)
4650 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4651 	HN_UNLOCK(sc);
4652 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4653 }
4654 
4655 static int
4656 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4657 {
4658 	struct hn_softc *sc = arg1;
4659 	char vf_name[IFNAMSIZ + 1];
4660 	struct ifnet *vf_ifp;
4661 
4662 	HN_LOCK(sc);
4663 	vf_name[0] = '\0';
4664 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4665 	if (vf_ifp != NULL)
4666 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4667 	HN_UNLOCK(sc);
4668 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4669 }
4670 
4671 static int
4672 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4673 {
4674 	struct rm_priotracker pt;
4675 	struct sbuf *sb;
4676 	int error, i;
4677 	bool first;
4678 
4679 	error = sysctl_wire_old_buffer(req, 0);
4680 	if (error != 0)
4681 		return (error);
4682 
4683 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4684 	if (sb == NULL)
4685 		return (ENOMEM);
4686 
4687 	rm_rlock(&hn_vfmap_lock, &pt);
4688 
4689 	first = true;
4690 	for (i = 0; i < hn_vfmap_size; ++i) {
4691 		struct ifnet *ifp;
4692 
4693 		if (hn_vfmap[i] == NULL)
4694 			continue;
4695 
4696 		ifp = ifnet_byindex(i);
4697 		if (ifp != NULL) {
4698 			if (first)
4699 				sbuf_printf(sb, "%s", ifp->if_xname);
4700 			else
4701 				sbuf_printf(sb, " %s", ifp->if_xname);
4702 			first = false;
4703 		}
4704 	}
4705 
4706 	rm_runlock(&hn_vfmap_lock, &pt);
4707 
4708 	error = sbuf_finish(sb);
4709 	sbuf_delete(sb);
4710 	return (error);
4711 }
4712 
4713 static int
4714 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4715 {
4716 	struct rm_priotracker pt;
4717 	struct sbuf *sb;
4718 	int error, i;
4719 	bool first;
4720 
4721 	error = sysctl_wire_old_buffer(req, 0);
4722 	if (error != 0)
4723 		return (error);
4724 
4725 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4726 	if (sb == NULL)
4727 		return (ENOMEM);
4728 
4729 	rm_rlock(&hn_vfmap_lock, &pt);
4730 
4731 	first = true;
4732 	for (i = 0; i < hn_vfmap_size; ++i) {
4733 		struct ifnet *ifp, *hn_ifp;
4734 
4735 		hn_ifp = hn_vfmap[i];
4736 		if (hn_ifp == NULL)
4737 			continue;
4738 
4739 		ifp = ifnet_byindex(i);
4740 		if (ifp != NULL) {
4741 			if (first) {
4742 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4743 				    hn_ifp->if_xname);
4744 			} else {
4745 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4746 				    hn_ifp->if_xname);
4747 			}
4748 			first = false;
4749 		}
4750 	}
4751 
4752 	rm_runlock(&hn_vfmap_lock, &pt);
4753 
4754 	error = sbuf_finish(sb);
4755 	sbuf_delete(sb);
4756 	return (error);
4757 }
4758 
4759 static int
4760 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4761 {
4762 	struct hn_softc *sc = arg1;
4763 	int error, onoff = 0;
4764 
4765 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4766 		onoff = 1;
4767 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4768 	if (error || req->newptr == NULL)
4769 		return (error);
4770 
4771 	HN_LOCK(sc);
4772 	/* NOTE: hn_vf_lock for hn_transmit() */
4773 	rm_wlock(&sc->hn_vf_lock);
4774 	if (onoff)
4775 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4776 	else
4777 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4778 	rm_wunlock(&sc->hn_vf_lock);
4779 	HN_UNLOCK(sc);
4780 
4781 	return (0);
4782 }
4783 
4784 static int
4785 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4786 {
4787 	struct hn_softc *sc = arg1;
4788 	int enabled = 0;
4789 
4790 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4791 		enabled = 1;
4792 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4793 }
4794 
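/*
 * Sanity check an IPv4 packet starting at byte offset 'hoff' of the
 * first mbuf.  Returns the IP protocol if the IP header (and, for
 * TCP/UDP, the L4 header) is complete and the packet is not a
 * fragment; returns IPPROTO_DONE otherwise.
 */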
4795 static int
4796 hn_check_iplen(const struct mbuf *m, int hoff)
4797 {
4798 	const struct ip *ip;
4799 	int len, iphlen, iplen;
4800 	const struct tcphdr *th;
4801 	int thoff;				/* TCP data offset */
4802 
4803 	len = hoff + sizeof(struct ip);
4804 
4805 	/* The packet must be at least the size of an IP header. */
4806 	if (m->m_pkthdr.len < len)
4807 		return IPPROTO_DONE;
4808 
4809 	/* The fixed IP header must reside completely in the first mbuf. */
4810 	if (m->m_len < len)
4811 		return IPPROTO_DONE;
4812 
4813 	ip = mtodo(m, hoff);
4814 
4815 	/* Bound check the packet's stated IP header length. */
4816 	iphlen = ip->ip_hl << 2;
4817 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4818 		return IPPROTO_DONE;
4819 
4820 	/* The full IP header must reside completely in the one mbuf. */
4821 	if (m->m_len < hoff + iphlen)
4822 		return IPPROTO_DONE;
4823 
4824 	iplen = ntohs(ip->ip_len);
4825 
4826 	/*
4827 	 * Check that the amount of data in the buffers is at
4828 	 * least as much as the IP header would have us expect.
4829 	 */
4830 	if (m->m_pkthdr.len < hoff + iplen)
4831 		return IPPROTO_DONE;
4832 
4833 	/*
4834 	 * Ignore IP fragments.
4835 	 */
4836 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4837 		return IPPROTO_DONE;
4838 
4839 	/*
4840 	 * The TCP/IP or UDP/IP header must be entirely contained within
4841 	 * the first fragment of a packet.
4842 	 */
4843 	switch (ip->ip_p) {
4844 	case IPPROTO_TCP:
4845 		if (iplen < iphlen + sizeof(struct tcphdr))
4846 			return IPPROTO_DONE;
4847 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4848 			return IPPROTO_DONE;
4849 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4850 		thoff = th->th_off << 2;
4851 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4852 			return IPPROTO_DONE;
4853 		if (m->m_len < hoff + iphlen + thoff)
4854 			return IPPROTO_DONE;
4855 		break;
4856 	case IPPROTO_UDP:
4857 		if (iplen < iphlen + sizeof(struct udphdr))
4858 			return IPPROTO_DONE;
4859 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4860 			return IPPROTO_DONE;
4861 		break;
4862 	default:
4863 		if (iplen < iphlen)
4864 			return IPPROTO_DONE;
4865 		break;
4866 	}
4867 	return ip->ip_p;
4868 }
4869 
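/*
 * Best-effort extraction of the L3 ethertype and L4 IP protocol of
 * the received frame.  NOTE: the outputs are left untouched, if the
 * VLAN header does not fit in the first mbuf.
 */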
4870 static void
4871 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4872 {
4873 	const struct ether_header *eh;
4874 	uint16_t etype;
4875 	int hoff;
4876 
4877 	hoff = sizeof(*eh);
4878 	/* Checked by the caller of this function. */
4879 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4880 
4881 	eh = mtod(m_new, const struct ether_header *);
4882 	etype = ntohs(eh->ether_type);
4883 	if (etype == ETHERTYPE_VLAN) {
4884 		const struct ether_vlan_header *evl;
4885 
4886 		hoff = sizeof(*evl);
4887 		if (m_new->m_len < hoff)
4888 			return;
4889 		evl = mtod(m_new, const struct ether_vlan_header *);
4890 		etype = ntohs(evl->evl_proto);
4891 	}
4892 	*l3proto = etype;
4893 
4894 	if (etype == ETHERTYPE_IP)
4895 		*l4proto = hn_check_iplen(m_new, hoff);
4896 	else
4897 		*l4proto = IPPROTO_DONE;
4898 }
4899 
4900 static int
4901 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4902 {
4903 	struct sysctl_oid_list *child;
4904 	struct sysctl_ctx_list *ctx;
4905 	device_t dev = sc->hn_dev;
4906 #if defined(INET) || defined(INET6)
4907 #if __FreeBSD_version >= 1100095
4908 	int lroent_cnt;
4909 #endif
4910 #endif
4911 	int i;
4912 
4913 	/*
4914 	 * Create RXBUF for reception.
4915 	 *
4916 	 * NOTE:
4917 	 * - It is shared by all channels.
4918 	 * - A large enough buffer is allocated; certain versions of the NVS
4919 	 *   may further limit the usable space.
4920 	 */
4921 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4922 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4923 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4924 	if (sc->hn_rxbuf == NULL) {
4925 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4926 		return (ENOMEM);
4927 	}
4928 
4929 	sc->hn_rx_ring_cnt = ring_cnt;
4930 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4931 
4932 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4933 	    M_DEVBUF, M_WAITOK | M_ZERO);
4934 
4935 #if defined(INET) || defined(INET6)
4936 #if __FreeBSD_version >= 1100095
4937 	lroent_cnt = hn_lro_entry_count;
4938 	if (lroent_cnt < TCP_LRO_ENTRIES)
4939 		lroent_cnt = TCP_LRO_ENTRIES;
4940 	if (bootverbose)
4941 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4942 #endif
4943 #endif	/* INET || INET6 */
4944 
4945 	ctx = device_get_sysctl_ctx(dev);
4946 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4947 
4948 	/* Create dev.hn.UNIT.rx sysctl tree */
4949 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4950 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4951 
4952 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4953 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4954 
4955 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4956 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4957 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4958 		if (rxr->hn_br == NULL) {
4959 			device_printf(dev, "allocate bufring failed\n");
4960 			return (ENOMEM);
4961 		}
4962 
4963 		if (hn_trust_hosttcp)
4964 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4965 		if (hn_trust_hostudp)
4966 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4967 		if (hn_trust_hostip)
4968 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4969 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4970 		rxr->hn_ifp = sc->hn_ifp;
4971 		if (i < sc->hn_tx_ring_cnt)
4972 			rxr->hn_txr = &sc->hn_tx_ring[i];
4973 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4974 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4975 		rxr->hn_rx_idx = i;
4976 		rxr->hn_rxbuf = sc->hn_rxbuf;
4977 
4978 		/*
4979 		 * Initialize LRO.
4980 		 */
4981 #if defined(INET) || defined(INET6)
4982 #if __FreeBSD_version >= 1100095
4983 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4984 		    hn_lro_mbufq_depth);
4985 #else
4986 		tcp_lro_init(&rxr->hn_lro);
4987 		rxr->hn_lro.ifp = sc->hn_ifp;
4988 #endif
4989 #if __FreeBSD_version >= 1100099
4990 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4991 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4992 #endif
4993 #endif	/* INET || INET6 */
4994 
4995 		if (sc->hn_rx_sysctl_tree != NULL) {
4996 			char name[16];
4997 
4998 			/*
4999 			 * Create per RX ring sysctl tree:
5000 			 * dev.hn.UNIT.rx.RINGID
5001 			 */
5002 			snprintf(name, sizeof(name), "%d", i);
5003 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5004 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5005 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5006 
5007 			if (rxr->hn_rx_sysctl_tree != NULL) {
5008 				SYSCTL_ADD_ULONG(ctx,
5009 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5010 				    OID_AUTO, "packets", CTLFLAG_RW,
5011 				    &rxr->hn_pkts, "# of packets received");
5012 				SYSCTL_ADD_ULONG(ctx,
5013 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5014 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5015 				    &rxr->hn_rss_pkts,
5016 				    "# of packets w/ RSS info received");
5017 				SYSCTL_ADD_INT(ctx,
5018 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5019 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5020 				    &rxr->hn_pktbuf_len, 0,
5021 				    "Temporary channel packet buffer length");
5022 			}
5023 		}
5024 	}
5025 
5026 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5027 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5028 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5029 #if __FreeBSD_version < 1100095
5030 	    hn_rx_stat_int_sysctl,
5031 #else
5032 	    hn_rx_stat_u64_sysctl,
5033 #endif
5034 	    "LU", "LRO queued");
5035 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5036 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5037 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5038 #if __FreeBSD_version < 1100095
5039 	    hn_rx_stat_int_sysctl,
5040 #else
5041 	    hn_rx_stat_u64_sysctl,
5042 #endif
5043 	    "LU", "LRO flushed");
5044 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5045 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5046 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5047 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5048 #if __FreeBSD_version >= 1100099
5049 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5050 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5051 	    hn_lro_lenlim_sysctl, "IU",
5052 	    "Max # of data bytes to be aggregated by LRO");
5053 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5054 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5055 	    hn_lro_ackcnt_sysctl, "I",
5056 	    "Max # of ACKs to be aggregated by LRO");
5057 #endif
5058 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5059 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5060 	    hn_trust_hcsum_sysctl, "I",
5061 	    "Trust tcp segment verification on host side, "
5062 	    "when csum info is missing");
5063 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5064 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5065 	    hn_trust_hcsum_sysctl, "I",
5066 	    "Trust udp datagram verification on host side, "
5067 	    "when csum info is missing");
5068 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5069 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5070 	    hn_trust_hcsum_sysctl, "I",
5071 	    "Trust ip packet verification on host side, "
5072 	    "when csum info is missing");
5073 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5074 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5075 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5076 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5077 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5078 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5079 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5080 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5081 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5082 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5083 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5084 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5085 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5086 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5087 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5088 	    hn_rx_stat_ulong_sysctl, "LU",
5089 	    "# of packets that we trust host's csum verification");
5090 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5091 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5092 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5093 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5094 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5095 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5096 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5097 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5098 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5099 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5100 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5101 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5102 
5103 	return (0);
5104 }
5105 
5106 static void
5107 hn_destroy_rx_data(struct hn_softc *sc)
5108 {
5109 	int i;
5110 
5111 	if (sc->hn_rxbuf != NULL) {
5112 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5113 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5114 		else
5115 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5116 		sc->hn_rxbuf = NULL;
5117 	}
5118 
5119 	if (sc->hn_rx_ring_cnt == 0)
5120 		return;
5121 
5122 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5123 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5124 
5125 		if (rxr->hn_br == NULL)
5126 			continue;
5127 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5128 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5129 		} else {
5130 			device_printf(sc->hn_dev,
5131 			    "%dth channel bufring is referenced\n", i);
5132 		}
5133 		rxr->hn_br = NULL;
5134 
5135 #if defined(INET) || defined(INET6)
5136 		tcp_lro_free(&rxr->hn_lro);
5137 #endif
5138 		free(rxr->hn_pktbuf, M_DEVBUF);
5139 	}
5140 	free(sc->hn_rx_ring, M_DEVBUF);
5141 	sc->hn_rx_ring = NULL;
5142 
5143 	sc->hn_rx_ring_cnt = 0;
5144 	sc->hn_rx_ring_inuse = 0;
5145 }
5146 
5147 static int
5148 hn_tx_ring_create(struct hn_softc *sc, int id)
5149 {
5150 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5151 	device_t dev = sc->hn_dev;
5152 	bus_dma_tag_t parent_dtag;
5153 	int error, i;
5154 
5155 	txr->hn_sc = sc;
5156 	txr->hn_tx_idx = id;
5157 
5158 #ifndef HN_USE_TXDESC_BUFRING
5159 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5160 #endif
5161 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5162 
5163 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5164 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5165 	    M_DEVBUF, M_WAITOK | M_ZERO);
5166 #ifndef HN_USE_TXDESC_BUFRING
5167 	SLIST_INIT(&txr->hn_txlist);
5168 #else
5169 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5170 	    M_WAITOK, &txr->hn_tx_lock);
5171 #endif
5172 
5173 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5174 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5175 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5176 	} else {
5177 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5178 	}
5179 
5180 #ifdef HN_IFSTART_SUPPORT
5181 	if (hn_use_if_start) {
5182 		txr->hn_txeof = hn_start_txeof;
5183 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5184 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5185 	} else
5186 #endif
5187 	{
5188 		int br_depth;
5189 
5190 		txr->hn_txeof = hn_xmit_txeof;
5191 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5192 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5193 
5194 		br_depth = hn_get_txswq_depth(txr);
5195 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5196 		    M_WAITOK, &txr->hn_tx_lock);
5197 	}
5198 
5199 	txr->hn_direct_tx_size = hn_direct_tx_size;
5200 
5201 	/*
5202 	 * Always schedule transmission instead of trying to do direct
5203 	 * transmission.  This one gives the best performance so far.
5204 	 */
5205 	txr->hn_sched_tx = 1;
5206 
5207 	parent_dtag = bus_get_dma_tag(dev);
5208 
5209 	/* DMA tag for RNDIS packet messages. */
5210 	error = bus_dma_tag_create(parent_dtag, /* parent */
5211 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5212 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5213 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5214 	    BUS_SPACE_MAXADDR,		/* highaddr */
5215 	    NULL, NULL,			/* filter, filterarg */
5216 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5217 	    1,				/* nsegments */
5218 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5219 	    0,				/* flags */
5220 	    NULL,			/* lockfunc */
5221 	    NULL,			/* lockfuncarg */
5222 	    &txr->hn_tx_rndis_dtag);
5223 	if (error) {
5224 		device_printf(dev, "failed to create rndis dmatag\n");
5225 		return error;
5226 	}
5227 
5228 	/* DMA tag for data. */
5229 	error = bus_dma_tag_create(parent_dtag, /* parent */
5230 	    1,				/* alignment */
5231 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5232 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5233 	    BUS_SPACE_MAXADDR,		/* highaddr */
5234 	    NULL, NULL,			/* filter, filterarg */
5235 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5236 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5237 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5238 	    0,				/* flags */
5239 	    NULL,			/* lockfunc */
5240 	    NULL,			/* lockfuncarg */
5241 	    &txr->hn_tx_data_dtag);
5242 	if (error) {
5243 		device_printf(dev, "failed to create data dmatag\n");
5244 		return error;
5245 	}
5246 
5247 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5248 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5249 
5250 		txd->txr = txr;
5251 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5252 		STAILQ_INIT(&txd->agg_list);
5253 
5254 		/*
5255 		 * Allocate and load RNDIS packet message.
5256 		 */
5257 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5258 		    (void **)&txd->rndis_pkt,
5259 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5260 		    &txd->rndis_pkt_dmap);
5261 		if (error) {
5262 			device_printf(dev,
5263 			    "failed to allocate rndis_packet_msg, %d\n", i);
5264 			return error;
5265 		}
5266 
5267 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5268 		    txd->rndis_pkt_dmap,
5269 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5270 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5271 		    BUS_DMA_NOWAIT);
5272 		if (error) {
5273 			device_printf(dev,
5274 			    "failed to load rndis_packet_msg, %d\n", i);
5275 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5276 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5277 			return error;
5278 		}
5279 
5280 		/* DMA map for TX data. */
5281 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5282 		    &txd->data_dmap);
5283 		if (error) {
5284 			device_printf(dev,
5285 			    "failed to allocate tx data dmamap\n");
5286 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5287 			    txd->rndis_pkt_dmap);
5288 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5289 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5290 			return error;
5291 		}
5292 
5293 		/* All set, put it to list */
5294 		txd->flags |= HN_TXD_FLAG_ONLIST;
5295 #ifndef HN_USE_TXDESC_BUFRING
5296 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5297 #else
5298 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5299 #endif
5300 	}
5301 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5302 
5303 	if (sc->hn_tx_sysctl_tree != NULL) {
5304 		struct sysctl_oid_list *child;
5305 		struct sysctl_ctx_list *ctx;
5306 		char name[16];
5307 
5308 		/*
5309 		 * Create per TX ring sysctl tree:
5310 		 * dev.hn.UNIT.tx.RINGID
5311 		 */
5312 		ctx = device_get_sysctl_ctx(dev);
5313 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5314 
5315 		snprintf(name, sizeof(name), "%d", id);
5316 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5317 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5318 
5319 		if (txr->hn_tx_sysctl_tree != NULL) {
5320 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5321 
5322 #ifdef HN_DEBUG
5323 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5324 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5325 			    "# of available TX descs");
5326 #endif
5327 #ifdef HN_IFSTART_SUPPORT
5328 			if (!hn_use_if_start)
5329 #endif
5330 			{
5331 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5332 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5333 				    "over active");
5334 			}
5335 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5336 			    CTLFLAG_RW, &txr->hn_pkts,
5337 			    "# of packets transmitted");
5338 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5339 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5340 		}
5341 	}
5342 
5343 	return 0;
5344 }
5345 
5346 static void
5347 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5348 {
5349 	struct hn_tx_ring *txr = txd->txr;
5350 
5351 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5352 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5353 
5354 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5355 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5356 	    txd->rndis_pkt_dmap);
5357 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5358 }
5359 
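/*
 * Reclaim a txdesc for which no send-done will be delivered, e.g.
 * because the channel has been revoked.  Txds on an aggregation list
 * are skipped; they are freed by their aggregating txd.
 */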
5360 static void
5361 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5362 {
5363 
5364 	KASSERT(txd->refs == 0 || txd->refs == 1,
5365 	    ("invalid txd refs %d", txd->refs));
5366 
5367 	/* Aggregated txds will be freed by their aggregating txd. */
5368 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5369 		int freed;
5370 
5371 		freed = hn_txdesc_put(txr, txd);
5372 		KASSERT(freed, ("can't free txdesc"));
5373 	}
5374 }
5375 
5376 static void
5377 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5378 {
5379 	int i;
5380 
5381 	if (txr->hn_txdesc == NULL)
5382 		return;
5383 
5384 	/*
5385 	 * NOTE:
5386 	 * Because the freeing of aggregated txds will be deferred
5387 	 * to the aggregating txd, two passes are used here:
5388 	 * - The first pass GCes any pending txds.  This GC is necessary,
5389 	 *   since if the channels are revoked, the hypervisor will not
5390 	 *   deliver send-done for all pending txds.
5391 	 * - The second pass frees the busdma resources, i.e. after all
5392 	 *   txds have been freed.
5393 	 */
5394 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5395 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5396 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5397 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5398 
5399 	if (txr->hn_tx_data_dtag != NULL)
5400 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5401 	if (txr->hn_tx_rndis_dtag != NULL)
5402 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5403 
5404 #ifdef HN_USE_TXDESC_BUFRING
5405 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5406 #endif
5407 
5408 	free(txr->hn_txdesc, M_DEVBUF);
5409 	txr->hn_txdesc = NULL;
5410 
5411 	if (txr->hn_mbuf_br != NULL)
5412 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5413 
5414 #ifndef HN_USE_TXDESC_BUFRING
5415 	mtx_destroy(&txr->hn_txlist_spin);
5416 #endif
5417 	mtx_destroy(&txr->hn_tx_lock);
5418 }
5419 
5420 static int
5421 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5422 {
5423 	struct sysctl_oid_list *child;
5424 	struct sysctl_ctx_list *ctx;
5425 	int i;
5426 
5427 	/*
5428 	 * Create TXBUF for chimney sending.
5429 	 *
5430 	 * NOTE: It is shared by all channels.
5431 	 */
5432 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5433 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5434 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5435 	if (sc->hn_chim == NULL) {
5436 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5437 		return (ENOMEM);
5438 	}
5439 
5440 	sc->hn_tx_ring_cnt = ring_cnt;
5441 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5442 
5443 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5444 	    M_DEVBUF, M_WAITOK | M_ZERO);
5445 
5446 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5447 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5448 
5449 	/* Create dev.hn.UNIT.tx sysctl tree */
5450 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5451 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5452 
5453 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5454 		int error;
5455 
5456 		error = hn_tx_ring_create(sc, i);
5457 		if (error)
5458 			return error;
5459 	}
5460 
5461 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5462 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5463 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5464 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5465 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5466 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5467 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5468 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5469 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5470 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5471 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5472 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5473 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5474 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5475 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5476 	    hn_tx_stat_ulong_sysctl, "LU",
5477 	    "# of packet transmission aggregation flush failures");
5478 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5479 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5480 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5481 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5482 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5483 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5484 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5485 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5486 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5487 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5488 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5489 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5490 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5491 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5492 	    "# of total TX descs");
5493 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5494 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5495 	    "Chimney send packet size upper boundary");
5496 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5497 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5498 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5499 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5500 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5501 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5502 	    hn_tx_conf_int_sysctl, "I",
5503 	    "Size of the packet for direct transmission");
5504 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5505 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5506 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5507 	    hn_tx_conf_int_sysctl, "I",
5508 	    "Always schedule transmission "
5509 	    "instead of doing direct transmission");
5510 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5511 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5512 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5513 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5514 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5515 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5516 	    "Applied packet transmission aggregation size");
5517 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5518 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5519 	    hn_txagg_pktmax_sysctl, "I",
5520 	    "Applied packet transmission aggregation packets");
5521 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5522 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5523 	    hn_txagg_align_sysctl, "I",
5524 	    "Applied packet transmission aggregation alignment");
5525 
5526 	return 0;
5527 }
5528 
5529 static void
5530 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5531 {
5532 	int i;
5533 
5534 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5535 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5536 }
5537 
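/*
 * Compute the TSO size limit advertised through if_hw_tsomax: clamp the
 * requested tso_maxlen to the NDIS sgmin/szmax constraints reported by
 * the host and, if the transparent VF is ready, to the VF's own limit.
 */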
5538 static void
5539 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5540 {
5541 	struct ifnet *ifp = sc->hn_ifp;
5542 	u_int hw_tsomax;
5543 	int tso_minlen;
5544 
5545 	HN_LOCK_ASSERT(sc);
5546 
5547 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5548 		return;
5549 
5550 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5551 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5552 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5553 
5554 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5555 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5556 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5557 
5558 	if (tso_maxlen < tso_minlen)
5559 		tso_maxlen = tso_minlen;
5560 	else if (tso_maxlen > IP_MAXPACKET)
5561 		tso_maxlen = IP_MAXPACKET;
5562 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5563 		tso_maxlen = sc->hn_ndis_tso_szmax;
5564 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5565 
5566 	if (hn_xpnt_vf_isready(sc)) {
5567 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5568 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5569 	}
5570 	ifp->if_hw_tsomax = hw_tsomax;
5571 	if (bootverbose)
5572 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5573 }
5574 
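/*
 * Propagate negotiated TX settings to all TX rings: chimney (copy) size,
 * checksum offload assistance derived from the host capabilities, and
 * the HASHVAL pktinfo flag, if the host supports it.
 */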
5575 static void
5576 hn_fixup_tx_data(struct hn_softc *sc)
5577 {
5578 	uint64_t csum_assist;
5579 	int i;
5580 
5581 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5582 	if (hn_tx_chimney_size > 0 &&
5583 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5584 		hn_set_chim_size(sc, hn_tx_chimney_size);
5585 
5586 	csum_assist = 0;
5587 	if (sc->hn_caps & HN_CAP_IPCS)
5588 		csum_assist |= CSUM_IP;
5589 	if (sc->hn_caps & HN_CAP_TCP4CS)
5590 		csum_assist |= CSUM_IP_TCP;
5591 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5592 		csum_assist |= CSUM_IP_UDP;
5593 	if (sc->hn_caps & HN_CAP_TCP6CS)
5594 		csum_assist |= CSUM_IP6_TCP;
5595 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5596 		csum_assist |= CSUM_IP6_UDP;
5597 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5598 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5599 
5600 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5601 		/*
5602 		 * Support HASHVAL pktinfo on TX path.
5603 		 */
5604 		if (bootverbose)
5605 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5606 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5607 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5608 	}
5609 }
5610 
5611 static void
5612 hn_fixup_rx_data(struct hn_softc *sc)
5613 {
5614 
5615 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5616 		int i;
5617 
5618 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5619 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5620 	}
5621 }
5622 
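/*
 * Free the chimney sending buffer, unless the host still references it,
 * and destroy all TX rings.
 */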
5623 static void
5624 hn_destroy_tx_data(struct hn_softc *sc)
5625 {
5626 	int i;
5627 
5628 	if (sc->hn_chim != NULL) {
5629 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5630 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5631 		} else {
5632 			device_printf(sc->hn_dev,
5633 			    "chimney sending buffer is referenced");
5634 		}
5635 		sc->hn_chim = NULL;
5636 	}
5637 
5638 	if (sc->hn_tx_ring_cnt == 0)
5639 		return;
5640 
5641 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5642 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5643 
5644 	free(sc->hn_tx_ring, M_DEVBUF);
5645 	sc->hn_tx_ring = NULL;
5646 
5647 	sc->hn_tx_ring_cnt = 0;
5648 	sc->hn_tx_ring_inuse = 0;
5649 }
5650 
5651 #ifdef HN_IFSTART_SUPPORT
5652 
5653 static void
5654 hn_start_taskfunc(void *xtxr, int pending __unused)
5655 {
5656 	struct hn_tx_ring *txr = xtxr;
5657 
5658 	mtx_lock(&txr->hn_tx_lock);
5659 	hn_start_locked(txr, 0);
5660 	mtx_unlock(&txr->hn_tx_lock);
5661 }
5662 
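/*
 * if_start TX path: dequeue mbufs from if_snd, encapsulate them into
 * RNDIS packets and either aggregate or transmit them directly.  Return
 * non-zero if the remaining work should be deferred to the TX taskqueue.
 */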
5663 static int
5664 hn_start_locked(struct hn_tx_ring *txr, int len)
5665 {
5666 	struct hn_softc *sc = txr->hn_sc;
5667 	struct ifnet *ifp = sc->hn_ifp;
5668 	int sched = 0;
5669 
5670 	KASSERT(hn_use_if_start,
5671 	    ("hn_start_locked is called when if_start is disabled"));
5672 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5673 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5674 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5675 
5676 	if (__predict_false(txr->hn_suspended))
5677 		return (0);
5678 
5679 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5680 	    IFF_DRV_RUNNING)
5681 		return (0);
5682 
5683 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5684 		struct hn_txdesc *txd;
5685 		struct mbuf *m_head;
5686 		int error;
5687 
5688 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5689 		if (m_head == NULL)
5690 			break;
5691 
5692 		if (len > 0 && m_head->m_pkthdr.len > len) {
5693 			/*
5694 			 * This send could be time consuming; let callers
5695 			 * dispatch the sending of this packet (and of any
5696 			 * following packets) to the TX taskqueue.
5697 			 */
5698 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5699 			sched = 1;
5700 			break;
5701 		}
5702 
5703 #if defined(INET6) || defined(INET)
5704 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5705 			m_head = hn_tso_fixup(m_head);
5706 			if (__predict_false(m_head == NULL)) {
5707 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5708 				continue;
5709 			}
5710 		} else if (m_head->m_pkthdr.csum_flags &
5711 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5712 			m_head = hn_set_hlen(m_head);
5713 			if (__predict_false(m_head == NULL)) {
5714 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5715 				continue;
5716 			}
5717 		}
5718 #endif
5719 
5720 		txd = hn_txdesc_get(txr);
5721 		if (txd == NULL) {
5722 			txr->hn_no_txdescs++;
5723 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5724 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5725 			break;
5726 		}
5727 
5728 		error = hn_encap(ifp, txr, txd, &m_head);
5729 		if (error) {
5730 			/* Both txd and m_head are freed */
5731 			KASSERT(txr->hn_agg_txd == NULL,
5732 			    ("encap failed w/ pending aggregating txdesc"));
5733 			continue;
5734 		}
5735 
5736 		if (txr->hn_agg_pktleft == 0) {
5737 			if (txr->hn_agg_txd != NULL) {
5738 				KASSERT(m_head == NULL,
5739 				    ("pending mbuf for aggregating txdesc"));
5740 				error = hn_flush_txagg(ifp, txr);
5741 				if (__predict_false(error)) {
5742 					atomic_set_int(&ifp->if_drv_flags,
5743 					    IFF_DRV_OACTIVE);
5744 					break;
5745 				}
5746 			} else {
5747 				KASSERT(m_head != NULL, ("mbuf was freed"));
5748 				error = hn_txpkt(ifp, txr, txd);
5749 				if (__predict_false(error)) {
5750 					/* txd is freed, but m_head is not */
5751 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5752 					atomic_set_int(&ifp->if_drv_flags,
5753 					    IFF_DRV_OACTIVE);
5754 					break;
5755 				}
5756 			}
5757 		}
5758 #ifdef INVARIANTS
5759 		else {
5760 			KASSERT(txr->hn_agg_txd != NULL,
5761 			    ("no aggregating txdesc"));
5762 			KASSERT(m_head == NULL,
5763 			    ("pending mbuf for aggregating txdesc"));
5764 		}
5765 #endif
5766 	}
5767 
5768 	/* Flush pending aggregated transmission. */
5769 	if (txr->hn_agg_txd != NULL)
5770 		hn_flush_txagg(ifp, txr);
5771 	return (sched);
5772 }
5773 
5774 static void
5775 hn_start(struct ifnet *ifp)
5776 {
5777 	struct hn_softc *sc = ifp->if_softc;
5778 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5779 
5780 	if (txr->hn_sched_tx)
5781 		goto do_sched;
5782 
5783 	if (mtx_trylock(&txr->hn_tx_lock)) {
5784 		int sched;
5785 
5786 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5787 		mtx_unlock(&txr->hn_tx_lock);
5788 		if (!sched)
5789 			return;
5790 	}
5791 do_sched:
5792 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5793 }
5794 
5795 static void
5796 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5797 {
5798 	struct hn_tx_ring *txr = xtxr;
5799 
5800 	mtx_lock(&txr->hn_tx_lock);
5801 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5802 	hn_start_locked(txr, 0);
5803 	mtx_unlock(&txr->hn_tx_lock);
5804 }
5805 
5806 static void
5807 hn_start_txeof(struct hn_tx_ring *txr)
5808 {
5809 	struct hn_softc *sc = txr->hn_sc;
5810 	struct ifnet *ifp = sc->hn_ifp;
5811 
5812 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5813 
5814 	if (txr->hn_sched_tx)
5815 		goto do_sched;
5816 
5817 	if (mtx_trylock(&txr->hn_tx_lock)) {
5818 		int sched;
5819 
5820 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5821 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5822 		mtx_unlock(&txr->hn_tx_lock);
5823 		if (sched) {
5824 			taskqueue_enqueue(txr->hn_tx_taskq,
5825 			    &txr->hn_tx_task);
5826 		}
5827 	} else {
5828 do_sched:
5829 		/*
5830 		 * Release OACTIVE early, in the hope that others
5831 		 * could catch up.  The task will clear the flag
5832 		 * again with the hn_tx_lock held, to avoid possible
5833 		 * races.
5834 		 */
5835 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5836 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5837 	}
5838 }
5839 
5840 #endif	/* HN_IFSTART_SUPPORT */
5841 
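/*
 * if_transmit TX path: drain this ring's buf_ring, encapsulate each mbuf
 * into an RNDIS packet and either aggregate or transmit it directly.
 * Return non-zero if the remaining work should be deferred to the TX
 * taskqueue.
 */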
5842 static int
5843 hn_xmit(struct hn_tx_ring *txr, int len)
5844 {
5845 	struct hn_softc *sc = txr->hn_sc;
5846 	struct ifnet *ifp = sc->hn_ifp;
5847 	struct mbuf *m_head;
5848 	int sched = 0;
5849 
5850 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5851 #ifdef HN_IFSTART_SUPPORT
5852 	KASSERT(hn_use_if_start == 0,
5853 	    ("hn_xmit is called when if_start is enabled"));
5854 #endif
5855 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5856 
5857 	if (__predict_false(txr->hn_suspended))
5858 		return (0);
5859 
5860 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5861 		return (0);
5862 
5863 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5864 		struct hn_txdesc *txd;
5865 		int error;
5866 
5867 		if (len > 0 && m_head->m_pkthdr.len > len) {
5868 			/*
5869 			 * This send could be time consuming; let callers
5870 			 * dispatch the sending of this packet (and of any
5871 			 * following packets) to the TX taskqueue.
5872 			 */
5873 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5874 			sched = 1;
5875 			break;
5876 		}
5877 
5878 		txd = hn_txdesc_get(txr);
5879 		if (txd == NULL) {
5880 			txr->hn_no_txdescs++;
5881 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5882 			txr->hn_oactive = 1;
5883 			break;
5884 		}
5885 
5886 		error = hn_encap(ifp, txr, txd, &m_head);
5887 		if (error) {
5888 			/* Both txd and m_head are freed; discard */
5889 			KASSERT(txr->hn_agg_txd == NULL,
5890 			    ("encap failed w/ pending aggregating txdesc"));
5891 			drbr_advance(ifp, txr->hn_mbuf_br);
5892 			continue;
5893 		}
5894 
5895 		if (txr->hn_agg_pktleft == 0) {
5896 			if (txr->hn_agg_txd != NULL) {
5897 				KASSERT(m_head == NULL,
5898 				    ("pending mbuf for aggregating txdesc"));
5899 				error = hn_flush_txagg(ifp, txr);
5900 				if (__predict_false(error)) {
5901 					txr->hn_oactive = 1;
5902 					break;
5903 				}
5904 			} else {
5905 				KASSERT(m_head != NULL, ("mbuf was freed"));
5906 				error = hn_txpkt(ifp, txr, txd);
5907 				if (__predict_false(error)) {
5908 					/* txd is freed, but m_head is not */
5909 					drbr_putback(ifp, txr->hn_mbuf_br,
5910 					    m_head);
5911 					txr->hn_oactive = 1;
5912 					break;
5913 				}
5914 			}
5915 		}
5916 #ifdef INVARIANTS
5917 		else {
5918 			KASSERT(txr->hn_agg_txd != NULL,
5919 			    ("no aggregating txdesc"));
5920 			KASSERT(m_head == NULL,
5921 			    ("pending mbuf for aggregating txdesc"));
5922 		}
5923 #endif
5924 
5925 		/* Sent */
5926 		drbr_advance(ifp, txr->hn_mbuf_br);
5927 	}
5928 
5929 	/* Flush pending aggregated transmission. */
5930 	if (txr->hn_agg_txd != NULL)
5931 		hn_flush_txagg(ifp, txr);
5932 	return (sched);
5933 }
5934 
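/*
 * ifnet.if_transmit method.  If the transparent VF is active, hand the
 * packet to the VF (tapping BPF as configured); otherwise fix up the
 * packet headers, select a TX ring based on the flowid and enqueue the
 * packet on that ring's buf_ring.
 */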
5935 static int
5936 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5937 {
5938 	struct hn_softc *sc = ifp->if_softc;
5939 	struct hn_tx_ring *txr;
5940 	int error, idx = 0;
5941 
5942 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5943 		struct rm_priotracker pt;
5944 
5945 		rm_rlock(&sc->hn_vf_lock, &pt);
5946 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5947 			struct mbuf *m_bpf = NULL;
5948 			int obytes, omcast;
5949 
5950 			obytes = m->m_pkthdr.len;
5951 			omcast = (m->m_flags & M_MCAST) != 0;
5952 
5953 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5954 				if (bpf_peers_present(ifp->if_bpf)) {
5955 					m_bpf = m_copypacket(m, M_NOWAIT);
5956 					if (m_bpf == NULL) {
5957 						/*
5958 						 * Failed to grab a shallow
5959 						 * copy; tap now.
5960 						 */
5961 						ETHER_BPF_MTAP(ifp, m);
5962 					}
5963 				}
5964 			} else {
5965 				ETHER_BPF_MTAP(ifp, m);
5966 			}
5967 
5968 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5969 			rm_runlock(&sc->hn_vf_lock, &pt);
5970 
5971 			if (m_bpf != NULL) {
5972 				if (!error)
5973 					ETHER_BPF_MTAP(ifp, m_bpf);
5974 				m_freem(m_bpf);
5975 			}
5976 
5977 			if (error == ENOBUFS) {
5978 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5979 			} else if (error) {
5980 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5981 			} else {
5982 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5983 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5984 				if (omcast) {
5985 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5986 					    omcast);
5987 				}
5988 			}
5989 			return (error);
5990 		}
5991 		rm_runlock(&sc->hn_vf_lock, &pt);
5992 	}
5993 
5994 #if defined(INET6) || defined(INET)
5995 	/*
5996 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5997 	 * since packet headers should be cache-hot.
5998 	 */
5999 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6000 		m = hn_tso_fixup(m);
6001 		if (__predict_false(m == NULL)) {
6002 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6003 			return EIO;
6004 		}
6005 	} else if (m->m_pkthdr.csum_flags &
6006 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6007 		m = hn_set_hlen(m);
6008 		if (__predict_false(m == NULL)) {
6009 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6010 			return EIO;
6011 		}
6012 	}
6013 #endif
6014 
6015 	/*
6016 	 * Select the TX ring based on flowid
6017 	 */
6018 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6019 #ifdef RSS
6020 		uint32_t bid;
6021 
6022 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6023 		    &bid) == 0)
6024 			idx = bid % sc->hn_tx_ring_inuse;
6025 		else
6026 #endif
6027 		{
6028 #if defined(INET6) || defined(INET)
6029 			int tcpsyn = 0;
6030 
6031 			if (m->m_pkthdr.len < 128 &&
6032 			    (m->m_pkthdr.csum_flags &
6033 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6034 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6035 				m = hn_check_tcpsyn(m, &tcpsyn);
6036 				if (__predict_false(m == NULL)) {
6037 					if_inc_counter(ifp,
6038 					    IFCOUNTER_OERRORS, 1);
6039 					return (EIO);
6040 				}
6041 			}
6042 #else
6043 			const int tcpsyn = 0;
6044 #endif
6045 			if (tcpsyn)
6046 				idx = 0;
6047 			else
6048 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6049 		}
6050 	}
6051 	txr = &sc->hn_tx_ring[idx];
6052 
6053 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6054 	if (error) {
6055 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6056 		return error;
6057 	}
6058 
6059 	if (txr->hn_oactive)
6060 		return 0;
6061 
6062 	if (txr->hn_sched_tx)
6063 		goto do_sched;
6064 
6065 	if (mtx_trylock(&txr->hn_tx_lock)) {
6066 		int sched;
6067 
6068 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6069 		mtx_unlock(&txr->hn_tx_lock);
6070 		if (!sched)
6071 			return 0;
6072 	}
6073 do_sched:
6074 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6075 	return 0;
6076 }
6077 
6078 static void
6079 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6080 {
6081 	struct mbuf *m;
6082 
6083 	mtx_lock(&txr->hn_tx_lock);
6084 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6085 		m_freem(m);
6086 	mtx_unlock(&txr->hn_tx_lock);
6087 }
6088 
6089 static void
6090 hn_xmit_qflush(struct ifnet *ifp)
6091 {
6092 	struct hn_softc *sc = ifp->if_softc;
6093 	struct rm_priotracker pt;
6094 	int i;
6095 
6096 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6097 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6098 	if_qflush(ifp);
6099 
6100 	rm_rlock(&sc->hn_vf_lock, &pt);
6101 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6102 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6103 	rm_runlock(&sc->hn_vf_lock, &pt);
6104 }
6105 
6106 static void
6107 hn_xmit_txeof(struct hn_tx_ring *txr)
6108 {
6109 
6110 	if (txr->hn_sched_tx)
6111 		goto do_sched;
6112 
6113 	if (mtx_trylock(&txr->hn_tx_lock)) {
6114 		int sched;
6115 
6116 		txr->hn_oactive = 0;
6117 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6118 		mtx_unlock(&txr->hn_tx_lock);
6119 		if (sched) {
6120 			taskqueue_enqueue(txr->hn_tx_taskq,
6121 			    &txr->hn_tx_task);
6122 		}
6123 	} else {
6124 do_sched:
6125 		/*
6126 		 * Release oactive early, in the hope that others
6127 		 * could catch up.  The task will clear oactive
6128 		 * again with the hn_tx_lock held, to avoid possible
6129 		 * races.
6130 		 */
6131 		txr->hn_oactive = 0;
6132 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6133 	}
6134 }
6135 
6136 static void
6137 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6138 {
6139 	struct hn_tx_ring *txr = xtxr;
6140 
6141 	mtx_lock(&txr->hn_tx_lock);
6142 	hn_xmit(txr, 0);
6143 	mtx_unlock(&txr->hn_tx_lock);
6144 }
6145 
6146 static void
6147 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6148 {
6149 	struct hn_tx_ring *txr = xtxr;
6150 
6151 	mtx_lock(&txr->hn_tx_lock);
6152 	txr->hn_oactive = 0;
6153 	hn_xmit(txr, 0);
6154 	mtx_unlock(&txr->hn_tx_lock);
6155 }
6156 
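/*
 * Link a VMBus channel to its RX ring (and TX ring, if one is in use for
 * this index), bind it to a CPU and open it with the ring's bufring and
 * hn_chan_callback.
 */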
6157 static int
6158 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6159 {
6160 	struct vmbus_chan_br cbr;
6161 	struct hn_rx_ring *rxr;
6162 	struct hn_tx_ring *txr = NULL;
6163 	int idx, error;
6164 
6165 	idx = vmbus_chan_subidx(chan);
6166 
6167 	/*
6168 	 * Link this channel to RX/TX ring.
6169 	 */
6170 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6171 	    ("invalid channel index %d, should be >= 0 && < %d",
6172 	     idx, sc->hn_rx_ring_inuse));
6173 	rxr = &sc->hn_rx_ring[idx];
6174 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6175 	    ("RX ring %d already attached", idx));
6176 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6177 	rxr->hn_chan = chan;
6178 
6179 	if (bootverbose) {
6180 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6181 		    idx, vmbus_chan_id(chan));
6182 	}
6183 
6184 	if (idx < sc->hn_tx_ring_inuse) {
6185 		txr = &sc->hn_tx_ring[idx];
6186 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6187 		    ("TX ring %d already attached", idx));
6188 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6189 
6190 		txr->hn_chan = chan;
6191 		if (bootverbose) {
6192 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6193 			    idx, vmbus_chan_id(chan));
6194 		}
6195 	}
6196 
6197 	/* Bind this channel to a proper CPU. */
6198 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6199 
6200 	/*
6201 	 * Open this channel
6202 	 */
6203 	cbr.cbr = rxr->hn_br;
6204 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6205 	cbr.cbr_txsz = HN_TXBR_SIZE;
6206 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6207 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6208 	if (error) {
6209 		if (error == EISCONN) {
6210 			if_printf(sc->hn_ifp, "bufring is connected after "
6211 			    "chan%u open failure\n", vmbus_chan_id(chan));
6212 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6213 		} else {
6214 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6215 			    vmbus_chan_id(chan), error);
6216 		}
6217 	}
6218 	return (error);
6219 }
6220 
6221 static void
6222 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6223 {
6224 	struct hn_rx_ring *rxr;
6225 	int idx, error;
6226 
6227 	idx = vmbus_chan_subidx(chan);
6228 
6229 	/*
6230 	 * Link this channel to RX/TX ring.
6231 	 */
6232 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6233 	    ("invalid channel index %d, should be >= 0 && < %d",
6234 	     idx, sc->hn_rx_ring_inuse));
6235 	rxr = &sc->hn_rx_ring[idx];
6236 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6237 	    ("RX ring %d is not attached", idx));
6238 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6239 
6240 	if (idx < sc->hn_tx_ring_inuse) {
6241 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6242 
6243 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6244 		    ("TX ring %d is not attached", idx));
6245 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6246 	}
6247 
6248 	/*
6249 	 * Close this channel.
6250 	 *
6251 	 * NOTE:
6252 	 * Channel closing does _not_ destroy the target channel.
6253 	 */
6254 	error = vmbus_chan_close_direct(chan);
6255 	if (error == EISCONN) {
6256 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6257 		    "after being closed\n", vmbus_chan_id(chan));
6258 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6259 	} else if (error) {
6260 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6261 		    vmbus_chan_id(chan), error);
6262 	}
6263 }
6264 
6265 static int
6266 hn_attach_subchans(struct hn_softc *sc)
6267 {
6268 	struct vmbus_channel **subchans;
6269 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6270 	int i, error = 0;
6271 
6272 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6273 
6274 	/* Attach the sub-channels. */
6275 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6276 	for (i = 0; i < subchan_cnt; ++i) {
6277 		int error1;
6278 
6279 		error1 = hn_chan_attach(sc, subchans[i]);
6280 		if (error1) {
6281 			error = error1;
6282 			/* Move on; all channels will be detached later. */
6283 		}
6284 	}
6285 	vmbus_subchan_rel(subchans, subchan_cnt);
6286 
6287 	if (error) {
6288 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6289 	} else {
6290 		if (bootverbose) {
6291 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6292 			    subchan_cnt);
6293 		}
6294 	}
6295 	return (error);
6296 }
6297 
6298 static void
6299 hn_detach_allchans(struct hn_softc *sc)
6300 {
6301 	struct vmbus_channel **subchans;
6302 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6303 	int i;
6304 
6305 	if (subchan_cnt == 0)
6306 		goto back;
6307 
6308 	/* Detach the sub-channels. */
6309 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6310 	for (i = 0; i < subchan_cnt; ++i)
6311 		hn_chan_detach(sc, subchans[i]);
6312 	vmbus_subchan_rel(subchans, subchan_cnt);
6313 
6314 back:
6315 	/*
6316 	 * Detach the primary channel, _after_ all sub-channels
6317 	 * are detached.
6318 	 */
6319 	hn_chan_detach(sc, sc->hn_prichan);
6320 
6321 	/* Wait for sub-channels to be destroyed, if any. */
6322 	vmbus_subchan_drain(sc->hn_prichan);
6323 
6324 #ifdef INVARIANTS
6325 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6326 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6327 		    HN_RX_FLAG_ATTACHED) == 0,
6328 		    ("%dth RX ring is still attached", i));
6329 	}
6330 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6331 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6332 		    HN_TX_FLAG_ATTACHED) == 0,
6333 		    ("%dth TX ring is still attached", i));
6334 	}
6335 #endif
6336 }
6337 
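/*
 * Allocate sub-channels from NVS, bounded by the number of RX rings that
 * the RNDIS RSS capabilities allow.  On return *nsubch holds the number
 * of sub-channels actually granted; 0 means only the primary channel can
 * be used.
 */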
6338 static int
6339 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6340 {
6341 	struct vmbus_channel **subchans;
6342 	int nchan, rxr_cnt, error;
6343 
6344 	nchan = *nsubch + 1;
6345 	if (nchan == 1) {
6346 		/*
6347 		 * Multiple RX/TX rings are not requested.
6348 		 */
6349 		*nsubch = 0;
6350 		return (0);
6351 	}
6352 
6353 	/*
6354 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6355 	 * table entries.
6356 	 */
6357 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6358 	if (error) {
6359 		/* No RSS; this is benign. */
6360 		*nsubch = 0;
6361 		return (0);
6362 	}
6363 	if (bootverbose) {
6364 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6365 		    rxr_cnt, nchan);
6366 	}
6367 
6368 	if (nchan > rxr_cnt)
6369 		nchan = rxr_cnt;
6370 	if (nchan == 1) {
6371 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6372 		*nsubch = 0;
6373 		return (0);
6374 	}
6375 
6376 	/*
6377 	 * Allocate sub-channels from NVS.
6378 	 */
6379 	*nsubch = nchan - 1;
6380 	error = hn_nvs_alloc_subchans(sc, nsubch);
6381 	if (error || *nsubch == 0) {
6382 		/* Failed to allocate sub-channels. */
6383 		*nsubch = 0;
6384 		return (0);
6385 	}
6386 
6387 	/*
6388 	 * Wait for all sub-channels to become ready before moving on.
6389 	 */
6390 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6391 	vmbus_subchan_rel(subchans, *nsubch);
6392 	return (0);
6393 }
6394 
6395 static bool
6396 hn_synth_attachable(const struct hn_softc *sc)
6397 {
6398 	int i;
6399 
6400 	if (sc->hn_flags & HN_FLAG_ERRORS)
6401 		return (false);
6402 
6403 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6404 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6405 
6406 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6407 			return (false);
6408 	}
6409 	return (true);
6410 }
6411 
6412 /*
6413  * Make sure that the RX filter is zero after the successful
6414  * RNDIS initialization.
6415  *
6416  * NOTE:
6417  * Under certain conditions on certain versions of Hyper-V,
6418  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6419  * after the successful RNDIS initialization, which breaks
6420  * the assumption of any following code (well, it breaks the
6421  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6422  * explicitly, drain packets sneaking through, and drain the
6423  * interrupt taskqueues scheduled due to the stealth packets.
6424  */
6425 static void
6426 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6427 {
6428 
6429 	hn_disable_rx(sc);
6430 	hn_drain_rxtx(sc, nchan);
6431 }
6432 
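/*
 * Attach the synthetic parts, i.e. NVS and RNDIS, on top of the primary
 * channel, allocate and attach the sub-channels, and configure RSS.  On
 * failure everything that was attached is torn down again.
 */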
6433 static int
6434 hn_synth_attach(struct hn_softc *sc, int mtu)
6435 {
6436 #define ATTACHED_NVS		0x0002
6437 #define ATTACHED_RNDIS		0x0004
6438 
6439 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6440 	int error, nsubch, nchan = 1, i, rndis_inited;
6441 	uint32_t old_caps, attached = 0;
6442 
6443 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6444 	    ("synthetic parts were attached"));
6445 
6446 	if (!hn_synth_attachable(sc))
6447 		return (ENXIO);
6448 
6449 	/* Save capabilities for later verification. */
6450 	old_caps = sc->hn_caps;
6451 	sc->hn_caps = 0;
6452 
6453 	/* Clear RSS stuffs. */
6454 	sc->hn_rss_ind_size = 0;
6455 	sc->hn_rss_hash = 0;
6456 	sc->hn_rss_hcap = 0;
6457 
6458 	/*
6459 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6460 	 */
6461 	error = hn_chan_attach(sc, sc->hn_prichan);
6462 	if (error)
6463 		goto failed;
6464 
6465 	/*
6466 	 * Attach NVS.
6467 	 */
6468 	error = hn_nvs_attach(sc, mtu);
6469 	if (error)
6470 		goto failed;
6471 	attached |= ATTACHED_NVS;
6472 
6473 	/*
6474 	 * Attach RNDIS _after_ NVS is attached.
6475 	 */
6476 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6477 	if (rndis_inited)
6478 		attached |= ATTACHED_RNDIS;
6479 	if (error)
6480 		goto failed;
6481 
6482 	/*
6483 	 * Make sure capabilities are not changed.
6484 	 */
6485 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6486 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6487 		    old_caps, sc->hn_caps);
6488 		error = ENXIO;
6489 		goto failed;
6490 	}
6491 
6492 	/*
6493 	 * Allocate sub-channels for multi-TX/RX rings.
6494 	 *
6495 	 * NOTE:
6496 	 * The # of RX rings that can be used is equivalent to the # of
6497 	 * channels to be requested.
6498 	 */
6499 	nsubch = sc->hn_rx_ring_cnt - 1;
6500 	error = hn_synth_alloc_subchans(sc, &nsubch);
6501 	if (error)
6502 		goto failed;
6503 	/* NOTE: _Full_ synthetic parts detach is required now. */
6504 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6505 
6506 	/*
6507 	 * Set the # of TX/RX rings that could be used according to
6508 	 * the # of channels that NVS offered.
6509 	 */
6510 	nchan = nsubch + 1;
6511 	hn_set_ring_inuse(sc, nchan);
6512 	if (nchan == 1) {
6513 		/* Only the primary channel can be used; done */
6514 		goto back;
6515 	}
6516 
6517 	/*
6518 	 * Attach the sub-channels.
6519 	 *
6520 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6521 	 */
6522 	error = hn_attach_subchans(sc);
6523 	if (error)
6524 		goto failed;
6525 
6526 	/*
6527 	 * Configure RSS key and indirect table _after_ all sub-channels
6528 	 * are attached.
6529 	 */
6530 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6531 		/*
6532 		 * RSS key is not set yet; set it to the default RSS key.
6533 		 */
6534 		if (bootverbose)
6535 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6536 #ifdef RSS
6537 		rss_getkey(rss->rss_key);
6538 #else
6539 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6540 #endif
6541 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6542 	}
6543 
6544 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6545 		/*
6546 		 * RSS indirect table is not set yet; set it up in round-
6547 		 * robin fashion.
6548 		 */
6549 		if (bootverbose) {
6550 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6551 			    "table\n");
6552 		}
6553 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6554 			uint32_t subidx;
6555 
6556 #ifdef RSS
6557 			subidx = rss_get_indirection_to_bucket(i);
6558 #else
6559 			subidx = i;
6560 #endif
6561 			rss->rss_ind[i] = subidx % nchan;
6562 		}
6563 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6564 	} else {
6565 		/*
6566 		 * # of usable channels may be changed, so we have to
6567 		 * make sure that all entries in RSS indirect table
6568 		 * are valid.
6569 		 *
6570 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6571 		 */
6572 		hn_rss_ind_fixup(sc);
6573 	}
6574 
6575 	sc->hn_rss_hash = sc->hn_rss_hcap;
6576 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6577 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6578 		/* NOTE: Don't reconfigure RSS here; it is done right below. */
6579 		hn_vf_rss_fixup(sc, false);
6580 	}
6581 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6582 	if (error)
6583 		goto failed;
6584 back:
6585 	/*
6586 	 * Fixup transmission aggregation setup.
6587 	 */
6588 	hn_set_txagg(sc);
6589 	hn_rndis_init_fixat(sc, nchan);
6590 	return (0);
6591 
6592 failed:
6593 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6594 		hn_rndis_init_fixat(sc, nchan);
6595 		hn_synth_detach(sc);
6596 	} else {
6597 		if (attached & ATTACHED_RNDIS) {
6598 			hn_rndis_init_fixat(sc, nchan);
6599 			hn_rndis_detach(sc);
6600 		}
6601 		if (attached & ATTACHED_NVS)
6602 			hn_nvs_detach(sc);
6603 		hn_chan_detach(sc, sc->hn_prichan);
6604 		/* Restore old capabilities. */
6605 		sc->hn_caps = old_caps;
6606 	}
6607 	return (error);
6608 
6609 #undef ATTACHED_RNDIS
6610 #undef ATTACHED_NVS
6611 }
6612 
6613 /*
6614  * NOTE:
6615  * The interface must have been suspended through hn_suspend(), before
6616  * this function gets called.
6617  */
6618 static void
6619 hn_synth_detach(struct hn_softc *sc)
6620 {
6621 
6622 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6623 	    ("synthetic parts were not attached"));
6624 
6625 	/* Detach the RNDIS first. */
6626 	hn_rndis_detach(sc);
6627 
6628 	/* Detach NVS. */
6629 	hn_nvs_detach(sc);
6630 
6631 	/* Detach all of the channels. */
6632 	hn_detach_allchans(sc);
6633 
6634 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6635 		/*
6636 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6637 		 */
6638 		int error;
6639 
6640 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6641 		    sc->hn_rxbuf_gpadl);
6642 		if (error) {
6643 			if_printf(sc->hn_ifp,
6644 			    "rxbuf gpadl disconn failed: %d\n", error);
6645 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6646 		}
6647 		sc->hn_rxbuf_gpadl = 0;
6648 	}
6649 
6650 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6651 		/*
6652 		 * Host is post-Win2016, disconnect chimney sending buffer from
6653 		 * primary channel here.
6654 		 */
6655 		int error;
6656 
6657 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6658 		    sc->hn_chim_gpadl);
6659 		if (error) {
6660 			if_printf(sc->hn_ifp,
6661 			    "chim gpadl disconn failed: %d\n", error);
6662 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6663 		}
6664 		sc->hn_chim_gpadl = 0;
6665 	}
6666 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6667 }
6668 
6669 static void
6670 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6671 {
6672 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6673 	    ("invalid ring count %d", ring_cnt));
6674 
6675 	if (sc->hn_tx_ring_cnt > ring_cnt)
6676 		sc->hn_tx_ring_inuse = ring_cnt;
6677 	else
6678 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6679 	sc->hn_rx_ring_inuse = ring_cnt;
6680 
6681 #ifdef RSS
6682 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6683 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6684 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6685 		    rss_getnumbuckets());
6686 	}
6687 #endif
6688 
6689 	if (bootverbose) {
6690 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6691 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6692 	}
6693 }
6694 
6695 static void
6696 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6697 {
6698 
6699 	/*
6700 	 * NOTE:
6701 	 * The TX bufring will not be drained by the hypervisor,
6702 	 * if the primary channel is revoked.
6703 	 */
6704 	while (!vmbus_chan_rx_empty(chan) ||
6705 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6706 	     !vmbus_chan_tx_empty(chan)))
6707 		pause("waitch", 1);
6708 	vmbus_chan_intr_drain(chan);
6709 }
6710 
6711 static void
6712 hn_disable_rx(struct hn_softc *sc)
6713 {
6714 
6715 	/*
6716 	 * Disable RX by clearing RX filter forcefully.
6717 	 */
6718 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6719 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6720 
6721 	/*
6722 	 * Give RNDIS enough time to flush all pending data packets.
6723 	 */
6724 	pause("waitrx", (200 * hz) / 1000);
6725 }
6726 
6727 /*
6728  * NOTE:
6729  * RX/TX _must_ have been suspended/disabled, before this function
6730  * is called.
6731  */
6732 static void
6733 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6734 {
6735 	struct vmbus_channel **subch = NULL;
6736 	int nsubch;
6737 
6738 	/*
6739 	 * Drain RX/TX bufrings and interrupts.
6740 	 */
6741 	nsubch = nchan - 1;
6742 	if (nsubch > 0)
6743 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6744 
6745 	if (subch != NULL) {
6746 		int i;
6747 
6748 		for (i = 0; i < nsubch; ++i)
6749 			hn_chan_drain(sc, subch[i]);
6750 	}
6751 	hn_chan_drain(sc, sc->hn_prichan);
6752 
6753 	if (subch != NULL)
6754 		vmbus_subchan_rel(subch, nsubch);
6755 }
6756 
6757 static void
6758 hn_suspend_data(struct hn_softc *sc)
6759 {
6760 	struct hn_tx_ring *txr;
6761 	int i;
6762 
6763 	HN_LOCK_ASSERT(sc);
6764 
6765 	/*
6766 	 * Suspend TX.
6767 	 */
6768 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6769 		txr = &sc->hn_tx_ring[i];
6770 
6771 		mtx_lock(&txr->hn_tx_lock);
6772 		txr->hn_suspended = 1;
6773 		mtx_unlock(&txr->hn_tx_lock);
6774 		/* No one is able send more packets now. */
6775 		/* No one is able to send more packets now. */
6776 		/*
6777 		 * Wait for all pending sends to finish.
6778 		 *
6779 		 * NOTE:
6780 		 * We will _not_ receive all pending send-dones if the
6781 		 * primary channel is revoked.
6782 		 */
6783 		while (hn_tx_ring_pending(txr) &&
6784 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6785 			pause("hnwtx", 1 /* 1 tick */);
6786 	}
6787 
6788 	/*
6789 	 * Disable RX.
6790 	 */
6791 	hn_disable_rx(sc);
6792 
6793 	/*
6794 	 * Drain RX/TX.
6795 	 */
6796 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6797 
6798 	/*
6799 	 * Drain any pending TX tasks.
6800 	 *
6801 	 * NOTE:
6802 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6803 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6804 	 */
6805 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6806 		txr = &sc->hn_tx_ring[i];
6807 
6808 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6809 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6810 	}
6811 }
6812 
6813 static void
6814 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6815 {
6816 
6817 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6818 }
6819 
6820 static void
6821 hn_suspend_mgmt(struct hn_softc *sc)
6822 {
6823 	struct task task;
6824 
6825 	HN_LOCK_ASSERT(sc);
6826 
6827 	/*
6828 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6829 	 * through hn_mgmt_taskq.
6830 	 */
6831 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6832 	vmbus_chan_run_task(sc->hn_prichan, &task);
6833 
6834 	/*
6835 	 * Make sure that all pending management tasks are completed.
6836 	 */
6837 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6838 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6839 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6840 }
6841 
6842 static void
6843 hn_suspend(struct hn_softc *sc)
6844 {
6845 
6846 	/* Disable polling. */
6847 	hn_polling(sc, 0);
6848 
6849 	/*
6850 	 * If the non-transparent mode VF is activated, the synthetic
6851 	 * device is receiving packets, so the data path of the
6852 	 * synthetic device must be suspended.
6853 	 */
6854 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6855 	    (sc->hn_flags & HN_FLAG_RXVF))
6856 		hn_suspend_data(sc);
6857 	hn_suspend_mgmt(sc);
6858 }
6859 
6860 static void
6861 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6862 {
6863 	int i;
6864 
6865 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6866 	    ("invalid TX ring count %d", tx_ring_cnt));
6867 
6868 	for (i = 0; i < tx_ring_cnt; ++i) {
6869 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6870 
6871 		mtx_lock(&txr->hn_tx_lock);
6872 		txr->hn_suspended = 0;
6873 		mtx_unlock(&txr->hn_tx_lock);
6874 	}
6875 }
6876 
6877 static void
6878 hn_resume_data(struct hn_softc *sc)
6879 {
6880 	int i;
6881 
6882 	HN_LOCK_ASSERT(sc);
6883 
6884 	/*
6885 	 * Re-enable RX.
6886 	 */
6887 	hn_rxfilter_config(sc);
6888 
6889 	/*
6890 	 * Make sure to clear suspend status on "all" TX rings,
6891 	 * since hn_tx_ring_inuse can be changed after
6892 	 * hn_suspend_data().
6893 	 */
6894 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6895 
6896 #ifdef HN_IFSTART_SUPPORT
6897 	if (!hn_use_if_start)
6898 #endif
6899 	{
6900 		/*
6901 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6902 		 * reduced.
6903 		 */
6904 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6905 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6906 	}
6907 
6908 	/*
6909 	 * Kick start TX.
6910 	 */
6911 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6912 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6913 
6914 		/*
6915 		 * Use txeof task, so that any pending oactive can be
6916 		 * cleared properly.
6917 		 */
6918 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6919 	}
6920 }
6921 
6922 static void
6923 hn_resume_mgmt(struct hn_softc *sc)
6924 {
6925 
6926 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6927 
6928 	/*
6929 	 * Kick off network change detection, if it was pending.
6930 	 * If no network change was pending, start link status
6931 	 * checks, which is more lightweight than network change
6932 	 * detection.
6933 	 */
6934 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6935 		hn_change_network(sc);
6936 	else
6937 		hn_update_link_status(sc);
6938 }
6939 
6940 static void
6941 hn_resume(struct hn_softc *sc)
6942 {
6943 
6944 	/*
6945 	 * If the non-transparent mode VF is activated, the synthetic
6946 	 * device has to receive packets, so the data path of the
6947 	 * synthetic device must be resumed.
6948 	 */
6949 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6950 	    (sc->hn_flags & HN_FLAG_RXVF))
6951 		hn_resume_data(sc);
6952 
6953 	/*
6954 	 * Don't resume link status change if VF is attached/activated.
6955 	 * - In the non-transparent VF mode, the synthetic device marks
6956 	 *   link down until the VF is deactivated; i.e. VF is down.
6957 	 * - In transparent VF mode, VF's media status is used until
6958 	 *   the VF is detached.
6959 	 */
6960 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6961 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6962 		hn_resume_mgmt(sc);
6963 
6964 	/*
6965 	 * Re-enable polling if this interface is running and
6966 	 * the polling is requested.
6967 	 */
6968 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6969 		hn_polling(sc, sc->hn_pollhz);
6970 }
6971 
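/*
 * Handle an RNDIS status (indication) message: media connect/disconnect
 * triggers a link status update, network change kicks off network change
 * detection.
 */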
6972 static void
6973 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6974 {
6975 	const struct rndis_status_msg *msg;
6976 	int ofs;
6977 
6978 	if (dlen < sizeof(*msg)) {
6979 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6980 		return;
6981 	}
6982 	msg = data;
6983 
6984 	switch (msg->rm_status) {
6985 	case RNDIS_STATUS_MEDIA_CONNECT:
6986 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6987 		hn_update_link_status(sc);
6988 		break;
6989 
6990 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6991 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6992 		/* Not really useful; ignore. */
6993 		break;
6994 
6995 	case RNDIS_STATUS_NETWORK_CHANGE:
6996 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6997 		if (dlen < ofs + msg->rm_stbuflen ||
6998 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6999 			if_printf(sc->hn_ifp, "network changed\n");
7000 		} else {
7001 			uint32_t change;
7002 
7003 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7004 			    sizeof(change));
7005 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7006 			    change);
7007 		}
7008 		hn_change_network(sc);
7009 		break;
7010 
7011 	default:
7012 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7013 		    msg->rm_status);
7014 		break;
7015 	}
7016 }
7017 
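/*
 * Parse the RNDIS per-packet-info elements and extract the VLAN,
 * checksum and hash information into 'info'.  Return EINVAL if the
 * per-packet-info is malformed.
 */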
7018 static int
7019 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7020 {
7021 	const struct rndis_pktinfo *pi = info_data;
7022 	uint32_t mask = 0;
7023 
7024 	while (info_dlen != 0) {
7025 		const void *data;
7026 		uint32_t dlen;
7027 
7028 		if (__predict_false(info_dlen < sizeof(*pi)))
7029 			return (EINVAL);
7030 		if (__predict_false(info_dlen < pi->rm_size))
7031 			return (EINVAL);
7032 		info_dlen -= pi->rm_size;
7033 
7034 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7035 			return (EINVAL);
7036 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7037 			return (EINVAL);
7038 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7039 		data = pi->rm_data;
7040 
7041 		switch (pi->rm_type) {
7042 		case NDIS_PKTINFO_TYPE_VLAN:
7043 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7044 				return (EINVAL);
7045 			info->vlan_info = *((const uint32_t *)data);
7046 			mask |= HN_RXINFO_VLAN;
7047 			break;
7048 
7049 		case NDIS_PKTINFO_TYPE_CSUM:
7050 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7051 				return (EINVAL);
7052 			info->csum_info = *((const uint32_t *)data);
7053 			mask |= HN_RXINFO_CSUM;
7054 			break;
7055 
7056 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7057 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7058 				return (EINVAL);
7059 			info->hash_value = *((const uint32_t *)data);
7060 			mask |= HN_RXINFO_HASHVAL;
7061 			break;
7062 
7063 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7064 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7065 				return (EINVAL);
7066 			info->hash_info = *((const uint32_t *)data);
7067 			mask |= HN_RXINFO_HASHINF;
7068 			break;
7069 
7070 		default:
7071 			goto next;
7072 		}
7073 
7074 		if (mask == HN_RXINFO_ALL) {
7075 			/* All found; done */
7076 			break;
7077 		}
7078 next:
7079 		pi = (const struct rndis_pktinfo *)
7080 		    ((const uint8_t *)pi + pi->rm_size);
7081 	}
7082 
7083 	/*
7084 	 * Final fixup.
7085 	 * - If there is no hash value, invalidate the hash info.
7086 	 */
7087 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7088 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7089 	return (0);
7090 }
7091 
7092 static __inline bool
7093 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7094 {
7095 
7096 	if (off < check_off) {
7097 		if (__predict_true(off + len <= check_off))
7098 			return (false);
7099 	} else if (off > check_off) {
7100 		if (__predict_true(check_off + check_len <= off))
7101 			return (false);
7102 	}
7103 	return (true);
7104 }
7105 
7106 static void
7107 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7108 {
7109 	const struct rndis_packet_msg *pkt;
7110 	struct hn_rxinfo info;
7111 	int data_off, pktinfo_off, data_len, pktinfo_len;
7112 
7113 	/*
7114 	 * Check length.
7115 	 */
7116 	if (__predict_false(dlen < sizeof(*pkt))) {
7117 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7118 		return;
7119 	}
7120 	pkt = data;
7121 
7122 	if (__predict_false(dlen < pkt->rm_len)) {
7123 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7124 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7125 		return;
7126 	}
7127 	if (__predict_false(pkt->rm_len <
7128 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7129 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7130 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7131 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7132 		    pkt->rm_pktinfolen);
7133 		return;
7134 	}
7135 	if (__predict_false(pkt->rm_datalen == 0)) {
7136 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7137 		return;
7138 	}
7139 
7140 	/*
7141 	 * Check offsets.
7142 	 */
7143 #define IS_OFFSET_INVALID(ofs)			\
7144 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7145 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7146 
7147 	/* XXX Hyper-V does not meet data offset alignment requirement */
7148 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7149 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7150 		    "data offset %u\n", pkt->rm_dataoffset);
7151 		return;
7152 	}
7153 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7154 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7155 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7156 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7157 		return;
7158 	}
7159 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7160 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7161 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7162 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7163 		return;
7164 	}
7165 
7166 #undef IS_OFFSET_INVALID
7167 
7168 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7169 	data_len = pkt->rm_datalen;
7170 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7171 	pktinfo_len = pkt->rm_pktinfolen;
7172 
7173 	/*
7174 	 * Check OOB coverage.
7175 	 */
7176 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7177 		int oob_off, oob_len;
7178 
7179 		if_printf(rxr->hn_ifp, "got oobdata\n");
7180 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7181 		oob_len = pkt->rm_oobdatalen;
7182 
7183 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7184 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7185 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7186 			    pkt->rm_len, oob_off, oob_len);
7187 			return;
7188 		}
7189 
7190 		/*
7191 		 * Check against data.
7192 		 */
7193 		if (hn_rndis_check_overlap(oob_off, oob_len,
7194 		    data_off, data_len)) {
7195 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7196 			    "oob overlaps data, oob abs %d len %d, "
7197 			    "data abs %d len %d\n",
7198 			    oob_off, oob_len, data_off, data_len);
7199 			return;
7200 		}
7201 
7202 		/*
7203 		 * Check against pktinfo.
7204 		 */
7205 		if (pktinfo_len != 0 &&
7206 		    hn_rndis_check_overlap(oob_off, oob_len,
7207 		    pktinfo_off, pktinfo_len)) {
7208 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7209 			    "oob overlaps pktinfo, oob abs %d len %d, "
7210 			    "pktinfo abs %d len %d\n",
7211 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7212 			return;
7213 		}
7214 	}
7215 
7216 	/*
7217 	 * Check per-packet-info coverage and find useful per-packet-info.
7218 	 */
7219 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7220 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7221 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7222 	if (__predict_true(pktinfo_len != 0)) {
7223 		bool overlap;
7224 		int error;
7225 
7226 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7227 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7228 			    "pktinfo overflow, msglen %u, "
7229 			    "pktinfo abs %d len %d\n",
7230 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7231 			return;
7232 		}
7233 
7234 		/*
7235 		 * Check packet info coverage.
7236 		 */
7237 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7238 		    data_off, data_len);
7239 		if (__predict_false(overlap)) {
7240 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7241 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7242 			    "data abs %d len %d\n",
7243 			    pktinfo_off, pktinfo_len, data_off, data_len);
7244 			return;
7245 		}
7246 
7247 		/*
7248 		 * Find useful per-packet-info.
7249 		 */
7250 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7251 		    pktinfo_len, &info);
7252 		if (__predict_false(error)) {
7253 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7254 			    "pktinfo\n");
7255 			return;
7256 		}
7257 	}
7258 
7259 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7260 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7261 		    "data overflow, msglen %u, data abs %d len %d\n",
7262 		    pkt->rm_len, data_off, data_len);
7263 		return;
7264 	}
7265 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7266 }
7267 
7268 static __inline void
7269 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7270 {
7271 	const struct rndis_msghdr *hdr;
7272 
7273 	if (__predict_false(dlen < sizeof(*hdr))) {
7274 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7275 		return;
7276 	}
7277 	hdr = data;
7278 
7279 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7280 		/* Hot data path. */
7281 		hn_rndis_rx_data(rxr, data, dlen);
7282 		/* Done! */
7283 		return;
7284 	}
7285 
7286 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7287 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7288 	else
7289 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7290 }
7291 
7292 static void
7293 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7294 {
7295 	const struct hn_nvs_hdr *hdr;
7296 
7297 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7298 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7299 		return;
7300 	}
7301 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7302 
7303 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7304 		/* Useless; ignore */
7305 		return;
7306 	}
7307 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7308 }
7309 
7310 static void
7311 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7312     const struct vmbus_chanpkt_hdr *pkt)
7313 {
7314 	struct hn_nvs_sendctx *sndc;
7315 
7316 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7317 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7318 	    VMBUS_CHANPKT_DATALEN(pkt));
7319 	/*
7320 	 * NOTE:
7321 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7322 	 * its callback.
7323 	 */
7324 }
7325 
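/*
 * Handle an NVS RXBUF channel packet: validate it, pass each contained
 * RNDIS message to hn_rndis_rxpkt(), then ack the consumed RXBUF back
 * to the host.
 */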
7326 static void
7327 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7328     const struct vmbus_chanpkt_hdr *pkthdr)
7329 {
7330 	const struct vmbus_chanpkt_rxbuf *pkt;
7331 	const struct hn_nvs_hdr *nvs_hdr;
7332 	int count, i, hlen;
7333 
7334 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7335 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7336 		return;
7337 	}
7338 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7339 
7340 	/* Make sure that this is a RNDIS message. */
7341 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7342 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7343 		    nvs_hdr->nvs_type);
7344 		return;
7345 	}
7346 
7347 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7348 	if (__predict_false(hlen < sizeof(*pkt))) {
7349 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7350 		return;
7351 	}
7352 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7353 
7354 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7355 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7356 		    pkt->cp_rxbuf_id);
7357 		return;
7358 	}
7359 
7360 	count = pkt->cp_rxbuf_cnt;
7361 	if (__predict_false(hlen <
7362 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7363 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7364 		return;
7365 	}
7366 
7367 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7368 	for (i = 0; i < count; ++i) {
7369 		int ofs, len;
7370 
7371 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7372 		len = pkt->cp_rxbuf[i].rb_len;
7373 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7374 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7375 			    "ofs %d, len %d\n", i, ofs, len);
7376 			continue;
7377 		}
7378 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7379 	}
7380 
7381 	/*
7382 	 * Ack the consumed RXBUF associated w/ this channel packet,
7383 	 * so that this RXBUF can be recycled by the hypervisor.
7384 	 */
7385 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7386 }
7387 
7388 static void
7389 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7390     uint64_t tid)
7391 {
7392 	struct hn_nvs_rndis_ack ack;
7393 	int retries, error;
7394 
7395 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7396 	ack.nvs_status = HN_NVS_STATUS_OK;
7397 
7398 	retries = 0;
7399 again:
7400 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7401 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7402 	if (__predict_false(error == EAGAIN)) {
7403 		/*
7404 		 * NOTE:
7405 		 * This should _not_ happen in real world, since the
7406 		 * consumption of the TX bufring from the TX path is
7407 		 * controlled.
7408 		 */
7409 		if (rxr->hn_ack_failed == 0)
7410 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7411 		rxr->hn_ack_failed++;
7412 		retries++;
7413 		if (retries < 10) {
7414 			DELAY(100);
7415 			goto again;
7416 		}
7417 		/* RXBUF leaks! */
7418 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7419 	}
7420 }
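
/*
 * Editor's illustrative sketch (not part of the driver): the ack path above
 * uses a bounded retry-with-delay loop for a transient EAGAIN.  A generic,
 * self-contained version of that pattern, with hypothetical names, could be
 * written as:
 *
 *	#define MY_MAX_RETRIES	10
 *	#define MY_RETRY_US	100
 *
 *	static int
 *	my_send_with_retry(int (*try_send)(void *), void *arg)
 *	{
 *		int error, retries;
 *
 *		for (retries = 0; retries < MY_MAX_RETRIES; ++retries) {
 *			error = try_send(arg);
 *			if (error != EAGAIN)
 *				return (error);
 *			DELAY(MY_RETRY_US);	// brief busy-wait, as above
 *		}
 *		return (EAGAIN);		// give up; caller logs a failure
 *	}
 */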
7421 
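/*
 * Per-channel callback: drain all pending channel packets, growing the
 * per-ring packet buffer on ENOBUFS, and dispatch each packet by type
 * (completion, RXBUF or inband notify) before rolling up RX/TX work.
 */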
7422 static void
7423 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7424 {
7425 	struct hn_rx_ring *rxr = xrxr;
7426 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7427 
7428 	for (;;) {
7429 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7430 		int error, pktlen;
7431 
7432 		pktlen = rxr->hn_pktbuf_len;
7433 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7434 		if (__predict_false(error == ENOBUFS)) {
7435 			void *nbuf;
7436 			int nlen;
7437 
7438 			/*
7439 			 * Expand channel packet buffer.
7440 			 *
7441 			 * XXX
7442 			 * Use M_WAITOK here, since allocation failure
7443 			 * is fatal.
7444 			 */
7445 			nlen = rxr->hn_pktbuf_len * 2;
7446 			while (nlen < pktlen)
7447 				nlen *= 2;
7448 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7449 
7450 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7451 			    rxr->hn_pktbuf_len, nlen);
7452 
7453 			free(rxr->hn_pktbuf, M_DEVBUF);
7454 			rxr->hn_pktbuf = nbuf;
7455 			rxr->hn_pktbuf_len = nlen;
7456 			/* Retry! */
7457 			continue;
7458 		} else if (__predict_false(error == EAGAIN)) {
7459 			/* No more channel packets; done! */
7460 			break;
7461 		}
7462 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7463 
7464 		switch (pkt->cph_type) {
7465 		case VMBUS_CHANPKT_TYPE_COMP:
7466 			hn_nvs_handle_comp(sc, chan, pkt);
7467 			break;
7468 
7469 		case VMBUS_CHANPKT_TYPE_RXBUF:
7470 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7471 			break;
7472 
7473 		case VMBUS_CHANPKT_TYPE_INBAND:
7474 			hn_nvs_handle_notify(sc, pkt);
7475 			break;
7476 
7477 		default:
7478 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7479 			    pkt->cph_type);
7480 			break;
7481 		}
7482 	}
7483 	hn_chan_rollup(rxr, rxr->hn_txr);
7484 }
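
/*
 * Editor's illustrative sketch (not part of the driver): the ENOBUFS branch
 * above grows the packet buffer geometrically until it is large enough for
 * the pending packet.  The sizing rule, isolated with hypothetical names,
 * amounts to:
 *
 *	// Double 'cur' until it can hold 'need' bytes; assumes cur > 0 and
 *	// that 'need' stays far below overflow range, as in the driver.
 *	static inline int
 *	my_grow_len(int cur, int need)
 *	{
 *		int nlen;
 *
 *		nlen = cur * 2;
 *		while (nlen < need)
 *			nlen *= 2;
 *		return (nlen);
 *	}
 */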
7485 
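/*
 * Driver-wide initialization, run once at SYSINIT time: allocate global
 * counters, sanitize loader tunables, set up the transparent VF map and,
 * when requested, the shared TX taskqueues.
 */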
7486 static void
7487 hn_sysinit(void *arg __unused)
7488 {
7489 	int i;
7490 
7491 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7492 
7493 #ifdef HN_IFSTART_SUPPORT
7494 	/*
7495 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7496 	 * mainly due to the IFF_DRV_OACTIVE flag.
7497 	 */
7498 	if (hn_xpnt_vf && hn_use_if_start) {
7499 		hn_use_if_start = 0;
7500 		printf("hn: transparent VF mode, if_transmit will be used, "
7501 		    "instead of if_start\n");
7502 	}
7503 #endif
7504 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7505 		printf("hn: invalid transparent VF attach routing "
7506 		    "wait timeout %d, reset to %d\n",
7507 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7508 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7509 	}
7510 
7511 	/*
7512 	 * Initialize VF map.
7513 	 */
7514 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7515 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7516 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7517 	    M_WAITOK | M_ZERO);
7518 
7519 	/*
7520 	 * Fix the # of TX taskqueues.
7521 	 */
7522 	if (hn_tx_taskq_cnt <= 0)
7523 		hn_tx_taskq_cnt = 1;
7524 	else if (hn_tx_taskq_cnt > mp_ncpus)
7525 		hn_tx_taskq_cnt = mp_ncpus;
7526 
7527 	/*
7528 	 * Fix the TX taskqueue mode.
7529 	 */
7530 	switch (hn_tx_taskq_mode) {
7531 	case HN_TX_TASKQ_M_INDEP:
7532 	case HN_TX_TASKQ_M_GLOBAL:
7533 	case HN_TX_TASKQ_M_EVTTQ:
7534 		break;
7535 	default:
7536 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7537 		break;
7538 	}
7539 
7540 	if (vm_guest != VM_GUEST_HV)
7541 		return;
7542 
7543 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7544 		return;
7545 
7546 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7547 	    M_DEVBUF, M_WAITOK);
7548 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7549 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7550 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7551 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7552 		    "hn tx%d", i);
7553 	}
7554 }
7555 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7556 
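/*
 * Undo hn_sysinit: free the shared TX taskqueues, the VF map and the
 * global counter.
 */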
7557 static void
7558 hn_sysuninit(void *arg __unused)
7559 {
7560 
7561 	if (hn_tx_taskque != NULL) {
7562 		int i;
7563 
7564 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7565 			taskqueue_free(hn_tx_taskque[i]);
7566 		free(hn_tx_taskque, M_DEVBUF);
7567 	}
7568 
7569 	if (hn_vfmap != NULL)
7570 		free(hn_vfmap, M_DEVBUF);
7571 	rm_destroy(&hn_vfmap_lock);
7572 
7573 	counter_u64_free(hn_udpcs_fixup);
7574 }
7575 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7576