xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision e12ff891366cf94db4bfe4c2c810b26a5531053d)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/rmlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
87 
88 #include <net/bpf.h>
89 #include <net/ethernet.h>
90 #include <net/if.h>
91 #include <net/if_dl.h>
92 #include <net/if_media.h>
93 #include <net/if_types.h>
94 #include <net/if_var.h>
95 #include <net/rndis.h>
96 #ifdef RSS
97 #include <net/rss_config.h>
98 #endif
99 
100 #include <netinet/in_systm.h>
101 #include <netinet/in.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip6.h>
104 #include <netinet/tcp.h>
105 #include <netinet/tcp_lro.h>
106 #include <netinet/udp.h>
107 
108 #include <dev/hyperv/include/hyperv.h>
109 #include <dev/hyperv/include/hyperv_busdma.h>
110 #include <dev/hyperv/include/vmbus.h>
111 #include <dev/hyperv/include/vmbus_xact.h>
112 
113 #include <dev/hyperv/netvsc/ndis.h>
114 #include <dev/hyperv/netvsc/if_hnreg.h>
115 #include <dev/hyperv/netvsc/if_hnvar.h>
116 #include <dev/hyperv/netvsc/hn_nvs.h>
117 #include <dev/hyperv/netvsc/hn_rndis.h>
118 
119 #include "vmbus_if.h"
120 
/*
 * Defining HN_IFSTART_SUPPORT compiles in the if_start based TX path
 * (the sections of this file guarded by #ifdef HN_IFSTART_SUPPORT).
 */
121 #define HN_IFSTART_SUPPORT
122 
123 #define HN_RING_CNT_DEF_MAX		8
124 
125 #define HN_VFMAP_SIZE_DEF		8
126 
127 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
128 
129 /* YYY should get it from the underlying channel */
130 #define HN_TX_DESC_CNT			512
131 
/*
 * Size of an RNDIS packet message plus one per-packet-info record each
 * for the hash value, VLAN, LSO2 and TX checksum information.
 */
132 #define HN_RNDIS_PKT_LEN					\
133 	(sizeof(struct rndis_packet_msg) +			\
134 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
137 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
138 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
139 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
140 
141 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
142 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
143 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
144 /* -1 for RNDIS packet message */
145 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
146 
147 #define HN_DIRECT_TX_SIZE_DEF		128
148 
149 #define HN_EARLY_TXEOF_THRESH		8
150 
151 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
152 
153 #define HN_LROENT_CNT_DEF		128
154 
155 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
156 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
157 /* YYY 2*MTU is a bit rough, but should be good enough. */
158 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
159 
160 #define HN_LRO_ACKCNT_DEF		1
161 
/*
 * Softc lock helpers; sc->hn_lock is an sx(9) lock named after the
 * device's nameunit.
 *
 * HN_LOCK() does not sleep in sx_xlock(); it polls sx_try_xlock() and
 * calls DELAY(1000) between attempts until the exclusive lock is taken.
 * NOTE(review): the reason for polling instead of sleeping is not
 * visible in this part of the file.
 */
162 #define HN_LOCK_INIT(sc)		\
163 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
164 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
165 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
166 #define HN_LOCK(sc)					\
167 do {							\
168 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
169 		DELAY(1000);				\
170 } while (0)
171 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
172 
/*
 * Checksum-offload flag masks.  The *_HWASSIST() forms report which of
 * those flags are set in hn_csum_assist of TX ring 0 only.
 */
173 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
174 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
175 #define HN_CSUM_IP_HWASSIST(sc)		\
176 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
177 #define HN_CSUM_IP6_HWASSIST(sc)	\
178 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
179 
/*
 * Packet sizes including HN_RNDIS_PKT_LEN, rounded up to `align' with
 * roundup2() (which expects a power-of-2 alignment).
 * HN_PKTSIZE_MIN() is based on a minimum-length, VLAN-tagged Ethernet
 * frame without CRC; HN_PKTSIZE() on the mbuf's m_pkthdr.len.
 */
180 #define HN_PKTSIZE_MIN(align)		\
181 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
182 	    HN_RNDIS_PKT_LEN, (align))
183 #define HN_PKTSIZE(m, align)		\
184 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
185 
/*
 * Map a ring index to a CPU: via the RSS bucket -> CPU mapping when the
 * kernel is built with RSS, otherwise round-robin starting at
 * sc->hn_cpu, modulo mp_ncpus.
 */
186 #ifdef RSS
187 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
188 #else
189 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
190 #endif
191 
/*
 * TX descriptor.  One descriptor tracks an mbuf handed to a TX ring
 * together with the RNDIS packet message buffer and DMA maps used to
 * send it.
 */
192 struct hn_txdesc {
193 #ifndef HN_USE_TXDESC_BUFRING
	/* SLIST linkage; only present when buf_ring is not used for txdescs. */
194 	SLIST_ENTRY(hn_txdesc)		link;
195 #endif
	/* Linkage on another txdesc's agg_list. */
196 	STAILQ_ENTRY(hn_txdesc)		agg_link;
197 
198 	/* Aggregated txdescs, in sending order. */
199 	STAILQ_HEAD(, hn_txdesc)	agg_list;
200 
201 	/* The oldest packet, if transmission aggregation happens. */
202 	struct mbuf			*m;
203 	struct hn_tx_ring		*txr;
204 	int				refs;
205 	uint32_t			flags;	/* HN_TXD_FLAG_ */
206 	struct hn_nvs_sendctx		send_ctx;
	/*
	 * Chimney sending buffer index and size.  hn_txpkt_sglist() asserts
	 * HN_NVS_CHIM_IDX_INVALID / 0 here; hn_txpkt_chim() asserts a valid
	 * index and a size > 0.
	 */
207 	uint32_t			chim_index;
208 	int				chim_size;
209 
210 	bus_dmamap_t			data_dmap;
211 
212 	bus_addr_t			rndis_pkt_paddr;
213 	struct rndis_packet_msg		*rndis_pkt;
214 	bus_dmamap_t			rndis_pkt_dmap;
215 };
216 
/* Bits for hn_txdesc.flags. */
217 #define HN_TXD_FLAG_ONLIST		0x0001
218 #define HN_TXD_FLAG_DMAMAP		0x0002
219 #define HN_TXD_FLAG_ONAGG		0x0004
220 
/*
 * Per-packet information for a received packet: VLAN, checksum and
 * hash info/value words.
 * NOTE(review): presumably filled in by hn_rndis_rxinfo() and consumed
 * by hn_rxpkt(); confirm against those functions.
 */
221 struct hn_rxinfo {
222 	uint32_t			vlan_info;
223 	uint32_t			csum_info;
224 	uint32_t			hash_info;
225 	uint32_t			hash_value;
226 };
227 
/* Argument bundle pairing an RX ring with a VF ifnet. */
228 struct hn_rxvf_setarg {
229 	struct hn_rx_ring	*rxr;
230 	struct ifnet		*vf_ifp;
231 };
232 
/* One bit per hn_rxinfo field; HN_RXINFO_ALL is the union of all four. */
233 #define HN_RXINFO_VLAN			0x0001
234 #define HN_RXINFO_CSUM			0x0002
235 #define HN_RXINFO_HASHINF		0x0004
236 #define HN_RXINFO_HASHVAL		0x0008
237 #define HN_RXINFO_ALL			\
238 	(HN_RXINFO_VLAN |		\
239 	 HN_RXINFO_CSUM |		\
240 	 HN_RXINFO_HASHINF |		\
241 	 HN_RXINFO_HASHVAL)
242 
/* Values marking the corresponding hn_rxinfo field as invalid. */
243 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID	0
245 #define HN_NDIS_HASH_INFO_INVALID	0
246 
247 static int			hn_probe(device_t);
248 static int			hn_attach(device_t);
249 static int			hn_detach(device_t);
250 static int			hn_shutdown(device_t);
251 static void			hn_chan_callback(struct vmbus_channel *,
252 				    void *);
253 
254 static void			hn_init(void *);
255 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void			hn_start(struct ifnet *);
258 #endif
259 static int			hn_transmit(struct ifnet *, struct mbuf *);
260 static void			hn_xmit_qflush(struct ifnet *);
261 static int			hn_ifmedia_upd(struct ifnet *);
262 static void			hn_ifmedia_sts(struct ifnet *,
263 				    struct ifmediareq *);
264 
265 static void			hn_ifnet_event(void *, struct ifnet *, int);
266 static void			hn_ifaddr_event(void *, struct ifnet *);
267 static void			hn_ifnet_attevent(void *, struct ifnet *);
268 static void			hn_ifnet_detevent(void *, struct ifnet *);
269 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270 
271 static bool			hn_ismyvf(const struct hn_softc *,
272 				    const struct ifnet *);
273 static void			hn_rxvf_change(struct hn_softc *,
274 				    struct ifnet *, bool);
275 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void			hn_rxvf_set_task(void *, int);
277 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280 				    struct ifreq *);
281 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool			hn_xpnt_vf_isready(struct hn_softc *);
283 static void			hn_xpnt_vf_setready(struct hn_softc *);
284 static void			hn_xpnt_vf_init_taskfunc(void *, int);
285 static void			hn_xpnt_vf_init(struct hn_softc *);
286 static void			hn_xpnt_vf_setenable(struct hn_softc *);
287 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void			hn_vf_rss_restore(struct hn_softc *);
290 
291 static int			hn_rndis_rxinfo(const void *, int,
292 				    struct hn_rxinfo *);
293 static void			hn_rndis_rx_data(struct hn_rx_ring *,
294 				    const void *, int);
295 static void			hn_rndis_rx_status(struct hn_softc *,
296 				    const void *, int);
297 static void			hn_rndis_init_fixat(struct hn_softc *, int);
298 
299 static void			hn_nvs_handle_notify(struct hn_softc *,
300 				    const struct vmbus_chanpkt_hdr *);
301 static void			hn_nvs_handle_comp(struct hn_softc *,
302 				    struct vmbus_channel *,
303 				    const struct vmbus_chanpkt_hdr *);
304 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 				    struct vmbus_channel *,
306 				    const struct vmbus_chanpkt_hdr *);
307 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 				    struct vmbus_channel *, uint64_t);
309 
310 #if __FreeBSD_version >= 1100099
311 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313 #endif
314 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318 #else
319 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320 #endif
321 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 #ifndef RSS
329 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
331 #endif
332 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
343 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
346 
347 static void			hn_stop(struct hn_softc *, bool);
348 static void			hn_init_locked(struct hn_softc *);
349 static int			hn_chan_attach(struct hn_softc *,
350 				    struct vmbus_channel *);
351 static void			hn_chan_detach(struct hn_softc *,
352 				    struct vmbus_channel *);
353 static int			hn_attach_subchans(struct hn_softc *);
354 static void			hn_detach_allchans(struct hn_softc *);
355 static void			hn_chan_rollup(struct hn_rx_ring *,
356 				    struct hn_tx_ring *);
357 static void			hn_set_ring_inuse(struct hn_softc *, int);
358 static int			hn_synth_attach(struct hn_softc *, int);
359 static void			hn_synth_detach(struct hn_softc *);
360 static int			hn_synth_alloc_subchans(struct hn_softc *,
361 				    int *);
362 static bool			hn_synth_attachable(const struct hn_softc *);
363 static void			hn_suspend(struct hn_softc *);
364 static void			hn_suspend_data(struct hn_softc *);
365 static void			hn_suspend_mgmt(struct hn_softc *);
366 static void			hn_resume(struct hn_softc *);
367 static void			hn_resume_data(struct hn_softc *);
368 static void			hn_resume_mgmt(struct hn_softc *);
369 static void			hn_suspend_mgmt_taskfunc(void *, int);
370 static void			hn_chan_drain(struct hn_softc *,
371 				    struct vmbus_channel *);
372 static void			hn_disable_rx(struct hn_softc *);
373 static void			hn_drain_rxtx(struct hn_softc *, int);
374 static void			hn_polling(struct hn_softc *, u_int);
375 static void			hn_chan_polling(struct vmbus_channel *, u_int);
376 static void			hn_mtu_change_fixup(struct hn_softc *);
377 
378 static void			hn_update_link_status(struct hn_softc *);
379 static void			hn_change_network(struct hn_softc *);
380 static void			hn_link_taskfunc(void *, int);
381 static void			hn_netchg_init_taskfunc(void *, int);
382 static void			hn_netchg_status_taskfunc(void *, int);
383 static void			hn_link_status(struct hn_softc *);
384 
385 static int			hn_create_rx_data(struct hn_softc *, int);
386 static void			hn_destroy_rx_data(struct hn_softc *);
387 static int			hn_check_iplen(const struct mbuf *, int);
388 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
389 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
390 static int			hn_rxfilter_config(struct hn_softc *);
391 static int			hn_rss_reconfig(struct hn_softc *);
392 static void			hn_rss_ind_fixup(struct hn_softc *);
393 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
394 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
395 				    int, const struct hn_rxinfo *);
396 static uint32_t			hn_rss_type_fromndis(uint32_t);
397 static uint32_t			hn_rss_type_tondis(uint32_t);
398 
399 static int			hn_tx_ring_create(struct hn_softc *, int);
400 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
401 static int			hn_create_tx_data(struct hn_softc *, int);
402 static void			hn_fixup_tx_data(struct hn_softc *);
403 static void			hn_fixup_rx_data(struct hn_softc *);
404 static void			hn_destroy_tx_data(struct hn_softc *);
405 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
406 static void			hn_txdesc_gc(struct hn_tx_ring *,
407 				    struct hn_txdesc *);
408 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
409 				    struct hn_txdesc *, struct mbuf **);
410 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
411 				    struct hn_txdesc *);
412 static void			hn_set_chim_size(struct hn_softc *, int);
413 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
414 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
415 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
416 static void			hn_resume_tx(struct hn_softc *, int);
417 static void			hn_set_txagg(struct hn_softc *);
418 static void			*hn_try_txagg(struct ifnet *,
419 				    struct hn_tx_ring *, struct hn_txdesc *,
420 				    int);
421 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
422 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
423 				    struct hn_softc *, struct vmbus_channel *,
424 				    const void *, int);
425 static int			hn_txpkt_sglist(struct hn_tx_ring *,
426 				    struct hn_txdesc *);
427 static int			hn_txpkt_chim(struct hn_tx_ring *,
428 				    struct hn_txdesc *);
429 static int			hn_xmit(struct hn_tx_ring *, int);
430 static void			hn_xmit_taskfunc(void *, int);
431 static void			hn_xmit_txeof(struct hn_tx_ring *);
432 static void			hn_xmit_txeof_taskfunc(void *, int);
433 #ifdef HN_IFSTART_SUPPORT
434 static int			hn_start_locked(struct hn_tx_ring *, int);
435 static void			hn_start_taskfunc(void *, int);
436 static void			hn_start_txeof(struct hn_tx_ring *);
437 static void			hn_start_txeof_taskfunc(void *, int);
438 #endif
439 
/*
 * hw.hn sysctl tree: driver-wide settings.  Entries declared with
 * CTLFLAG_RDTUN are loader tunables that are read-only at runtime;
 * CTLFLAG_RWTUN entries may also be changed at runtime.
 */
440 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
441     "Hyper-V network interface");
442 
443 /* Trust TCP segment verification on the host side. */
444 static int			hn_trust_hosttcp = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
446     &hn_trust_hosttcp, 0,
447     "Trust tcp segement verification on host side, "
448     "when csum info is missing (global setting)");
449 
450 /* Trust UDP datagram verification on the host side. */
451 static int			hn_trust_hostudp = 1;
452 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
453     &hn_trust_hostudp, 0,
454     "Trust udp datagram verification on host side, "
455     "when csum info is missing (global setting)");
456 
457 /* Trust IP packet verification on the host side. */
458 static int			hn_trust_hostip = 1;
459 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
460     &hn_trust_hostip, 0,
461     "Trust ip packet verification on host side, "
462     "when csum info is missing (global setting)");
463 
464 /*
465  * Offload UDP/IPv4 checksum.
466  */
467 static int			hn_enable_udp4cs = 1;
468 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
469     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 
471 /*
472  * Offload UDP/IPv6 checksum.
473  */
474 static int			hn_enable_udp6cs = 1;
475 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
476     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 
478 /* Stats: incremented by hn_set_hlen() for each software UDP checksum. */
479 static counter_u64_t		hn_udpcs_fixup;
480 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
481     &hn_udpcs_fixup, "# of UDP checksum fixup");
482 
483 /*
484  * See hn_set_hlen(): IPv4 UDP datagrams without IP_DF whose packet length
485  * exceeds this value plus the Ethernet header are checksummed in software.
486  * This value is for Azure.  For Hyper-V, set this above
487  * 65536 to disable UDP datagram checksum fixup.
488  */
489 static int			hn_udpcs_fixup_mtu = 1420;
490 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
491     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
492 
493 /* Limit TSO burst size */
494 static int			hn_tso_maxlen = IP_MAXPACKET;
495 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
496     &hn_tso_maxlen, 0, "TSO burst limit");
497 
498 /* Limit chimney send size */
499 static int			hn_tx_chimney_size = 0;
500 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
501     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
502 
503 /* Limit the size of packet for direct transmission */
504 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
505 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
506     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
507 
508 /* # of LRO entries per RX ring */
509 #if defined(INET) || defined(INET6)
510 #if __FreeBSD_version >= 1100095
511 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
512 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
513     &hn_lro_entry_count, 0, "LRO entry count");
514 #endif
515 #endif
516 
/* # of TX taskqueues */
517 static int			hn_tx_taskq_cnt = 1;
518 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
519     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
520 
/* Values accepted by hw.hn.tx_taskq_mode (see its description string). */
521 #define HN_TX_TASKQ_M_INDEP	0
522 #define HN_TX_TASKQ_M_GLOBAL	1
523 #define HN_TX_TASKQ_M_EVTTQ	2
524 
525 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
526 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
527     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
528     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
529 
/*
 * Read-only report of the compile-time HN_USE_TXDESC_BUFRING choice:
 * 1 when it is defined, 0 otherwise.
 */
530 #ifndef HN_USE_TXDESC_BUFRING
531 static int			hn_use_txdesc_bufring = 0;
532 #else
533 static int			hn_use_txdesc_bufring = 1;
534 #endif
535 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
536     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
537 
538 #ifdef HN_IFSTART_SUPPORT
539 /* Use ifnet.if_start instead of ifnet.if_transmit */
540 static int			hn_use_if_start = 0;
541 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
542     &hn_use_if_start, 0, "Use if_start TX method");
543 #endif
544 
545 /* # of channels to use */
546 static int			hn_chan_cnt = 0;
547 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
548     &hn_chan_cnt, 0,
549     "# of channels to use; each channel has one RX ring and one TX ring");
550 
551 /* # of transmit rings to use */
552 static int			hn_tx_ring_cnt = 0;
553 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
554     &hn_tx_ring_cnt, 0, "# of TX rings to use");
555 
556 /* Software TX ring depth */
557 static int			hn_tx_swq_depth = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
559     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
560 
561 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
562 #if __FreeBSD_version >= 1100095
563 static u_int			hn_lro_mbufq_depth = 0;
564 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
565     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 #endif
567 
568 /* Packet transmission aggregation size limit */
569 static int			hn_tx_agg_size = -1;
570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
571     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
572 
573 /* Packet transmission aggregation count limit */
574 static int			hn_tx_agg_pkts = -1;
575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
576     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 
578 /* VF list (read-only string produced by hn_vflist_sysctl()) */
579 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
580     0, 0, hn_vflist_sysctl, "A", "VF list");
581 
582 /* VF mapping (read-only string produced by hn_vfmap_sysctl()) */
583 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
584     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585 
586 /* Transparent VF mode; enabled (1) by default */
587 static int			hn_xpnt_vf = 1;
588 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
589     &hn_xpnt_vf, 0, "Transparent VF mod");
590 
591 /* Accurate BPF support for Transparent VF */
592 static int			hn_xpnt_vf_accbpf = 0;
593 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
594     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
595 
596 /* Extra wait for transparent VF attach routine; unit: seconds. */
597 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
598 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
599     &hn_xpnt_vf_attwait, 0,
600     "Extra wait for transparent VF attach routing; unit: seconds");
601 
/* Driver-wide state shared by all hn(4) instances. */
602 static u_int			hn_cpu_index;	/* next CPU for channel */
603 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
604 
/*
 * VF map: an array of ifnet pointers with hn_vfmap_size entries.
 * NOTE(review): presumably protected by hn_vfmap_lock -- confirm
 * against the code that reads and updates the map.
 */
605 static struct rmlock		hn_vfmap_lock;
606 static int			hn_vfmap_size;
607 static struct ifnet		**hn_vfmap;
608 
/*
 * Default Toeplitz RSS hash key (NDIS_HASH_KEYSIZE_TOEPLITZ bytes).
 * Only compiled when the kernel is built without the RSS option.
 */
609 #ifndef RSS
610 static const uint8_t
611 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
612 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
613 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
614 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
615 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
616 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
617 };
618 #endif	/* !RSS */
619 
/*
 * Hyper-V GUID, stored as 16 raw bytes.
 * NOTE(review): presumably the device type GUID matched by hn_probe()
 * -- confirm there.
 */
620 static const struct hyperv_guid	hn_guid = {
621 	.hv_guid = {
622 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
623 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
624 };
625 
/* newbus method table: probe/attach/detach/shutdown entry points. */
626 static device_method_t hn_methods[] = {
627 	/* Device interface */
628 	DEVMETHOD(device_probe,		hn_probe),
629 	DEVMETHOD(device_attach,	hn_attach),
630 	DEVMETHOD(device_detach,	hn_detach),
631 	DEVMETHOD(device_shutdown,	hn_shutdown),
632 	DEVMETHOD_END
633 };
634 
/* Driver "hn"; each instance's softc is a struct hn_softc. */
635 static driver_t hn_driver = {
636 	"hn",
637 	hn_methods,
638 	sizeof(struct hn_softc)
639 };
640 
641 static devclass_t hn_devclass;
642 
/* Attach the driver to the vmbus bus; the module depends on vmbus. */
643 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
644 MODULE_VERSION(hn, 1);
645 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
646 
#if __FreeBSD_version >= 1100099
/*
 * Store `lenlim' as the LRO length limit (hn_lro.lro_length_lim) of
 * every RX ring of `sc', i.e. rings 0 .. hn_rx_ring_cnt - 1.
 */
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int ridx = 0;

	while (ridx < sc->hn_rx_ring_cnt) {
		struct hn_rx_ring *rxr = &sc->hn_rx_ring[ridx];

		rxr->hn_lro.lro_length_lim = lenlim;
		ridx++;
	}
}
#endif
657 
658 static int
659 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
660 {
661 
662 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
663 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
664 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
665 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
666 }
667 
668 static int
669 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
670 {
671 	struct hn_nvs_rndis rndis;
672 
673 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size > 0, ("invalid rndis chim txd"));
675 
676 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
677 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
678 	rndis.nvs_chim_idx = txd->chim_index;
679 	rndis.nvs_chim_sz = txd->chim_size;
680 
681 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
682 	    &rndis, sizeof(rndis), &txd->send_ctx));
683 }
684 
685 static __inline uint32_t
686 hn_chim_alloc(struct hn_softc *sc)
687 {
688 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
689 	u_long *bmap = sc->hn_chim_bmap;
690 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
691 
692 	for (i = 0; i < bmap_cnt; ++i) {
693 		int idx;
694 
695 		idx = ffsl(~bmap[i]);
696 		if (idx == 0)
697 			continue;
698 
699 		--idx; /* ffsl is 1-based */
700 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
701 		    ("invalid i %d and idx %d", i, idx));
702 
703 		if (atomic_testandset_long(&bmap[i], idx))
704 			continue;
705 
706 		ret = i * LONG_BIT + idx;
707 		break;
708 	}
709 	return (ret);
710 }
711 
712 static __inline void
713 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
714 {
715 	u_long mask;
716 	uint32_t idx;
717 
718 	idx = chim_idx / LONG_BIT;
719 	KASSERT(idx < sc->hn_chim_bmap_cnt,
720 	    ("invalid chimney index 0x%x", chim_idx));
721 
722 	mask = 1UL << (chim_idx % LONG_BIT);
723 	KASSERT(sc->hn_chim_bmap[idx] & mask,
724 	    ("index bitmap 0x%lx, chimney index %u, "
725 	     "bitmap idx %d, bitmask 0x%lx",
726 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
727 
728 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
729 }
730 
731 #if defined(INET6) || defined(INET)
732 
/*
 * Make sure the first `len' bytes of the mbuf chain `m' are contiguous
 * in the first mbuf; m_pullup() is only called when m_len is too short.
 *
 * NOTE:
 * - `m' is reassigned to the result of m_pullup(), which may be a
 *   different mbuf; pointers obtained from the chain before this macro
 *   must be re-fetched afterwards.
 * - If m_pullup() returns NULL, the *enclosing function* returns NULL,
 *   so this macro may only be used in functions returning a pointer.
 */
733 #define PULLUP_HDR(m, len)				\
734 do {							\
735 	if (__predict_false((m)->m_len < (len))) {	\
736 		(m) = m_pullup((m), (len));		\
737 		if ((m) == NULL)			\
738 			return (NULL);			\
739 	}						\
740 } while (0)
741 
742 /*
743  * NOTE: If this function failed, the m_head would be freed.
744  */
745 static __inline struct mbuf *
746 hn_tso_fixup(struct mbuf *m_head)
747 {
748 	struct ether_vlan_header *evl;
749 	struct tcphdr *th;
750 	int ehlen;
751 
752 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
753 
754 	PULLUP_HDR(m_head, sizeof(*evl));
755 	evl = mtod(m_head, struct ether_vlan_header *);
756 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
757 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
758 	else
759 		ehlen = ETHER_HDR_LEN;
760 	m_head->m_pkthdr.l2hlen = ehlen;
761 
762 #ifdef INET
763 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
764 		struct ip *ip;
765 		int iphlen;
766 
767 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
768 		ip = mtodo(m_head, ehlen);
769 		iphlen = ip->ip_hl << 2;
770 		m_head->m_pkthdr.l3hlen = iphlen;
771 
772 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
773 		th = mtodo(m_head, ehlen + iphlen);
774 
775 		ip->ip_len = 0;
776 		ip->ip_sum = 0;
777 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
778 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
779 	}
780 #endif
781 #if defined(INET6) && defined(INET)
782 	else
783 #endif
784 #ifdef INET6
785 	{
786 		struct ip6_hdr *ip6;
787 
788 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
789 		ip6 = mtodo(m_head, ehlen);
790 		if (ip6->ip6_nxt != IPPROTO_TCP) {
791 			m_freem(m_head);
792 			return (NULL);
793 		}
794 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
795 
796 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
797 		th = mtodo(m_head, ehlen + sizeof(*ip6));
798 
799 		ip6->ip6_plen = 0;
800 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
801 	}
802 #endif
803 	return (m_head);
804 }
805 
/*
 * Record the link-layer and network-layer header lengths of m_head in
 * its packet header (m_pkthdr.l2hlen and m_pkthdr.l3hlen).
 *
 * - l2hlen is the Ethernet header length, including the VLAN
 *   encapsulation when the frame's encapsulation protocol is
 *   ETHERTYPE_VLAN.
 * - For IPv4 TCP/UDP checksum requests (INET builds), l3hlen is taken
 *   from the IP header's ip_hl field.
 * - On INET6 builds the remaining packets are read as IPv6; l3hlen is
 *   the fixed IPv6 header size, and packets whose next header is
 *   neither TCP nor UDP are freed and rejected.
 *
 * For IPv4 UDP datagrams hitting the Azure limitation described below,
 * the UDP checksum is computed in software here and CSUM_IP_UDP is
 * cleared from csum_flags.
 *
 * Returns m_head on success and NULL on failure.
 *
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	/*
	 * PULLUP_HDR() is defined outside this function; presumably it
	 * makes the requested leading bytes contiguous and returns from
	 * this function on failure -- confirm at its definition.
	 */
	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	/*
	 * NOTE(review): ntohs() is applied to a host-order constant here;
	 * htons() would be the conventional spelling.
	 */
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		/* ip_hl counts 32-bit words; convert it to bytes. */
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions meet:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fallback to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			/* Offset of the UDP header from the packet start. */
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			/*
			 * Store the checksum computed over everything past
			 * 'off' at csum_data bytes into the UDP header, then
			 * withdraw the offload request.
			 */
			*(uint16_t *)(m_head->m_data + off +
                            m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
	/*
	 * With both INET and INET6 configured, the IPv6 block below is the
	 * 'else' arm of the IPv4 check above; with INET6 alone it runs
	 * unconditionally.
	 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		/* Only TCP or UDP directly following the IPv6 header. */
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
874 
875 /*
876  * NOTE: If this function failed, the m_head would be freed.
877  */
878 static __inline struct mbuf *
879 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
880 {
881 	const struct tcphdr *th;
882 	int ehlen, iphlen;
883 
884 	*tcpsyn = 0;
885 	ehlen = m_head->m_pkthdr.l2hlen;
886 	iphlen = m_head->m_pkthdr.l3hlen;
887 
888 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
889 	th = mtodo(m_head, ehlen + iphlen);
890 	if (th->th_flags & TH_SYN)
891 		*tcpsyn = 1;
892 	return (m_head);
893 }
894 
895 #undef PULLUP_HDR
896 
897 #endif	/* INET6 || INET */
898 
899 static int
900 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
901 {
902 	int error = 0;
903 
904 	HN_LOCK_ASSERT(sc);
905 
906 	if (sc->hn_rx_filter != filter) {
907 		error = hn_rndis_set_rxfilter(sc, filter);
908 		if (!error)
909 			sc->hn_rx_filter = filter;
910 	}
911 	return (error);
912 }
913 
914 static int
915 hn_rxfilter_config(struct hn_softc *sc)
916 {
917 	struct ifnet *ifp = sc->hn_ifp;
918 	uint32_t filter;
919 
920 	HN_LOCK_ASSERT(sc);
921 
922 	/*
923 	 * If the non-transparent mode VF is activated, we don't know how
924 	 * its RX filter is configured, so stick the synthetic device in
925 	 * the promiscous mode.
926 	 */
927 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
928 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
929 	} else {
930 		filter = NDIS_PACKET_TYPE_DIRECTED;
931 		if (ifp->if_flags & IFF_BROADCAST)
932 			filter |= NDIS_PACKET_TYPE_BROADCAST;
933 		/* TODO: support multicast list */
934 		if ((ifp->if_flags & IFF_ALLMULTI) ||
935 		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
936 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
937 	}
938 	return (hn_set_rxfilter(sc, filter));
939 }
940 
941 static void
942 hn_set_txagg(struct hn_softc *sc)
943 {
944 	uint32_t size, pkts;
945 	int i;
946 
947 	/*
948 	 * Setup aggregation size.
949 	 */
950 	if (sc->hn_agg_size < 0)
951 		size = UINT32_MAX;
952 	else
953 		size = sc->hn_agg_size;
954 
955 	if (sc->hn_rndis_agg_size < size)
956 		size = sc->hn_rndis_agg_size;
957 
958 	/* NOTE: We only aggregate packets using chimney sending buffers. */
959 	if (size > (uint32_t)sc->hn_chim_szmax)
960 		size = sc->hn_chim_szmax;
961 
962 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
963 		/* Disable */
964 		size = 0;
965 		pkts = 0;
966 		goto done;
967 	}
968 
969 	/* NOTE: Type of the per TX ring setting is 'int'. */
970 	if (size > INT_MAX)
971 		size = INT_MAX;
972 
973 	/*
974 	 * Setup aggregation packet count.
975 	 */
976 	if (sc->hn_agg_pkts < 0)
977 		pkts = UINT32_MAX;
978 	else
979 		pkts = sc->hn_agg_pkts;
980 
981 	if (sc->hn_rndis_agg_pkts < pkts)
982 		pkts = sc->hn_rndis_agg_pkts;
983 
984 	if (pkts <= 1) {
985 		/* Disable */
986 		size = 0;
987 		pkts = 0;
988 		goto done;
989 	}
990 
991 	/* NOTE: Type of the per TX ring setting is 'short'. */
992 	if (pkts > SHRT_MAX)
993 		pkts = SHRT_MAX;
994 
995 done:
996 	/* NOTE: Type of the per TX ring setting is 'short'. */
997 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
998 		/* Disable */
999 		size = 0;
1000 		pkts = 0;
1001 	}
1002 
1003 	if (bootverbose) {
1004 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1005 		    size, pkts, sc->hn_rndis_agg_align);
1006 	}
1007 
1008 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1009 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1010 
1011 		mtx_lock(&txr->hn_tx_lock);
1012 		txr->hn_agg_szmax = size;
1013 		txr->hn_agg_pktmax = pkts;
1014 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1015 		mtx_unlock(&txr->hn_tx_lock);
1016 	}
1017 }
1018 
1019 static int
1020 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1021 {
1022 
1023 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1024 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1025 		return txr->hn_txdesc_cnt;
1026 	return hn_tx_swq_depth;
1027 }
1028 
1029 static int
1030 hn_rss_reconfig(struct hn_softc *sc)
1031 {
1032 	int error;
1033 
1034 	HN_LOCK_ASSERT(sc);
1035 
1036 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1037 		return (ENXIO);
1038 
1039 	/*
1040 	 * Disable RSS first.
1041 	 *
1042 	 * NOTE:
1043 	 * Direct reconfiguration by setting the UNCHG flags does
1044 	 * _not_ work properly.
1045 	 */
1046 	if (bootverbose)
1047 		if_printf(sc->hn_ifp, "disable RSS\n");
1048 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1049 	if (error) {
1050 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1051 		return (error);
1052 	}
1053 
1054 	/*
1055 	 * Reenable the RSS w/ the updated RSS key or indirect
1056 	 * table.
1057 	 */
1058 	if (bootverbose)
1059 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1060 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1061 	if (error) {
1062 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1063 		return (error);
1064 	}
1065 	return (0);
1066 }
1067 
1068 static void
1069 hn_rss_ind_fixup(struct hn_softc *sc)
1070 {
1071 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1072 	int i, nchan;
1073 
1074 	nchan = sc->hn_rx_ring_inuse;
1075 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1076 
1077 	/*
1078 	 * Check indirect table to make sure that all channels in it
1079 	 * can be used.
1080 	 */
1081 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1082 		if (rss->rss_ind[i] >= nchan) {
1083 			if_printf(sc->hn_ifp,
1084 			    "RSS indirect table %d fixup: %u -> %d\n",
1085 			    i, rss->rss_ind[i], nchan - 1);
1086 			rss->rss_ind[i] = nchan - 1;
1087 		}
1088 	}
1089 }
1090 
1091 static int
1092 hn_ifmedia_upd(struct ifnet *ifp __unused)
1093 {
1094 
1095 	return EOPNOTSUPP;
1096 }
1097 
1098 static void
1099 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1100 {
1101 	struct hn_softc *sc = ifp->if_softc;
1102 
1103 	ifmr->ifm_status = IFM_AVALID;
1104 	ifmr->ifm_active = IFM_ETHER;
1105 
1106 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1107 		ifmr->ifm_active |= IFM_NONE;
1108 		return;
1109 	}
1110 	ifmr->ifm_status |= IFM_ACTIVE;
1111 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1112 }
1113 
1114 static void
1115 hn_rxvf_set_task(void *xarg, int pending __unused)
1116 {
1117 	struct hn_rxvf_setarg *arg = xarg;
1118 
1119 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1120 }
1121 
1122 static void
1123 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1124 {
1125 	struct hn_rx_ring *rxr;
1126 	struct hn_rxvf_setarg arg;
1127 	struct task task;
1128 	int i;
1129 
1130 	HN_LOCK_ASSERT(sc);
1131 
1132 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1133 
1134 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1135 		rxr = &sc->hn_rx_ring[i];
1136 
1137 		if (i < sc->hn_rx_ring_inuse) {
1138 			arg.rxr = rxr;
1139 			arg.vf_ifp = vf_ifp;
1140 			vmbus_chan_run_task(rxr->hn_chan, &task);
1141 		} else {
1142 			rxr->hn_rxvf_ifp = vf_ifp;
1143 		}
1144 	}
1145 }
1146 
1147 static bool
1148 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1149 {
1150 	const struct ifnet *hn_ifp;
1151 
1152 	hn_ifp = sc->hn_ifp;
1153 
1154 	if (ifp == hn_ifp)
1155 		return (false);
1156 
1157 	if (ifp->if_alloctype != IFT_ETHER)
1158 		return (false);
1159 
1160 	/* Ignore lagg/vlan interfaces */
1161 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1162 	    strcmp(ifp->if_dname, "vlan") == 0)
1163 		return (false);
1164 
1165 	/*
1166 	 * During detach events ifp->if_addr might be NULL.
1167 	 * Make sure the bcmp() below doesn't panic on that:
1168 	 */
1169 	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1170 		return (false);
1171 
1172 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1173 		return (false);
1174 
1175 	return (true);
1176 }
1177 
1178 static void
1179 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1180 {
1181 	struct ifnet *hn_ifp;
1182 
1183 	HN_LOCK(sc);
1184 
1185 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1186 		goto out;
1187 
1188 	if (!hn_ismyvf(sc, ifp))
1189 		goto out;
1190 	hn_ifp = sc->hn_ifp;
1191 
1192 	if (rxvf) {
1193 		if (sc->hn_flags & HN_FLAG_RXVF)
1194 			goto out;
1195 
1196 		sc->hn_flags |= HN_FLAG_RXVF;
1197 		hn_rxfilter_config(sc);
1198 	} else {
1199 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1200 			goto out;
1201 
1202 		sc->hn_flags &= ~HN_FLAG_RXVF;
1203 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1204 			hn_rxfilter_config(sc);
1205 		else
1206 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1207 	}
1208 
1209 	hn_nvs_set_datapath(sc,
1210 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1211 
1212 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1213 
1214 	if (rxvf) {
1215 		hn_vf_rss_fixup(sc, true);
1216 		hn_suspend_mgmt(sc);
1217 		sc->hn_link_flags &=
1218 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1219 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1220 	} else {
1221 		hn_vf_rss_restore(sc);
1222 		hn_resume_mgmt(sc);
1223 	}
1224 
1225 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1226 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1227 
1228 	if (bootverbose) {
1229 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1230 		    rxvf ? "to" : "from", ifp->if_xname);
1231 	}
1232 out:
1233 	HN_UNLOCK(sc);
1234 }
1235 
1236 static void
1237 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1238 {
1239 
1240 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1241 		return;
1242 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1243 }
1244 
1245 static void
1246 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1247 {
1248 
1249 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1250 }
1251 
1252 static int
1253 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1254 {
1255 	struct ifnet *ifp, *vf_ifp;
1256 	uint64_t tmp;
1257 	int error;
1258 
1259 	HN_LOCK_ASSERT(sc);
1260 	ifp = sc->hn_ifp;
1261 	vf_ifp = sc->hn_vf_ifp;
1262 
1263 	/*
1264 	 * Fix up requested capabilities w/ supported capabilities,
1265 	 * since the supported capabilities could have been changed.
1266 	 */
1267 	ifr->ifr_reqcap &= ifp->if_capabilities;
1268 	/* Pass SIOCSIFCAP to VF. */
1269 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1270 
1271 	/*
1272 	 * NOTE:
1273 	 * The error will be propagated to the callers, however, it
1274 	 * is _not_ useful here.
1275 	 */
1276 
1277 	/*
1278 	 * Merge VF's enabled capabilities.
1279 	 */
1280 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1281 
1282 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1283 	if (ifp->if_capenable & IFCAP_TXCSUM)
1284 		ifp->if_hwassist |= tmp;
1285 	else
1286 		ifp->if_hwassist &= ~tmp;
1287 
1288 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1289 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1290 		ifp->if_hwassist |= tmp;
1291 	else
1292 		ifp->if_hwassist &= ~tmp;
1293 
1294 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1295 	if (ifp->if_capenable & IFCAP_TSO4)
1296 		ifp->if_hwassist |= tmp;
1297 	else
1298 		ifp->if_hwassist &= ~tmp;
1299 
1300 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1301 	if (ifp->if_capenable & IFCAP_TSO6)
1302 		ifp->if_hwassist |= tmp;
1303 	else
1304 		ifp->if_hwassist &= ~tmp;
1305 
1306 	return (error);
1307 }
1308 
1309 static int
1310 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1311 {
1312 	struct ifnet *vf_ifp;
1313 	struct ifreq ifr;
1314 
1315 	HN_LOCK_ASSERT(sc);
1316 	vf_ifp = sc->hn_vf_ifp;
1317 
1318 	memset(&ifr, 0, sizeof(ifr));
1319 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1320 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1321 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1322 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1323 }
1324 
1325 static void
1326 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1327 {
1328 	struct ifnet *ifp = sc->hn_ifp;
1329 	int allmulti = 0;
1330 
1331 	HN_LOCK_ASSERT(sc);
1332 
1333 	/* XXX vlan(4) style mcast addr maintenance */
1334 	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1335 		allmulti = IFF_ALLMULTI;
1336 
1337 	/* Always set the VF's if_flags */
1338 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1339 }
1340 
1341 static void
1342 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1343 {
1344 	struct rm_priotracker pt;
1345 	struct ifnet *hn_ifp = NULL;
1346 	struct mbuf *mn;
1347 
1348 	/*
1349 	 * XXX racy, if hn(4) ever detached.
1350 	 */
1351 	rm_rlock(&hn_vfmap_lock, &pt);
1352 	if (vf_ifp->if_index < hn_vfmap_size)
1353 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1354 	rm_runlock(&hn_vfmap_lock, &pt);
1355 
1356 	if (hn_ifp != NULL) {
1357 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1358 			/*
1359 			 * Allow tapping on the VF.
1360 			 */
1361 			ETHER_BPF_MTAP(vf_ifp, mn);
1362 
1363 			/*
1364 			 * Update VF stats.
1365 			 */
1366 			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1367 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1368 				    mn->m_pkthdr.len);
1369 			}
1370 			/*
1371 			 * XXX IFCOUNTER_IMCAST
1372 			 * This stat updating is kinda invasive, since it
1373 			 * requires two checks on the mbuf: the length check
1374 			 * and the ethernet header check.  As of this write,
1375 			 * all multicast packets go directly to hn(4), which
1376 			 * makes imcast stat updating in the VF a try in vian.
1377 			 */
1378 
1379 			/*
1380 			 * Fix up rcvif and increase hn(4)'s ipackets.
1381 			 */
1382 			mn->m_pkthdr.rcvif = hn_ifp;
1383 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1384 		}
1385 		/*
1386 		 * Go through hn(4)'s if_input.
1387 		 */
1388 		hn_ifp->if_input(hn_ifp, m);
1389 	} else {
1390 		/*
1391 		 * In the middle of the transition; free this
1392 		 * mbuf chain.
1393 		 */
1394 		while (m != NULL) {
1395 			mn = m->m_nextpkt;
1396 			m->m_nextpkt = NULL;
1397 			m_freem(m);
1398 			m = mn;
1399 		}
1400 	}
1401 }
1402 
1403 static void
1404 hn_mtu_change_fixup(struct hn_softc *sc)
1405 {
1406 	struct ifnet *ifp;
1407 
1408 	HN_LOCK_ASSERT(sc);
1409 	ifp = sc->hn_ifp;
1410 
1411 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1412 #if __FreeBSD_version >= 1100099
1413 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1414 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1415 #endif
1416 }
1417 
1418 static uint32_t
1419 hn_rss_type_fromndis(uint32_t rss_hash)
1420 {
1421 	uint32_t types = 0;
1422 
1423 	if (rss_hash & NDIS_HASH_IPV4)
1424 		types |= RSS_TYPE_IPV4;
1425 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1426 		types |= RSS_TYPE_TCP_IPV4;
1427 	if (rss_hash & NDIS_HASH_IPV6)
1428 		types |= RSS_TYPE_IPV6;
1429 	if (rss_hash & NDIS_HASH_IPV6_EX)
1430 		types |= RSS_TYPE_IPV6_EX;
1431 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1432 		types |= RSS_TYPE_TCP_IPV6;
1433 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1434 		types |= RSS_TYPE_TCP_IPV6_EX;
1435 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1436 		types |= RSS_TYPE_UDP_IPV4;
1437 	return (types);
1438 }
1439 
1440 static uint32_t
1441 hn_rss_type_tondis(uint32_t types)
1442 {
1443 	uint32_t rss_hash = 0;
1444 
1445 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1446 	    ("UDP6 and UDP6EX are not supported"));
1447 
1448 	if (types & RSS_TYPE_IPV4)
1449 		rss_hash |= NDIS_HASH_IPV4;
1450 	if (types & RSS_TYPE_TCP_IPV4)
1451 		rss_hash |= NDIS_HASH_TCP_IPV4;
1452 	if (types & RSS_TYPE_IPV6)
1453 		rss_hash |= NDIS_HASH_IPV6;
1454 	if (types & RSS_TYPE_IPV6_EX)
1455 		rss_hash |= NDIS_HASH_IPV6_EX;
1456 	if (types & RSS_TYPE_TCP_IPV6)
1457 		rss_hash |= NDIS_HASH_TCP_IPV6;
1458 	if (types & RSS_TYPE_TCP_IPV6_EX)
1459 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1460 	if (types & RSS_TYPE_UDP_IPV4)
1461 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1462 	return (rss_hash);
1463 }
1464 
1465 static void
1466 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1467 {
1468 	int i;
1469 
1470 	HN_LOCK_ASSERT(sc);
1471 
1472 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1473 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1474 }
1475 
1476 static void
1477 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1478 {
1479 	struct ifnet *ifp, *vf_ifp;
1480 	struct ifrsshash ifrh;
1481 	struct ifrsskey ifrk;
1482 	int error;
1483 	uint32_t my_types, diff_types, mbuf_types = 0;
1484 
1485 	HN_LOCK_ASSERT(sc);
1486 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1487 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1488 
1489 	if (sc->hn_rx_ring_inuse == 1) {
1490 		/* No RSS on synthetic parts; done. */
1491 		return;
1492 	}
1493 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1494 		/* Synthetic parts do not support Toeplitz; done. */
1495 		return;
1496 	}
1497 
1498 	ifp = sc->hn_ifp;
1499 	vf_ifp = sc->hn_vf_ifp;
1500 
1501 	/*
1502 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1503 	 * supported.
1504 	 */
1505 	memset(&ifrk, 0, sizeof(ifrk));
1506 	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1507 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1508 	if (error) {
1509 		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
1510 		    vf_ifp->if_xname, error);
1511 		goto done;
1512 	}
1513 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1514 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1515 		    vf_ifp->if_xname, ifrk.ifrk_func);
1516 		goto done;
1517 	}
1518 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1519 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1520 		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1521 		goto done;
1522 	}
1523 
1524 	/*
1525 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1526 	 */
1527 	memset(&ifrh, 0, sizeof(ifrh));
1528 	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1529 	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1530 	if (error) {
1531 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1532 		    vf_ifp->if_xname, error);
1533 		goto done;
1534 	}
1535 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1536 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1537 		    vf_ifp->if_xname, ifrh.ifrh_func);
1538 		goto done;
1539 	}
1540 
1541 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1542 	if ((ifrh.ifrh_types & my_types) == 0) {
1543 		/* This disables RSS; ignore it then */
1544 		if_printf(ifp, "%s intersection of RSS types failed.  "
1545 		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1546 		    ifrh.ifrh_types, my_types);
1547 		goto done;
1548 	}
1549 
1550 	diff_types = my_types ^ ifrh.ifrh_types;
1551 	my_types &= ifrh.ifrh_types;
1552 	mbuf_types = my_types;
1553 
1554 	/*
1555 	 * Detect RSS hash value/type confliction.
1556 	 *
1557 	 * NOTE:
1558 	 * We don't disable the hash type, but stop delivery the hash
1559 	 * value/type through mbufs on RX path.
1560 	 *
1561 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1562 	 * hash is delivered with type of TCP_IPV4.  This means if
1563 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1564 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1565 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1566 	 * here.
1567 	 */
1568 	if ((my_types & RSS_TYPE_IPV4) &&
1569 	    (diff_types & ifrh.ifrh_types &
1570 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1571 		/* Conflict; disable IPV4 hash type/value delivery. */
1572 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1573 		mbuf_types &= ~RSS_TYPE_IPV4;
1574 	}
1575 	if ((my_types & RSS_TYPE_IPV6) &&
1576 	    (diff_types & ifrh.ifrh_types &
1577 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1578 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1579 	      RSS_TYPE_IPV6_EX))) {
1580 		/* Conflict; disable IPV6 hash type/value delivery. */
1581 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1582 		mbuf_types &= ~RSS_TYPE_IPV6;
1583 	}
1584 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1585 	    (diff_types & ifrh.ifrh_types &
1586 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1587 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1588 	      RSS_TYPE_IPV6))) {
1589 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1590 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1591 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1592 	}
1593 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1594 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1595 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1596 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1597 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1598 	}
1599 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1600 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1601 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1602 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1603 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1604 	}
1605 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1606 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1607 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1608 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1609 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1610 	}
1611 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1612 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1613 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1614 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1615 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1616 	}
1617 
1618 	/*
1619 	 * Indirect table does not matter.
1620 	 */
1621 
1622 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1623 	    hn_rss_type_tondis(my_types);
1624 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1625 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1626 
1627 	if (reconf) {
1628 		error = hn_rss_reconfig(sc);
1629 		if (error) {
1630 			/* XXX roll-back? */
1631 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1632 			/* XXX keep going. */
1633 		}
1634 	}
1635 done:
1636 	/* Hash deliverability for mbufs. */
1637 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1638 }
1639 
1640 static void
1641 hn_vf_rss_restore(struct hn_softc *sc)
1642 {
1643 
1644 	HN_LOCK_ASSERT(sc);
1645 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1646 	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1647 
1648 	if (sc->hn_rx_ring_inuse == 1)
1649 		goto done;
1650 
1651 	/*
1652 	 * Restore hash types.  Key does _not_ matter.
1653 	 */
1654 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1655 		int error;
1656 
1657 		sc->hn_rss_hash = sc->hn_rss_hcap;
1658 		error = hn_rss_reconfig(sc);
1659 		if (error) {
1660 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1661 			    error);
1662 			/* XXX keep going. */
1663 		}
1664 	}
1665 done:
1666 	/* Hash deliverability for mbufs. */
1667 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1668 }
1669 
1670 static void
1671 hn_xpnt_vf_setready(struct hn_softc *sc)
1672 {
1673 	struct ifnet *ifp, *vf_ifp;
1674 	struct ifreq ifr;
1675 
1676 	HN_LOCK_ASSERT(sc);
1677 	ifp = sc->hn_ifp;
1678 	vf_ifp = sc->hn_vf_ifp;
1679 
1680 	/*
1681 	 * Mark the VF ready.
1682 	 */
1683 	sc->hn_vf_rdytick = 0;
1684 
1685 	/*
1686 	 * Save information for restoration.
1687 	 */
1688 	sc->hn_saved_caps = ifp->if_capabilities;
1689 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1690 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1691 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1692 
1693 	/*
1694 	 * Intersect supported/enabled capabilities.
1695 	 *
1696 	 * NOTE:
1697 	 * if_hwassist is not changed here.
1698 	 */
1699 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1700 	ifp->if_capenable &= ifp->if_capabilities;
1701 
1702 	/*
1703 	 * Fix TSO settings.
1704 	 */
1705 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1706 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1707 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1708 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1709 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1710 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1711 
1712 	/*
1713 	 * Change VF's enabled capabilities.
1714 	 */
1715 	memset(&ifr, 0, sizeof(ifr));
1716 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1717 	ifr.ifr_reqcap = ifp->if_capenable;
1718 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1719 
1720 	if (ifp->if_mtu != ETHERMTU) {
1721 		int error;
1722 
1723 		/*
1724 		 * Change VF's MTU.
1725 		 */
1726 		memset(&ifr, 0, sizeof(ifr));
1727 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1728 		ifr.ifr_mtu = ifp->if_mtu;
1729 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1730 		if (error) {
1731 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1732 			    vf_ifp->if_xname, ifp->if_mtu);
1733 			if (ifp->if_mtu > ETHERMTU) {
1734 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1735 
1736 				/*
1737 				 * XXX
1738 				 * No need to adjust the synthetic parts' MTU;
1739 				 * failure of the adjustment will cause us
1740 				 * infinite headache.
1741 				 */
1742 				ifp->if_mtu = ETHERMTU;
1743 				hn_mtu_change_fixup(sc);
1744 			}
1745 		}
1746 	}
1747 }
1748 
1749 static bool
1750 hn_xpnt_vf_isready(struct hn_softc *sc)
1751 {
1752 
1753 	HN_LOCK_ASSERT(sc);
1754 
1755 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1756 		return (false);
1757 
1758 	if (sc->hn_vf_rdytick == 0)
1759 		return (true);
1760 
1761 	if (sc->hn_vf_rdytick > ticks)
1762 		return (false);
1763 
1764 	/* Mark VF as ready. */
1765 	hn_xpnt_vf_setready(sc);
1766 	return (true);
1767 }
1768 
1769 static void
1770 hn_xpnt_vf_setenable(struct hn_softc *sc)
1771 {
1772 	int i;
1773 
1774 	HN_LOCK_ASSERT(sc);
1775 
1776 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1777 	rm_wlock(&sc->hn_vf_lock);
1778 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1779 	rm_wunlock(&sc->hn_vf_lock);
1780 
1781 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1782 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1783 }
1784 
1785 static void
1786 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1787 {
1788 	int i;
1789 
1790 	HN_LOCK_ASSERT(sc);
1791 
1792 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1793 	rm_wlock(&sc->hn_vf_lock);
1794 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1795 	if (clear_vf)
1796 		sc->hn_vf_ifp = NULL;
1797 	rm_wunlock(&sc->hn_vf_lock);
1798 
1799 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1800 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1801 }
1802 
1803 static void
1804 hn_xpnt_vf_init(struct hn_softc *sc)
1805 {
1806 	int error;
1807 
1808 	HN_LOCK_ASSERT(sc);
1809 
1810 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1811 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1812 
1813 	if (bootverbose) {
1814 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1815 		    sc->hn_vf_ifp->if_xname);
1816 	}
1817 
1818 	/*
1819 	 * Bring the VF up.
1820 	 */
1821 	hn_xpnt_vf_saveifflags(sc);
1822 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1823 	error = hn_xpnt_vf_iocsetflags(sc);
1824 	if (error) {
1825 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1826 		    sc->hn_vf_ifp->if_xname, error);
1827 		return;
1828 	}
1829 
1830 	/*
1831 	 * NOTE:
1832 	 * Datapath setting must happen _after_ bringing the VF up.
1833 	 */
1834 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1835 
1836 	/*
1837 	 * NOTE:
1838 	 * Fixup RSS related bits _after_ the VF is brought up, since
1839 	 * many VFs generate RSS key during it's initialization.
1840 	 */
1841 	hn_vf_rss_fixup(sc, true);
1842 
1843 	/* Mark transparent mode VF as enabled. */
1844 	hn_xpnt_vf_setenable(sc);
1845 }
1846 
1847 static void
1848 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1849 {
1850 	struct hn_softc *sc = xsc;
1851 
1852 	HN_LOCK(sc);
1853 
1854 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1855 		goto done;
1856 	if (sc->hn_vf_ifp == NULL)
1857 		goto done;
1858 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1859 		goto done;
1860 
1861 	if (sc->hn_vf_rdytick != 0) {
1862 		/* Mark VF as ready. */
1863 		hn_xpnt_vf_setready(sc);
1864 	}
1865 
1866 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1867 		/*
1868 		 * Delayed VF initialization.
1869 		 */
1870 		if (bootverbose) {
1871 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1872 			    sc->hn_vf_ifp->if_xname);
1873 		}
1874 		hn_xpnt_vf_init(sc);
1875 	}
1876 done:
1877 	HN_UNLOCK(sc);
1878 }
1879 
/*
 * ifnet attach event handler ('xsc' is the softc).
 *
 * If the newly attached 'ifp' is this hn(4)'s VF and no VF is recorded
 * yet, record it: map the VF's if_index to hn(4)'s ifnet in hn_vfmap
 * (growing the map if needed) and store it in sc->hn_vf_ifp.  In
 * transparent VF mode, additionally hook the VF's if_input, suspend
 * hn(4)'s link status management and schedule the delayed VF
 * initialization task.
 */
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	/* Only one VF is tracked; keep the one already recorded. */
	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	/*
	 * Grow the map so that the VF's if_index fits, leaving
	 * HN_VFMAP_SIZE_DEF entries of headroom; existing entries are
	 * copied over and the new ones start out zeroed.
	 */
	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give VF sometime to complete its attach routine.
		 * hn_vf_rdytick records the tick at which the wait ends,
		 * and the init task is scheduled with the same delay.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
1966 
/*
 * ifnet detach event handler ('xsc' is the softc).
 *
 * If the departing 'ifp' is this hn(4)'s recorded VF, undo what
 * hn_ifnet_attevent() and the VF initialization set up.  In transparent
 * VF mode: wait for the delayed init task, restore the VF's if_input,
 * switch the datapath back to the synthetic parts if the VF was enabled,
 * restore the saved capabilities/TSO limits if the VF had been marked
 * ready, and restore RSS and link status management.  In all modes:
 * mark the VF as disabled, forget it, and clear its hn_vfmap entry.
 */
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		/* Give the VF its original if_input back. */
		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		/* Switch back to the synthetic datapath if the VF was in use. */
		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	/* Drop the if_index -> hn(4) mapping, if there is one. */
	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
2057 
2058 static void
2059 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2060 {
2061 	struct hn_softc *sc = xsc;
2062 
2063 	if (sc->hn_vf_ifp == ifp)
2064 		if_link_state_change(sc->hn_ifp, link_state);
2065 }
2066 
2067 static int
2068 hn_probe(device_t dev)
2069 {
2070 
2071 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2072 		device_set_desc(dev, "Hyper-V Network Interface");
2073 		return BUS_PROBE_DEFAULT;
2074 	}
2075 	return ENXIO;
2076 }
2077 
2078 static int
2079 hn_attach(device_t dev)
2080 {
2081 	struct hn_softc *sc = device_get_softc(dev);
2082 	struct sysctl_oid_list *child;
2083 	struct sysctl_ctx_list *ctx;
2084 	uint8_t eaddr[ETHER_ADDR_LEN];
2085 	struct ifnet *ifp = NULL;
2086 	int error, ring_cnt, tx_ring_cnt;
2087 	uint32_t mtu;
2088 
2089 	sc->hn_dev = dev;
2090 	sc->hn_prichan = vmbus_get_channel(dev);
2091 	HN_LOCK_INIT(sc);
2092 	rm_init(&sc->hn_vf_lock, "hnvf");
2093 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2094 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2095 
2096 	/*
2097 	 * Initialize these tunables once.
2098 	 */
2099 	sc->hn_agg_size = hn_tx_agg_size;
2100 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2101 
2102 	/*
2103 	 * Setup taskqueue for transmission.
2104 	 */
2105 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2106 		int i;
2107 
2108 		sc->hn_tx_taskqs =
2109 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2110 		    M_DEVBUF, M_WAITOK);
2111 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2112 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2113 			    M_WAITOK, taskqueue_thread_enqueue,
2114 			    &sc->hn_tx_taskqs[i]);
2115 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2116 			    "%s tx%d", device_get_nameunit(dev), i);
2117 		}
2118 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2119 		sc->hn_tx_taskqs = hn_tx_taskque;
2120 	}
2121 
2122 	/*
2123 	 * Setup taskqueue for mangement tasks, e.g. link status.
2124 	 */
2125 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2126 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2127 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2128 	    device_get_nameunit(dev));
2129 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2130 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2131 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2132 	    hn_netchg_status_taskfunc, sc);
2133 
2134 	if (hn_xpnt_vf) {
2135 		/*
2136 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2137 		 */
2138 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2139 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2140 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2141 		    device_get_nameunit(dev));
2142 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2143 		    hn_xpnt_vf_init_taskfunc, sc);
2144 	}
2145 
2146 	/*
2147 	 * Allocate ifnet and setup its name earlier, so that if_printf
2148 	 * can be used by functions, which will be called after
2149 	 * ether_ifattach().
2150 	 */
2151 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2152 	ifp->if_softc = sc;
2153 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2154 
2155 	/*
2156 	 * Initialize ifmedia earlier so that it can be unconditionally
2157 	 * destroyed, if error happened later on.
2158 	 */
2159 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2160 
2161 	/*
2162 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2163 	 * to use (tx_ring_cnt).
2164 	 *
2165 	 * NOTE:
2166 	 * The # of RX rings to use is same as the # of channels to use.
2167 	 */
2168 	ring_cnt = hn_chan_cnt;
2169 	if (ring_cnt <= 0) {
2170 		/* Default */
2171 		ring_cnt = mp_ncpus;
2172 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2173 			ring_cnt = HN_RING_CNT_DEF_MAX;
2174 	} else if (ring_cnt > mp_ncpus) {
2175 		ring_cnt = mp_ncpus;
2176 	}
2177 #ifdef RSS
2178 	if (ring_cnt > rss_getnumbuckets())
2179 		ring_cnt = rss_getnumbuckets();
2180 #endif
2181 
2182 	tx_ring_cnt = hn_tx_ring_cnt;
2183 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2184 		tx_ring_cnt = ring_cnt;
2185 #ifdef HN_IFSTART_SUPPORT
2186 	if (hn_use_if_start) {
2187 		/* ifnet.if_start only needs one TX ring. */
2188 		tx_ring_cnt = 1;
2189 	}
2190 #endif
2191 
2192 	/*
2193 	 * Set the leader CPU for channels.
2194 	 */
2195 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2196 
2197 	/*
2198 	 * Create enough TX/RX rings, even if only limited number of
2199 	 * channels can be allocated.
2200 	 */
2201 	error = hn_create_tx_data(sc, tx_ring_cnt);
2202 	if (error)
2203 		goto failed;
2204 	error = hn_create_rx_data(sc, ring_cnt);
2205 	if (error)
2206 		goto failed;
2207 
2208 	/*
2209 	 * Create transaction context for NVS and RNDIS transactions.
2210 	 */
2211 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2212 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2213 	if (sc->hn_xact == NULL) {
2214 		error = ENXIO;
2215 		goto failed;
2216 	}
2217 
2218 	/*
2219 	 * Install orphan handler for the revocation of this device's
2220 	 * primary channel.
2221 	 *
2222 	 * NOTE:
2223 	 * The processing order is critical here:
2224 	 * Install the orphan handler, _before_ testing whether this
2225 	 * device's primary channel has been revoked or not.
2226 	 */
2227 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2228 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2229 		error = ENXIO;
2230 		goto failed;
2231 	}
2232 
2233 	/*
2234 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2235 	 */
2236 	error = hn_synth_attach(sc, ETHERMTU);
2237 	if (error)
2238 		goto failed;
2239 
2240 	error = hn_rndis_get_eaddr(sc, eaddr);
2241 	if (error)
2242 		goto failed;
2243 
2244 	error = hn_rndis_get_mtu(sc, &mtu);
2245 	if (error)
2246 		mtu = ETHERMTU;
2247 	else if (bootverbose)
2248 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2249 
2250 #if __FreeBSD_version >= 1100099
2251 	if (sc->hn_rx_ring_inuse > 1) {
2252 		/*
2253 		 * Reduce TCP segment aggregation limit for multiple
2254 		 * RX rings to increase ACK timeliness.
2255 		 */
2256 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2257 	}
2258 #endif
2259 
2260 	/*
2261 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2262 	 */
2263 	hn_fixup_tx_data(sc);
2264 	hn_fixup_rx_data(sc);
2265 
2266 	ctx = device_get_sysctl_ctx(dev);
2267 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2268 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2269 	    &sc->hn_nvs_ver, 0, "NVS version");
2270 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2271 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2272 	    hn_ndis_version_sysctl, "A", "NDIS version");
2273 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2274 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2275 	    hn_caps_sysctl, "A", "capabilities");
2276 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2277 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2278 	    hn_hwassist_sysctl, "A", "hwassist");
2279 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2280 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2281 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2282 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2283 	    "max # of TSO segments");
2284 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2285 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2286 	    "max size of TSO segment");
2287 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2288 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2289 	    hn_rxfilter_sysctl, "A", "rxfilter");
2290 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2291 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2292 	    hn_rss_hash_sysctl, "A", "RSS hash");
2293 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2294 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2295 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2296 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2297 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2298 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2299 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2300 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2301 #ifndef RSS
2302 	/*
2303 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2304 	 */
2305 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2306 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2307 	    hn_rss_key_sysctl, "IU", "RSS key");
2308 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2309 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2310 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2311 #endif
2312 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2313 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2314 	    "RNDIS offered packet transmission aggregation size limit");
2315 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2316 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2317 	    "RNDIS offered packet transmission aggregation count limit");
2318 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2319 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2320 	    "RNDIS packet transmission aggregation alignment");
2321 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2322 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2323 	    hn_txagg_size_sysctl, "I",
2324 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2325 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2326 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 	    hn_txagg_pkts_sysctl, "I",
2328 	    "Packet transmission aggregation packets, "
2329 	    "0 -- disable, -1 -- auto");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2331 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_polling_sysctl, "I",
2333 	    "Polling frequency: [100,1000000], 0 disable polling");
2334 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2335 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2336 	    hn_vf_sysctl, "A", "Virtual Function's name");
2337 	if (!hn_xpnt_vf) {
2338 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2339 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2340 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2341 	} else {
2342 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2343 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2344 		    hn_xpnt_vf_enabled_sysctl, "I",
2345 		    "Transparent VF enabled");
2346 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2347 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2348 		    hn_xpnt_vf_accbpf_sysctl, "I",
2349 		    "Accurate BPF for transparent VF");
2350 	}
2351 
2352 	/*
2353 	 * Setup the ifmedia, which has been initialized earlier.
2354 	 */
2355 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2356 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2357 	/* XXX ifmedia_set really should do this for us */
2358 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2359 
2360 	/*
2361 	 * Setup the ifnet for this interface.
2362 	 */
2363 
2364 	ifp->if_baudrate = IF_Gbps(10);
2365 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2366 	ifp->if_ioctl = hn_ioctl;
2367 	ifp->if_init = hn_init;
2368 #ifdef HN_IFSTART_SUPPORT
2369 	if (hn_use_if_start) {
2370 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2371 
2372 		ifp->if_start = hn_start;
2373 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2374 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2375 		IFQ_SET_READY(&ifp->if_snd);
2376 	} else
2377 #endif
2378 	{
2379 		ifp->if_transmit = hn_transmit;
2380 		ifp->if_qflush = hn_xmit_qflush;
2381 	}
2382 
2383 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2384 #ifdef foo
2385 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2386 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2387 #endif
2388 	if (sc->hn_caps & HN_CAP_VLAN) {
2389 		/* XXX not sure about VLAN_MTU. */
2390 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2391 	}
2392 
2393 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2394 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2395 		ifp->if_capabilities |= IFCAP_TXCSUM;
2396 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2397 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2398 	if (sc->hn_caps & HN_CAP_TSO4) {
2399 		ifp->if_capabilities |= IFCAP_TSO4;
2400 		ifp->if_hwassist |= CSUM_IP_TSO;
2401 	}
2402 	if (sc->hn_caps & HN_CAP_TSO6) {
2403 		ifp->if_capabilities |= IFCAP_TSO6;
2404 		ifp->if_hwassist |= CSUM_IP6_TSO;
2405 	}
2406 
2407 	/* Enable all available capabilities by default. */
2408 	ifp->if_capenable = ifp->if_capabilities;
2409 
2410 	/*
2411 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2412 	 * be enabled through SIOCSIFCAP.
2413 	 */
2414 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2415 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2416 
2417 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2418 		/*
2419 		 * Lock hn_set_tso_maxsize() to simplify its
2420 		 * internal logic.
2421 		 */
2422 		HN_LOCK(sc);
2423 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2424 		HN_UNLOCK(sc);
2425 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2426 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2427 	}
2428 
2429 	ether_ifattach(ifp, eaddr);
2430 
2431 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2432 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2433 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2434 	}
2435 	if (mtu < ETHERMTU) {
2436 		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2437 		ifp->if_mtu = mtu;
2438 	}
2439 
2440 	/* Inform the upper layer about the long frame support. */
2441 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2442 
2443 	/*
2444 	 * Kick off link status check.
2445 	 */
2446 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2447 	hn_update_link_status(sc);
2448 
2449 	if (!hn_xpnt_vf) {
2450 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2451 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2452 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2453 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2454 	} else {
2455 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2456 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2457 	}
2458 
2459 	/*
2460 	 * NOTE:
2461 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2462 	 * since interface's LLADDR is needed; interface LLADDR is not
2463 	 * available when ifnet_arrival event is triggered.
2464 	 */
2465 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2466 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2467 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2468 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2469 
2470 	return (0);
2471 failed:
2472 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2473 		hn_synth_detach(sc);
2474 	hn_detach(dev);
2475 	return (error);
2476 }
2477 
2478 static int
2479 hn_detach(device_t dev)
2480 {
2481 	struct hn_softc *sc = device_get_softc(dev);
2482 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2483 
2484 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2485 		/*
2486 		 * In case that the vmbus missed the orphan handler
2487 		 * installation.
2488 		 */
2489 		vmbus_xact_ctx_orphan(sc->hn_xact);
2490 	}
2491 
2492 	if (sc->hn_ifaddr_evthand != NULL)
2493 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2494 	if (sc->hn_ifnet_evthand != NULL)
2495 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2496 	if (sc->hn_ifnet_atthand != NULL) {
2497 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2498 		    sc->hn_ifnet_atthand);
2499 	}
2500 	if (sc->hn_ifnet_dethand != NULL) {
2501 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2502 		    sc->hn_ifnet_dethand);
2503 	}
2504 	if (sc->hn_ifnet_lnkhand != NULL)
2505 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2506 
2507 	vf_ifp = sc->hn_vf_ifp;
2508 	__compiler_membar();
2509 	if (vf_ifp != NULL)
2510 		hn_ifnet_detevent(sc, vf_ifp);
2511 
2512 	if (device_is_attached(dev)) {
2513 		HN_LOCK(sc);
2514 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2515 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2516 				hn_stop(sc, true);
2517 			/*
2518 			 * NOTE:
2519 			 * hn_stop() only suspends data, so managment
2520 			 * stuffs have to be suspended manually here.
2521 			 */
2522 			hn_suspend_mgmt(sc);
2523 			hn_synth_detach(sc);
2524 		}
2525 		HN_UNLOCK(sc);
2526 		ether_ifdetach(ifp);
2527 	}
2528 
2529 	ifmedia_removeall(&sc->hn_media);
2530 	hn_destroy_rx_data(sc);
2531 	hn_destroy_tx_data(sc);
2532 
2533 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2534 		int i;
2535 
2536 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2537 			taskqueue_free(sc->hn_tx_taskqs[i]);
2538 		free(sc->hn_tx_taskqs, M_DEVBUF);
2539 	}
2540 	taskqueue_free(sc->hn_mgmt_taskq0);
2541 	if (sc->hn_vf_taskq != NULL)
2542 		taskqueue_free(sc->hn_vf_taskq);
2543 
2544 	if (sc->hn_xact != NULL) {
2545 		/*
2546 		 * Uninstall the orphan handler _before_ the xact is
2547 		 * destructed.
2548 		 */
2549 		vmbus_chan_unset_orphan(sc->hn_prichan);
2550 		vmbus_xact_ctx_destroy(sc->hn_xact);
2551 	}
2552 
2553 	if_free(ifp);
2554 
2555 	HN_LOCK_DESTROY(sc);
2556 	rm_destroy(&sc->hn_vf_lock);
2557 	return (0);
2558 }
2559 
2560 static int
2561 hn_shutdown(device_t dev)
2562 {
2563 
2564 	return (0);
2565 }
2566 
2567 static void
2568 hn_link_status(struct hn_softc *sc)
2569 {
2570 	uint32_t link_status;
2571 	int error;
2572 
2573 	error = hn_rndis_get_linkstatus(sc, &link_status);
2574 	if (error) {
2575 		/* XXX what to do? */
2576 		return;
2577 	}
2578 
2579 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2580 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2581 	else
2582 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2583 	if_link_state_change(sc->hn_ifp,
2584 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2585 	    LINK_STATE_UP : LINK_STATE_DOWN);
2586 }
2587 
2588 static void
2589 hn_link_taskfunc(void *xsc, int pending __unused)
2590 {
2591 	struct hn_softc *sc = xsc;
2592 
2593 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2594 		return;
2595 	hn_link_status(sc);
2596 }
2597 
2598 static void
2599 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2600 {
2601 	struct hn_softc *sc = xsc;
2602 
2603 	/* Prevent any link status checks from running. */
2604 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2605 
2606 	/*
2607 	 * Fake up a [link down --> link up] state change; 5 seconds
2608 	 * delay is used, which closely simulates miibus reaction
2609 	 * upon link down event.
2610 	 */
2611 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2612 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2613 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2614 	    &sc->hn_netchg_status, 5 * hz);
2615 }
2616 
2617 static void
2618 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2619 {
2620 	struct hn_softc *sc = xsc;
2621 
2622 	/* Re-allow link status checks. */
2623 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2624 	hn_link_status(sc);
2625 }
2626 
2627 static void
2628 hn_update_link_status(struct hn_softc *sc)
2629 {
2630 
2631 	if (sc->hn_mgmt_taskq != NULL)
2632 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2633 }
2634 
2635 static void
2636 hn_change_network(struct hn_softc *sc)
2637 {
2638 
2639 	if (sc->hn_mgmt_taskq != NULL)
2640 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2641 }
2642 
2643 static __inline int
2644 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2645     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2646 {
2647 	struct mbuf *m = *m_head;
2648 	int error;
2649 
2650 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2651 
2652 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2653 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2654 	if (error == EFBIG) {
2655 		struct mbuf *m_new;
2656 
2657 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2658 		if (m_new == NULL)
2659 			return ENOBUFS;
2660 		else
2661 			*m_head = m = m_new;
2662 		txr->hn_tx_collapsed++;
2663 
2664 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2665 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2666 	}
2667 	if (!error) {
2668 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2669 		    BUS_DMASYNC_PREWRITE);
2670 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2671 	}
2672 	return error;
2673 }
2674 
2675 static __inline int
2676 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2677 {
2678 
2679 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2680 	    ("put an onlist txd %#x", txd->flags));
2681 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2682 	    ("put an onagg txd %#x", txd->flags));
2683 
2684 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2685 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2686 		return 0;
2687 
2688 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2689 		struct hn_txdesc *tmp_txd;
2690 
2691 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2692 			int freed;
2693 
2694 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2695 			    ("resursive aggregation on aggregated txdesc"));
2696 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2697 			    ("not aggregated txdesc"));
2698 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2699 			    ("aggregated txdesc uses dmamap"));
2700 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2701 			    ("aggregated txdesc consumes "
2702 			     "chimney sending buffer"));
2703 			KASSERT(tmp_txd->chim_size == 0,
2704 			    ("aggregated txdesc has non-zero "
2705 			     "chimney sending size"));
2706 
2707 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2708 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2709 			freed = hn_txdesc_put(txr, tmp_txd);
2710 			KASSERT(freed, ("failed to free aggregated txdesc"));
2711 		}
2712 	}
2713 
2714 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2715 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2716 		    ("chim txd uses dmamap"));
2717 		hn_chim_free(txr->hn_sc, txd->chim_index);
2718 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2719 		txd->chim_size = 0;
2720 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2721 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2722 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2723 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2724 		    txd->data_dmap);
2725 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2726 	}
2727 
2728 	if (txd->m != NULL) {
2729 		m_freem(txd->m);
2730 		txd->m = NULL;
2731 	}
2732 
2733 	txd->flags |= HN_TXD_FLAG_ONLIST;
2734 #ifndef HN_USE_TXDESC_BUFRING
2735 	mtx_lock_spin(&txr->hn_txlist_spin);
2736 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2737 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2738 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2739 	txr->hn_txdesc_avail++;
2740 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2741 	mtx_unlock_spin(&txr->hn_txlist_spin);
2742 #else	/* HN_USE_TXDESC_BUFRING */
2743 #ifdef HN_DEBUG
2744 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2745 #endif
2746 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2747 #endif	/* !HN_USE_TXDESC_BUFRING */
2748 
2749 	return 1;
2750 }
2751 
2752 static __inline struct hn_txdesc *
2753 hn_txdesc_get(struct hn_tx_ring *txr)
2754 {
2755 	struct hn_txdesc *txd;
2756 
2757 #ifndef HN_USE_TXDESC_BUFRING
2758 	mtx_lock_spin(&txr->hn_txlist_spin);
2759 	txd = SLIST_FIRST(&txr->hn_txlist);
2760 	if (txd != NULL) {
2761 		KASSERT(txr->hn_txdesc_avail > 0,
2762 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2763 		txr->hn_txdesc_avail--;
2764 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2765 	}
2766 	mtx_unlock_spin(&txr->hn_txlist_spin);
2767 #else
2768 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2769 #endif
2770 
2771 	if (txd != NULL) {
2772 #ifdef HN_USE_TXDESC_BUFRING
2773 #ifdef HN_DEBUG
2774 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2775 #endif
2776 #endif	/* HN_USE_TXDESC_BUFRING */
2777 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2778 		    STAILQ_EMPTY(&txd->agg_list) &&
2779 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2780 		    txd->chim_size == 0 &&
2781 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2782 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2783 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2784 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2785 		txd->refs = 1;
2786 	}
2787 	return txd;
2788 }
2789 
2790 static __inline void
2791 hn_txdesc_hold(struct hn_txdesc *txd)
2792 {
2793 
2794 	/* 0->1 transition will never work */
2795 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2796 	atomic_add_int(&txd->refs, 1);
2797 }
2798 
2799 static __inline void
2800 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2801 {
2802 
2803 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2804 	    ("recursive aggregation on aggregating txdesc"));
2805 
2806 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2807 	    ("already aggregated"));
2808 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2809 	    ("recursive aggregation on to-be-aggregated txdesc"));
2810 
2811 	txd->flags |= HN_TXD_FLAG_ONAGG;
2812 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2813 }
2814 
2815 static bool
2816 hn_tx_ring_pending(struct hn_tx_ring *txr)
2817 {
2818 	bool pending = false;
2819 
2820 #ifndef HN_USE_TXDESC_BUFRING
2821 	mtx_lock_spin(&txr->hn_txlist_spin);
2822 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2823 		pending = true;
2824 	mtx_unlock_spin(&txr->hn_txlist_spin);
2825 #else
2826 	if (!buf_ring_full(txr->hn_txdesc_br))
2827 		pending = true;
2828 #endif
2829 	return (pending);
2830 }
2831 
2832 static __inline void
2833 hn_txeof(struct hn_tx_ring *txr)
2834 {
2835 	txr->hn_has_txeof = 0;
2836 	txr->hn_txeof(txr);
2837 }
2838 
2839 static void
2840 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2841     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2842 {
2843 	struct hn_txdesc *txd = sndc->hn_cbarg;
2844 	struct hn_tx_ring *txr;
2845 
2846 	txr = txd->txr;
2847 	KASSERT(txr->hn_chan == chan,
2848 	    ("channel mismatch, on chan%u, should be chan%u",
2849 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2850 
2851 	txr->hn_has_txeof = 1;
2852 	hn_txdesc_put(txr, txd);
2853 
2854 	++txr->hn_txdone_cnt;
2855 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2856 		txr->hn_txdone_cnt = 0;
2857 		if (txr->hn_oactive)
2858 			hn_txeof(txr);
2859 	}
2860 }
2861 
2862 static void
2863 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2864 {
2865 #if defined(INET) || defined(INET6)
2866 	tcp_lro_flush_all(&rxr->hn_lro);
2867 #endif
2868 
2869 	/*
2870 	 * NOTE:
2871 	 * 'txr' could be NULL, if multiple channels and
2872 	 * ifnet.if_start method are enabled.
2873 	 */
2874 	if (txr == NULL || !txr->hn_has_txeof)
2875 		return;
2876 
2877 	txr->hn_txdone_cnt = 0;
2878 	hn_txeof(txr);
2879 }
2880 
2881 static __inline uint32_t
2882 hn_rndis_pktmsg_offset(uint32_t ofs)
2883 {
2884 
2885 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2886 	    ("invalid RNDIS packet msg offset %u", ofs));
2887 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2888 }
2889 
2890 static __inline void *
2891 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2892     size_t pi_dlen, uint32_t pi_type)
2893 {
2894 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2895 	struct rndis_pktinfo *pi;
2896 
2897 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2898 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2899 
2900 	/*
2901 	 * Per-packet-info does not move; it only grows.
2902 	 *
2903 	 * NOTE:
2904 	 * rm_pktinfooffset in this phase counts from the beginning
2905 	 * of rndis_packet_msg.
2906 	 */
2907 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2908 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2909 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2910 	    pkt->rm_pktinfolen);
2911 	pkt->rm_pktinfolen += pi_size;
2912 
2913 	pi->rm_size = pi_size;
2914 	pi->rm_type = pi_type;
2915 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2916 
2917 	return (pi->rm_data);
2918 }
2919 
2920 static __inline int
2921 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2922 {
2923 	struct hn_txdesc *txd;
2924 	struct mbuf *m;
2925 	int error, pkts;
2926 
2927 	txd = txr->hn_agg_txd;
2928 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2929 
2930 	/*
2931 	 * Since hn_txpkt() will reset this temporary stat, save
2932 	 * it now, so that oerrors can be updated properly, if
2933 	 * hn_txpkt() ever fails.
2934 	 */
2935 	pkts = txr->hn_stat_pkts;
2936 
2937 	/*
2938 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2939 	 * failure, save it for later freeing, if hn_txpkt() ever
2940 	 * fails.
2941 	 */
2942 	m = txd->m;
2943 	error = hn_txpkt(ifp, txr, txd);
2944 	if (__predict_false(error)) {
2945 		/* txd is freed, but m is not. */
2946 		m_freem(m);
2947 
2948 		txr->hn_flush_failed++;
2949 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2950 	}
2951 
2952 	/* Reset all aggregation states. */
2953 	txr->hn_agg_txd = NULL;
2954 	txr->hn_agg_szleft = 0;
2955 	txr->hn_agg_pktleft = 0;
2956 	txr->hn_agg_prevpkt = NULL;
2957 
2958 	return (error);
2959 }
2960 
2961 static void *
2962 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2963     int pktsize)
2964 {
2965 	void *chim;
2966 
2967 	if (txr->hn_agg_txd != NULL) {
2968 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2969 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2970 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2971 			int olen;
2972 
2973 			/*
2974 			 * Update the previous RNDIS packet's total length,
2975 			 * it can be increased due to the mandatory alignment
2976 			 * padding for this RNDIS packet.  And update the
2977 			 * aggregating txdesc's chimney sending buffer size
2978 			 * accordingly.
2979 			 *
2980 			 * XXX
2981 			 * Zero-out the padding, as required by the RNDIS spec.
2982 			 */
2983 			olen = pkt->rm_len;
2984 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2985 			agg_txd->chim_size += pkt->rm_len - olen;
2986 
2987 			/* Link this txdesc to the parent. */
2988 			hn_txdesc_agg(agg_txd, txd);
2989 
2990 			chim = (uint8_t *)pkt + pkt->rm_len;
2991 			/* Save the current packet for later fixup. */
2992 			txr->hn_agg_prevpkt = chim;
2993 
2994 			txr->hn_agg_pktleft--;
2995 			txr->hn_agg_szleft -= pktsize;
2996 			if (txr->hn_agg_szleft <=
2997 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2998 				/*
2999 				 * Probably can't aggregate more packets,
3000 				 * flush this aggregating txdesc proactively.
3001 				 */
3002 				txr->hn_agg_pktleft = 0;
3003 			}
3004 			/* Done! */
3005 			return (chim);
3006 		}
3007 		hn_flush_txagg(ifp, txr);
3008 	}
3009 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3010 
3011 	txr->hn_tx_chimney_tried++;
3012 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3013 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3014 		return (NULL);
3015 	txr->hn_tx_chimney++;
3016 
3017 	chim = txr->hn_sc->hn_chim +
3018 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3019 
3020 	if (txr->hn_agg_pktmax > 1 &&
3021 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3022 		txr->hn_agg_txd = txd;
3023 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3024 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3025 		txr->hn_agg_prevpkt = chim;
3026 	}
3027 	return (chim);
3028 }
3029 
3030 /*
3031  * NOTE:
3032  * If this function fails, then both txd and m_head0 will be freed.
3033  */
3034 static int
3035 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3036     struct mbuf **m_head0)
3037 {
3038 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3039 	int error, nsegs, i;
3040 	struct mbuf *m_head = *m_head0;
3041 	struct rndis_packet_msg *pkt;
3042 	uint32_t *pi_data;
3043 	void *chim = NULL;
3044 	int pkt_hlen, pkt_size;
3045 
3046 	pkt = txd->rndis_pkt;
3047 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3048 	if (pkt_size < txr->hn_chim_size) {
3049 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3050 		if (chim != NULL)
3051 			pkt = chim;
3052 	} else {
3053 		if (txr->hn_agg_txd != NULL)
3054 			hn_flush_txagg(ifp, txr);
3055 	}
3056 
3057 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3058 	pkt->rm_len = m_head->m_pkthdr.len;
3059 	pkt->rm_dataoffset = 0;
3060 	pkt->rm_datalen = m_head->m_pkthdr.len;
3061 	pkt->rm_oobdataoffset = 0;
3062 	pkt->rm_oobdatalen = 0;
3063 	pkt->rm_oobdataelements = 0;
3064 	pkt->rm_pktinfooffset = sizeof(*pkt);
3065 	pkt->rm_pktinfolen = 0;
3066 	pkt->rm_vchandle = 0;
3067 	pkt->rm_reserved = 0;
3068 
3069 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3070 		/*
3071 		 * Set the hash value for this packet, so that the host could
3072 		 * dispatch the TX done event for this packet back to this TX
3073 		 * ring's channel.
3074 		 */
3075 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3076 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3077 		*pi_data = txr->hn_tx_idx;
3078 	}
3079 
3080 	if (m_head->m_flags & M_VLANTAG) {
3081 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3082 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3083 		*pi_data = NDIS_VLAN_INFO_MAKE(
3084 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3085 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3086 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3087 	}
3088 
3089 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3090 #if defined(INET6) || defined(INET)
3091 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3092 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3093 #ifdef INET
3094 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3095 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3096 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3097 			    m_head->m_pkthdr.tso_segsz);
3098 		}
3099 #endif
3100 #if defined(INET6) && defined(INET)
3101 		else
3102 #endif
3103 #ifdef INET6
3104 		{
3105 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3106 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3107 			    m_head->m_pkthdr.tso_segsz);
3108 		}
3109 #endif
3110 #endif	/* INET6 || INET */
3111 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3112 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3113 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3114 		if (m_head->m_pkthdr.csum_flags &
3115 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3116 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3117 		} else {
3118 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3119 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3120 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3121 		}
3122 
3123 		if (m_head->m_pkthdr.csum_flags &
3124 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3125 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3126 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3127 		} else if (m_head->m_pkthdr.csum_flags &
3128 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3129 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3130 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3131 		}
3132 	}
3133 
3134 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3135 	/* Fixup RNDIS packet message total length */
3136 	pkt->rm_len += pkt_hlen;
3137 	/* Convert RNDIS packet message offsets */
3138 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3139 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3140 
3141 	/*
3142 	 * Fast path: Chimney sending.
3143 	 */
3144 	if (chim != NULL) {
3145 		struct hn_txdesc *tgt_txd = txd;
3146 
3147 		if (txr->hn_agg_txd != NULL) {
3148 			tgt_txd = txr->hn_agg_txd;
3149 #ifdef INVARIANTS
3150 			*m_head0 = NULL;
3151 #endif
3152 		}
3153 
3154 		KASSERT(pkt == chim,
3155 		    ("RNDIS pkt not in chimney sending buffer"));
3156 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3157 		    ("chimney sending buffer is not used"));
3158 		tgt_txd->chim_size += pkt->rm_len;
3159 
3160 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3161 		    ((uint8_t *)chim) + pkt_hlen);
3162 
3163 		txr->hn_gpa_cnt = 0;
3164 		txr->hn_sendpkt = hn_txpkt_chim;
3165 		goto done;
3166 	}
3167 
3168 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3169 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3170 	    ("chimney buffer is used"));
3171 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3172 
3173 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3174 	if (__predict_false(error)) {
3175 		int freed;
3176 
3177 		/*
3178 		 * This mbuf is not linked w/ the txd yet, so free it now.
3179 		 */
3180 		m_freem(m_head);
3181 		*m_head0 = NULL;
3182 
3183 		freed = hn_txdesc_put(txr, txd);
3184 		KASSERT(freed != 0,
3185 		    ("fail to free txd upon txdma error"));
3186 
3187 		txr->hn_txdma_failed++;
3188 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3189 		return error;
3190 	}
3191 	*m_head0 = m_head;
3192 
3193 	/* +1 RNDIS packet message */
3194 	txr->hn_gpa_cnt = nsegs + 1;
3195 
3196 	/* send packet with page buffer */
3197 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3198 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3199 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3200 
3201 	/*
3202 	 * Fill the page buffers with mbuf info after the page
3203 	 * buffer for RNDIS packet message.
3204 	 */
3205 	for (i = 0; i < nsegs; ++i) {
3206 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3207 
3208 		gpa->gpa_page = atop(segs[i].ds_addr);
3209 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3210 		gpa->gpa_len = segs[i].ds_len;
3211 	}
3212 
3213 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3214 	txd->chim_size = 0;
3215 	txr->hn_sendpkt = hn_txpkt_sglist;
3216 done:
3217 	txd->m = m_head;
3218 
3219 	/* Set the completion routine */
3220 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3221 
3222 	/* Update temporary stats for later use. */
3223 	txr->hn_stat_pkts++;
3224 	txr->hn_stat_size += m_head->m_pkthdr.len;
3225 	if (m_head->m_flags & M_MCAST)
3226 		txr->hn_stat_mcasts++;
3227 
3228 	return 0;
3229 }
3230 
3231 /*
3232  * NOTE:
3233  * If this function fails, then txd will be freed, but the mbuf
3234  * associated w/ the txd will _not_ be freed.
3235  */
3236 static int
3237 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3238 {
3239 	int error, send_failed = 0, has_bpf;
3240 
3241 again:
3242 	has_bpf = bpf_peers_present(ifp->if_bpf);
3243 	if (has_bpf) {
3244 		/*
3245 		 * Make sure that this txd and any aggregated txds are not
3246 		 * freed before ETHER_BPF_MTAP.
3247 		 */
3248 		hn_txdesc_hold(txd);
3249 	}
3250 	error = txr->hn_sendpkt(txr, txd);
3251 	if (!error) {
3252 		if (has_bpf) {
3253 			const struct hn_txdesc *tmp_txd;
3254 
3255 			ETHER_BPF_MTAP(ifp, txd->m);
3256 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3257 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3258 		}
3259 
3260 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3261 #ifdef HN_IFSTART_SUPPORT
3262 		if (!hn_use_if_start)
3263 #endif
3264 		{
3265 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3266 			    txr->hn_stat_size);
3267 			if (txr->hn_stat_mcasts != 0) {
3268 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3269 				    txr->hn_stat_mcasts);
3270 			}
3271 		}
3272 		txr->hn_pkts += txr->hn_stat_pkts;
3273 		txr->hn_sends++;
3274 	}
3275 	if (has_bpf)
3276 		hn_txdesc_put(txr, txd);
3277 
3278 	if (__predict_false(error)) {
3279 		int freed;
3280 
3281 		/*
3282 		 * This should "really rarely" happen.
3283 		 *
3284 		 * XXX Too many RX to be acked or too many sideband
3285 		 * commands to run?  Ask netvsc_channel_rollup()
3286 		 * to kick start later.
3287 		 */
3288 		txr->hn_has_txeof = 1;
3289 		if (!send_failed) {
3290 			txr->hn_send_failed++;
3291 			send_failed = 1;
3292 			/*
3293 			 * Try sending again after set hn_has_txeof;
3294 			 * in case that we missed the last
3295 			 * netvsc_channel_rollup().
3296 			 */
3297 			goto again;
3298 		}
3299 		if_printf(ifp, "send failed\n");
3300 
3301 		/*
3302 		 * Caller will perform further processing on the
3303 		 * associated mbuf, so don't free it in hn_txdesc_put();
3304 		 * only unload it from the DMA map in hn_txdesc_put(),
3305 		 * if it was loaded.
3306 		 */
3307 		txd->m = NULL;
3308 		freed = hn_txdesc_put(txr, txd);
3309 		KASSERT(freed != 0,
3310 		    ("fail to free txd upon send error"));
3311 
3312 		txr->hn_send_failed++;
3313 	}
3314 
3315 	/* Reset temporary stats, after this sending is done. */
3316 	txr->hn_stat_size = 0;
3317 	txr->hn_stat_pkts = 0;
3318 	txr->hn_stat_mcasts = 0;
3319 
3320 	return (error);
3321 }
3322 
3323 /*
3324  * Append the specified data to the indicated mbuf chain,
3325  * Extend the mbuf chain if the new data does not fit in
3326  * existing space.
3327  *
3328  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3329  * There should be an equivalent in the kernel mbuf code,
3330  * but there does not appear to be one yet.
3331  *
3332  * Differs from m_append() in that additional mbufs are
3333  * allocated with cluster size MJUMPAGESIZE, and filled
3334  * accordingly.
3335  *
3336  * Return 1 if able to complete the job; otherwise 0.
3337  */
3338 static int
3339 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3340 {
3341 	struct mbuf *m, *n;
3342 	int remainder, space;
3343 
3344 	for (m = m0; m->m_next != NULL; m = m->m_next)
3345 		;
3346 	remainder = len;
3347 	space = M_TRAILINGSPACE(m);
3348 	if (space > 0) {
3349 		/*
3350 		 * Copy into available space.
3351 		 */
3352 		if (space > remainder)
3353 			space = remainder;
3354 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3355 		m->m_len += space;
3356 		cp += space;
3357 		remainder -= space;
3358 	}
3359 	while (remainder > 0) {
3360 		/*
3361 		 * Allocate a new mbuf; could check space
3362 		 * and allocate a cluster instead.
3363 		 */
3364 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3365 		if (n == NULL)
3366 			break;
3367 		n->m_len = min(MJUMPAGESIZE, remainder);
3368 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3369 		cp += n->m_len;
3370 		remainder -= n->m_len;
3371 		m->m_next = n;
3372 		m = n;
3373 	}
3374 	if (m0->m_flags & M_PKTHDR)
3375 		m0->m_pkthdr.len += len - remainder;
3376 
3377 	return (remainder == 0);
3378 }
3379 
#if defined(INET) || defined(INET6)
/*
 * Feed a received mbuf to LRO.  When hn_lro_mbufq_depth is set (and the
 * kernel is recent enough) the mbuf is only queued and 0 is returned;
 * otherwise the tcp_lro_rx() result is returned.
 */
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth != 0) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
#endif
3393 
3394 static int
3395 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3396     const struct hn_rxinfo *info)
3397 {
3398 	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3399 	struct mbuf *m_new;
3400 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3401 	int hash_type = M_HASHTYPE_NONE;
3402 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3403 
3404 	ifp = hn_ifp;
3405 	if (rxr->hn_rxvf_ifp != NULL) {
3406 		/*
3407 		 * Non-transparent mode VF; pretend this packet is from
3408 		 * the VF.
3409 		 */
3410 		ifp = rxr->hn_rxvf_ifp;
3411 		is_vf = 1;
3412 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3413 		/* Transparent mode VF. */
3414 		is_vf = 1;
3415 	}
3416 
3417 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3418 		/*
3419 		 * NOTE:
3420 		 * See the NOTE of hn_rndis_init_fixat().  This
3421 		 * function can be reached, immediately after the
3422 		 * RNDIS is initialized but before the ifnet is
3423 		 * setup on the hn_attach() path; drop the unexpected
3424 		 * packets.
3425 		 */
3426 		return (0);
3427 	}
3428 
3429 	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3430 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3431 		return (0);
3432 	}
3433 
3434 	if (dlen <= MHLEN) {
3435 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3436 		if (m_new == NULL) {
3437 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3438 			return (0);
3439 		}
3440 		memcpy(mtod(m_new, void *), data, dlen);
3441 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3442 		rxr->hn_small_pkts++;
3443 	} else {
3444 		/*
3445 		 * Get an mbuf with a cluster.  For packets 2K or less,
3446 		 * get a standard 2K cluster.  For anything larger, get a
3447 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3448 		 * if looped around to the Hyper-V TX channel, so avoid them.
3449 		 */
3450 		size = MCLBYTES;
3451 		if (dlen > MCLBYTES) {
3452 			/* 4096 */
3453 			size = MJUMPAGESIZE;
3454 		}
3455 
3456 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3457 		if (m_new == NULL) {
3458 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3459 			return (0);
3460 		}
3461 
3462 		hv_m_append(m_new, dlen, data);
3463 	}
3464 	m_new->m_pkthdr.rcvif = ifp;
3465 
3466 	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3467 		do_csum = 0;
3468 
3469 	/* receive side checksum offload */
3470 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3471 		/* IP csum offload */
3472 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3473 			m_new->m_pkthdr.csum_flags |=
3474 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3475 			rxr->hn_csum_ip++;
3476 		}
3477 
3478 		/* TCP/UDP csum offload */
3479 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3480 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3481 			m_new->m_pkthdr.csum_flags |=
3482 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3483 			m_new->m_pkthdr.csum_data = 0xffff;
3484 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3485 				rxr->hn_csum_tcp++;
3486 			else
3487 				rxr->hn_csum_udp++;
3488 		}
3489 
3490 		/*
3491 		 * XXX
3492 		 * As of this write (Oct 28th, 2016), host side will turn
3493 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3494 		 * the do_lro setting here is actually _not_ accurate.  We
3495 		 * depend on the RSS hash type check to reset do_lro.
3496 		 */
3497 		if ((info->csum_info &
3498 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3499 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3500 			do_lro = 1;
3501 	} else {
3502 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3503 		if (l3proto == ETHERTYPE_IP) {
3504 			if (l4proto == IPPROTO_TCP) {
3505 				if (do_csum &&
3506 				    (rxr->hn_trust_hcsum &
3507 				     HN_TRUST_HCSUM_TCP)) {
3508 					rxr->hn_csum_trusted++;
3509 					m_new->m_pkthdr.csum_flags |=
3510 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3511 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3512 					m_new->m_pkthdr.csum_data = 0xffff;
3513 				}
3514 				do_lro = 1;
3515 			} else if (l4proto == IPPROTO_UDP) {
3516 				if (do_csum &&
3517 				    (rxr->hn_trust_hcsum &
3518 				     HN_TRUST_HCSUM_UDP)) {
3519 					rxr->hn_csum_trusted++;
3520 					m_new->m_pkthdr.csum_flags |=
3521 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3522 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3523 					m_new->m_pkthdr.csum_data = 0xffff;
3524 				}
3525 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3526 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3527 				rxr->hn_csum_trusted++;
3528 				m_new->m_pkthdr.csum_flags |=
3529 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3530 			}
3531 		}
3532 	}
3533 
3534 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3535 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3536 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3537 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3538 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3539 		m_new->m_flags |= M_VLANTAG;
3540 	}
3541 
3542 	/*
3543 	 * If VF is activated (tranparent/non-transparent mode does not
3544 	 * matter here).
3545 	 *
3546 	 * - Disable LRO
3547 	 *
3548 	 *   hn(4) will only receive broadcast packets, multicast packets,
3549 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3550 	 *   packet types.
3551 	 *
3552 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3553 	 *   all, since the LRO flush will use hn(4) as the receiving
3554 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3555 	 */
3556 	if (is_vf)
3557 		do_lro = 0;
3558 
3559 	/*
3560 	 * If VF is activated (tranparent/non-transparent mode does not
3561 	 * matter here), do _not_ mess with unsupported hash types or
3562 	 * functions.
3563 	 */
3564 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3565 		rxr->hn_rss_pkts++;
3566 		m_new->m_pkthdr.flowid = info->hash_value;
3567 		if (!is_vf)
3568 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3569 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3570 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3571 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3572 			    rxr->hn_mbuf_hash);
3573 
3574 			/*
3575 			 * NOTE:
3576 			 * do_lro is resetted, if the hash types are not TCP
3577 			 * related.  See the comment in the above csum_flags
3578 			 * setup section.
3579 			 */
3580 			switch (type) {
3581 			case NDIS_HASH_IPV4:
3582 				hash_type = M_HASHTYPE_RSS_IPV4;
3583 				do_lro = 0;
3584 				break;
3585 
3586 			case NDIS_HASH_TCP_IPV4:
3587 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3588 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3589 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3590 
3591 					if (is_vf)
3592 						def_htype = M_HASHTYPE_NONE;
3593 
3594 					/*
3595 					 * UDP 4-tuple hash is delivered as
3596 					 * TCP 4-tuple hash.
3597 					 */
3598 					if (l3proto == ETHERTYPE_MAX) {
3599 						hn_rxpkt_proto(m_new,
3600 						    &l3proto, &l4proto);
3601 					}
3602 					if (l3proto == ETHERTYPE_IP) {
3603 						if (l4proto == IPPROTO_UDP &&
3604 						    (rxr->hn_mbuf_hash &
3605 						     NDIS_HASH_UDP_IPV4_X)) {
3606 							hash_type =
3607 							M_HASHTYPE_RSS_UDP_IPV4;
3608 							do_lro = 0;
3609 						} else if (l4proto !=
3610 						    IPPROTO_TCP) {
3611 							hash_type = def_htype;
3612 							do_lro = 0;
3613 						}
3614 					} else {
3615 						hash_type = def_htype;
3616 						do_lro = 0;
3617 					}
3618 				}
3619 				break;
3620 
3621 			case NDIS_HASH_IPV6:
3622 				hash_type = M_HASHTYPE_RSS_IPV6;
3623 				do_lro = 0;
3624 				break;
3625 
3626 			case NDIS_HASH_IPV6_EX:
3627 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3628 				do_lro = 0;
3629 				break;
3630 
3631 			case NDIS_HASH_TCP_IPV6:
3632 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3633 				break;
3634 
3635 			case NDIS_HASH_TCP_IPV6_EX:
3636 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3637 				break;
3638 			}
3639 		}
3640 	} else if (!is_vf) {
3641 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3642 		hash_type = M_HASHTYPE_OPAQUE;
3643 	}
3644 	M_HASHTYPE_SET(m_new, hash_type);
3645 
3646 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3647 	if (hn_ifp != ifp) {
3648 		const struct ether_header *eh;
3649 
3650 		/*
3651 		 * Non-transparent mode VF is activated.
3652 		 */
3653 
3654 		/*
3655 		 * Allow tapping on hn(4).
3656 		 */
3657 		ETHER_BPF_MTAP(hn_ifp, m_new);
3658 
3659 		/*
3660 		 * Update hn(4)'s stats.
3661 		 */
3662 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3663 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3664 		/* Checked at the beginning of this function. */
3665 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3666 		eh = mtod(m_new, struct ether_header *);
3667 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3668 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3669 	}
3670 	rxr->hn_pkts++;
3671 
3672 	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3673 #if defined(INET) || defined(INET6)
3674 		struct lro_ctrl *lro = &rxr->hn_lro;
3675 
3676 		if (lro->lro_cnt) {
3677 			rxr->hn_lro_tried++;
3678 			if (hn_lro_rx(lro, m_new) == 0) {
3679 				/* DONE! */
3680 				return 0;
3681 			}
3682 		}
3683 #endif
3684 	}
3685 	ifp->if_input(ifp, m_new);
3686 
3687 	return (0);
3688 }
3689 
3690 static int
3691 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3692 {
3693 	struct hn_softc *sc = ifp->if_softc;
3694 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3695 	struct ifnet *vf_ifp;
3696 	int mask, error = 0;
3697 	struct ifrsskey *ifrk;
3698 	struct ifrsshash *ifrh;
3699 	uint32_t mtu;
3700 
3701 	switch (cmd) {
3702 	case SIOCSIFMTU:
3703 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3704 			error = EINVAL;
3705 			break;
3706 		}
3707 
3708 		HN_LOCK(sc);
3709 
3710 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3711 			HN_UNLOCK(sc);
3712 			break;
3713 		}
3714 
3715 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3716 			/* Can't change MTU */
3717 			HN_UNLOCK(sc);
3718 			error = EOPNOTSUPP;
3719 			break;
3720 		}
3721 
3722 		if (ifp->if_mtu == ifr->ifr_mtu) {
3723 			HN_UNLOCK(sc);
3724 			break;
3725 		}
3726 
3727 		if (hn_xpnt_vf_isready(sc)) {
3728 			vf_ifp = sc->hn_vf_ifp;
3729 			ifr_vf = *ifr;
3730 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3731 			    sizeof(ifr_vf.ifr_name));
3732 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3733 			    (caddr_t)&ifr_vf);
3734 			if (error) {
3735 				HN_UNLOCK(sc);
3736 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3737 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3738 				break;
3739 			}
3740 		}
3741 
3742 		/*
3743 		 * Suspend this interface before the synthetic parts
3744 		 * are ripped.
3745 		 */
3746 		hn_suspend(sc);
3747 
3748 		/*
3749 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3750 		 */
3751 		hn_synth_detach(sc);
3752 
3753 		/*
3754 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3755 		 * with the new MTU setting.
3756 		 */
3757 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3758 		if (error) {
3759 			HN_UNLOCK(sc);
3760 			break;
3761 		}
3762 
3763 		error = hn_rndis_get_mtu(sc, &mtu);
3764 		if (error)
3765 			mtu = ifr->ifr_mtu;
3766 		else if (bootverbose)
3767 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3768 
3769 		/*
3770 		 * Commit the requested MTU, after the synthetic parts
3771 		 * have been successfully attached.
3772 		 */
3773 		if (mtu >= ifr->ifr_mtu) {
3774 			mtu = ifr->ifr_mtu;
3775 		} else {
3776 			if_printf(ifp, "fixup mtu %d -> %u\n",
3777 			    ifr->ifr_mtu, mtu);
3778 		}
3779 		ifp->if_mtu = mtu;
3780 
3781 		/*
3782 		 * Synthetic parts' reattach may change the chimney
3783 		 * sending size; update it.
3784 		 */
3785 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3786 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3787 
3788 		/*
3789 		 * Make sure that various parameters based on MTU are
3790 		 * still valid, after the MTU change.
3791 		 */
3792 		hn_mtu_change_fixup(sc);
3793 
3794 		/*
3795 		 * All done!  Resume the interface now.
3796 		 */
3797 		hn_resume(sc);
3798 
3799 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3800 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3801 			/*
3802 			 * Since we have reattached the NVS part,
3803 			 * change the datapath to VF again; in case
3804 			 * that it is lost, after the NVS was detached.
3805 			 */
3806 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3807 		}
3808 
3809 		HN_UNLOCK(sc);
3810 		break;
3811 
3812 	case SIOCSIFFLAGS:
3813 		HN_LOCK(sc);
3814 
3815 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3816 			HN_UNLOCK(sc);
3817 			break;
3818 		}
3819 
3820 		if (hn_xpnt_vf_isready(sc))
3821 			hn_xpnt_vf_saveifflags(sc);
3822 
3823 		if (ifp->if_flags & IFF_UP) {
3824 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3825 				/*
3826 				 * Caller meight hold mutex, e.g.
3827 				 * bpf; use busy-wait for the RNDIS
3828 				 * reply.
3829 				 */
3830 				HN_NO_SLEEPING(sc);
3831 				hn_rxfilter_config(sc);
3832 				HN_SLEEPING_OK(sc);
3833 
3834 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3835 					error = hn_xpnt_vf_iocsetflags(sc);
3836 			} else {
3837 				hn_init_locked(sc);
3838 			}
3839 		} else {
3840 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3841 				hn_stop(sc, false);
3842 		}
3843 		sc->hn_if_flags = ifp->if_flags;
3844 
3845 		HN_UNLOCK(sc);
3846 		break;
3847 
3848 	case SIOCSIFCAP:
3849 		HN_LOCK(sc);
3850 
3851 		if (hn_xpnt_vf_isready(sc)) {
3852 			ifr_vf = *ifr;
3853 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3854 			    sizeof(ifr_vf.ifr_name));
3855 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3856 			HN_UNLOCK(sc);
3857 			break;
3858 		}
3859 
3860 		/*
3861 		 * Fix up requested capabilities w/ supported capabilities,
3862 		 * since the supported capabilities could have been changed.
3863 		 */
3864 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3865 		    ifp->if_capenable;
3866 
3867 		if (mask & IFCAP_TXCSUM) {
3868 			ifp->if_capenable ^= IFCAP_TXCSUM;
3869 			if (ifp->if_capenable & IFCAP_TXCSUM)
3870 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3871 			else
3872 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3873 		}
3874 		if (mask & IFCAP_TXCSUM_IPV6) {
3875 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3876 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3877 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3878 			else
3879 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3880 		}
3881 
3882 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3883 		if (mask & IFCAP_RXCSUM)
3884 			ifp->if_capenable ^= IFCAP_RXCSUM;
3885 #ifdef foo
3886 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3887 		if (mask & IFCAP_RXCSUM_IPV6)
3888 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3889 #endif
3890 
3891 		if (mask & IFCAP_LRO)
3892 			ifp->if_capenable ^= IFCAP_LRO;
3893 
3894 		if (mask & IFCAP_TSO4) {
3895 			ifp->if_capenable ^= IFCAP_TSO4;
3896 			if (ifp->if_capenable & IFCAP_TSO4)
3897 				ifp->if_hwassist |= CSUM_IP_TSO;
3898 			else
3899 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3900 		}
3901 		if (mask & IFCAP_TSO6) {
3902 			ifp->if_capenable ^= IFCAP_TSO6;
3903 			if (ifp->if_capenable & IFCAP_TSO6)
3904 				ifp->if_hwassist |= CSUM_IP6_TSO;
3905 			else
3906 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3907 		}
3908 
3909 		HN_UNLOCK(sc);
3910 		break;
3911 
3912 	case SIOCADDMULTI:
3913 	case SIOCDELMULTI:
3914 		HN_LOCK(sc);
3915 
3916 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3917 			HN_UNLOCK(sc);
3918 			break;
3919 		}
3920 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3921 			/*
3922 			 * Multicast uses mutex; use busy-wait for
3923 			 * the RNDIS reply.
3924 			 */
3925 			HN_NO_SLEEPING(sc);
3926 			hn_rxfilter_config(sc);
3927 			HN_SLEEPING_OK(sc);
3928 		}
3929 
3930 		/* XXX vlan(4) style mcast addr maintenance */
3931 		if (hn_xpnt_vf_isready(sc)) {
3932 			int old_if_flags;
3933 
3934 			old_if_flags = sc->hn_vf_ifp->if_flags;
3935 			hn_xpnt_vf_saveifflags(sc);
3936 
3937 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3938 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3939 			     IFF_ALLMULTI))
3940 				error = hn_xpnt_vf_iocsetflags(sc);
3941 		}
3942 
3943 		HN_UNLOCK(sc);
3944 		break;
3945 
3946 	case SIOCSIFMEDIA:
3947 	case SIOCGIFMEDIA:
3948 		HN_LOCK(sc);
3949 		if (hn_xpnt_vf_isready(sc)) {
3950 			/*
3951 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3952 			 * create and pass ifr_vf to the VF here; just
3953 			 * replace the ifr_name.
3954 			 */
3955 			vf_ifp = sc->hn_vf_ifp;
3956 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3957 			    sizeof(ifr->ifr_name));
3958 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3959 			/* Restore the ifr_name. */
3960 			strlcpy(ifr->ifr_name, ifp->if_xname,
3961 			    sizeof(ifr->ifr_name));
3962 			HN_UNLOCK(sc);
3963 			break;
3964 		}
3965 		HN_UNLOCK(sc);
3966 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3967 		break;
3968 
3969 	case SIOCGIFRSSHASH:
3970 		ifrh = (struct ifrsshash *)data;
3971 		HN_LOCK(sc);
3972 		if (sc->hn_rx_ring_inuse == 1) {
3973 			HN_UNLOCK(sc);
3974 			ifrh->ifrh_func = RSS_FUNC_NONE;
3975 			ifrh->ifrh_types = 0;
3976 			break;
3977 		}
3978 
3979 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3980 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3981 		else
3982 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3983 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3984 		HN_UNLOCK(sc);
3985 		break;
3986 
3987 	case SIOCGIFRSSKEY:
3988 		ifrk = (struct ifrsskey *)data;
3989 		HN_LOCK(sc);
3990 		if (sc->hn_rx_ring_inuse == 1) {
3991 			HN_UNLOCK(sc);
3992 			ifrk->ifrk_func = RSS_FUNC_NONE;
3993 			ifrk->ifrk_keylen = 0;
3994 			break;
3995 		}
3996 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3997 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3998 		else
3999 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4000 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4001 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4002 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4003 		HN_UNLOCK(sc);
4004 		break;
4005 
4006 	default:
4007 		error = ether_ioctl(ifp, cmd, data);
4008 		break;
4009 	}
4010 	return (error);
4011 }
4012 
4013 static void
4014 hn_stop(struct hn_softc *sc, bool detaching)
4015 {
4016 	struct ifnet *ifp = sc->hn_ifp;
4017 	int i;
4018 
4019 	HN_LOCK_ASSERT(sc);
4020 
4021 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4022 	    ("synthetic parts were not attached"));
4023 
4024 	/* Clear RUNNING bit ASAP. */
4025 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4026 
4027 	/* Disable polling. */
4028 	hn_polling(sc, 0);
4029 
4030 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4031 		KASSERT(sc->hn_vf_ifp != NULL,
4032 		    ("%s: VF is not attached", ifp->if_xname));
4033 
4034 		/* Mark transparent mode VF as disabled. */
4035 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4036 
4037 		/*
4038 		 * NOTE:
4039 		 * Datapath setting must happen _before_ bringing
4040 		 * the VF down.
4041 		 */
4042 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4043 
4044 		/*
4045 		 * Bring the VF down.
4046 		 */
4047 		hn_xpnt_vf_saveifflags(sc);
4048 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4049 		hn_xpnt_vf_iocsetflags(sc);
4050 	}
4051 
4052 	/* Suspend data transfers. */
4053 	hn_suspend_data(sc);
4054 
4055 	/* Clear OACTIVE bit. */
4056 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4057 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4058 		sc->hn_tx_ring[i].hn_oactive = 0;
4059 
4060 	/*
4061 	 * If the non-transparent mode VF is active, make sure
4062 	 * that the RX filter still allows packet reception.
4063 	 */
4064 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4065 		hn_rxfilter_config(sc);
4066 }
4067 
4068 static void
4069 hn_init_locked(struct hn_softc *sc)
4070 {
4071 	struct ifnet *ifp = sc->hn_ifp;
4072 	int i;
4073 
4074 	HN_LOCK_ASSERT(sc);
4075 
4076 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4077 		return;
4078 
4079 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4080 		return;
4081 
4082 	/* Configure RX filter */
4083 	hn_rxfilter_config(sc);
4084 
4085 	/* Clear OACTIVE bit. */
4086 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4087 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4088 		sc->hn_tx_ring[i].hn_oactive = 0;
4089 
4090 	/* Clear TX 'suspended' bit. */
4091 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4092 
4093 	if (hn_xpnt_vf_isready(sc)) {
4094 		/* Initialize transparent VF. */
4095 		hn_xpnt_vf_init(sc);
4096 	}
4097 
4098 	/* Everything is ready; unleash! */
4099 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4100 
4101 	/* Re-enable polling if requested. */
4102 	if (sc->hn_pollhz > 0)
4103 		hn_polling(sc, sc->hn_pollhz);
4104 }
4105 
/*
 * Locked wrapper around hn_init_locked(); 'xsc' is the softc.
 */
static void
hn_init(void *xsc)
{
	struct hn_softc *softc;

	softc = xsc;
	HN_LOCK(softc);
	hn_init_locked(softc);
	HN_UNLOCK(softc);
}
4115 
4116 #if __FreeBSD_version >= 1100099
4117 
4118 static int
4119 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4120 {
4121 	struct hn_softc *sc = arg1;
4122 	unsigned int lenlim;
4123 	int error;
4124 
4125 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4126 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4127 	if (error || req->newptr == NULL)
4128 		return error;
4129 
4130 	HN_LOCK(sc);
4131 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4132 	    lenlim > TCP_LRO_LENGTH_MAX) {
4133 		HN_UNLOCK(sc);
4134 		return EINVAL;
4135 	}
4136 	hn_set_lro_lenlim(sc, lenlim);
4137 	HN_UNLOCK(sc);
4138 
4139 	return 0;
4140 }
4141 
4142 static int
4143 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4144 {
4145 	struct hn_softc *sc = arg1;
4146 	int ackcnt, error, i;
4147 
4148 	/*
4149 	 * lro_ackcnt_lim is append count limit,
4150 	 * +1 to turn it into aggregation limit.
4151 	 */
4152 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4153 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4154 	if (error || req->newptr == NULL)
4155 		return error;
4156 
4157 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4158 		return EINVAL;
4159 
4160 	/*
4161 	 * Convert aggregation limit back to append
4162 	 * count limit.
4163 	 */
4164 	--ackcnt;
4165 	HN_LOCK(sc);
4166 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4167 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4168 	HN_UNLOCK(sc);
4169 	return 0;
4170 }
4171 
4172 #endif
4173 
4174 static int
4175 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4176 {
4177 	struct hn_softc *sc = arg1;
4178 	int hcsum = arg2;
4179 	int on, error, i;
4180 
4181 	on = 0;
4182 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4183 		on = 1;
4184 
4185 	error = sysctl_handle_int(oidp, &on, 0, req);
4186 	if (error || req->newptr == NULL)
4187 		return error;
4188 
4189 	HN_LOCK(sc);
4190 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4191 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4192 
4193 		if (on)
4194 			rxr->hn_trust_hcsum |= hcsum;
4195 		else
4196 			rxr->hn_trust_hcsum &= ~hcsum;
4197 	}
4198 	HN_UNLOCK(sc);
4199 	return 0;
4200 }
4201 
4202 static int
4203 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4204 {
4205 	struct hn_softc *sc = arg1;
4206 	int chim_size, error;
4207 
4208 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4209 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4210 	if (error || req->newptr == NULL)
4211 		return error;
4212 
4213 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4214 		return EINVAL;
4215 
4216 	HN_LOCK(sc);
4217 	hn_set_chim_size(sc, chim_size);
4218 	HN_UNLOCK(sc);
4219 	return 0;
4220 }
4221 
4222 #if __FreeBSD_version < 1100095
4223 static int
4224 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4225 {
4226 	struct hn_softc *sc = arg1;
4227 	int ofs = arg2, i, error;
4228 	struct hn_rx_ring *rxr;
4229 	uint64_t stat;
4230 
4231 	stat = 0;
4232 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4233 		rxr = &sc->hn_rx_ring[i];
4234 		stat += *((int *)((uint8_t *)rxr + ofs));
4235 	}
4236 
4237 	error = sysctl_handle_64(oidp, &stat, 0, req);
4238 	if (error || req->newptr == NULL)
4239 		return error;
4240 
4241 	/* Zero out this stat. */
4242 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4243 		rxr = &sc->hn_rx_ring[i];
4244 		*((int *)((uint8_t *)rxr + ofs)) = 0;
4245 	}
4246 	return 0;
4247 }
4248 #else
4249 static int
4250 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4251 {
4252 	struct hn_softc *sc = arg1;
4253 	int ofs = arg2, i, error;
4254 	struct hn_rx_ring *rxr;
4255 	uint64_t stat;
4256 
4257 	stat = 0;
4258 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4259 		rxr = &sc->hn_rx_ring[i];
4260 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4261 	}
4262 
4263 	error = sysctl_handle_64(oidp, &stat, 0, req);
4264 	if (error || req->newptr == NULL)
4265 		return error;
4266 
4267 	/* Zero out this stat. */
4268 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4269 		rxr = &sc->hn_rx_ring[i];
4270 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4271 	}
4272 	return 0;
4273 }
4274 
4275 #endif
4276 
4277 static int
4278 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4279 {
4280 	struct hn_softc *sc = arg1;
4281 	int ofs = arg2, i, error;
4282 	struct hn_rx_ring *rxr;
4283 	u_long stat;
4284 
4285 	stat = 0;
4286 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4287 		rxr = &sc->hn_rx_ring[i];
4288 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4289 	}
4290 
4291 	error = sysctl_handle_long(oidp, &stat, 0, req);
4292 	if (error || req->newptr == NULL)
4293 		return error;
4294 
4295 	/* Zero out this stat. */
4296 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4297 		rxr = &sc->hn_rx_ring[i];
4298 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4299 	}
4300 	return 0;
4301 }
4302 
4303 static int
4304 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4305 {
4306 	struct hn_softc *sc = arg1;
4307 	int ofs = arg2, i, error;
4308 	struct hn_tx_ring *txr;
4309 	u_long stat;
4310 
4311 	stat = 0;
4312 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4313 		txr = &sc->hn_tx_ring[i];
4314 		stat += *((u_long *)((uint8_t *)txr + ofs));
4315 	}
4316 
4317 	error = sysctl_handle_long(oidp, &stat, 0, req);
4318 	if (error || req->newptr == NULL)
4319 		return error;
4320 
4321 	/* Zero out this stat. */
4322 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4323 		txr = &sc->hn_tx_ring[i];
4324 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4325 	}
4326 	return 0;
4327 }
4328 
4329 static int
4330 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4331 {
4332 	struct hn_softc *sc = arg1;
4333 	int ofs = arg2, i, error, conf;
4334 	struct hn_tx_ring *txr;
4335 
4336 	txr = &sc->hn_tx_ring[0];
4337 	conf = *((int *)((uint8_t *)txr + ofs));
4338 
4339 	error = sysctl_handle_int(oidp, &conf, 0, req);
4340 	if (error || req->newptr == NULL)
4341 		return error;
4342 
4343 	HN_LOCK(sc);
4344 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4345 		txr = &sc->hn_tx_ring[i];
4346 		*((int *)((uint8_t *)txr + ofs)) = conf;
4347 	}
4348 	HN_UNLOCK(sc);
4349 
4350 	return 0;
4351 }
4352 
4353 static int
4354 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4355 {
4356 	struct hn_softc *sc = arg1;
4357 	int error, size;
4358 
4359 	size = sc->hn_agg_size;
4360 	error = sysctl_handle_int(oidp, &size, 0, req);
4361 	if (error || req->newptr == NULL)
4362 		return (error);
4363 
4364 	HN_LOCK(sc);
4365 	sc->hn_agg_size = size;
4366 	hn_set_txagg(sc);
4367 	HN_UNLOCK(sc);
4368 
4369 	return (0);
4370 }
4371 
4372 static int
4373 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4374 {
4375 	struct hn_softc *sc = arg1;
4376 	int error, pkts;
4377 
4378 	pkts = sc->hn_agg_pkts;
4379 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4380 	if (error || req->newptr == NULL)
4381 		return (error);
4382 
4383 	HN_LOCK(sc);
4384 	sc->hn_agg_pkts = pkts;
4385 	hn_set_txagg(sc);
4386 	HN_UNLOCK(sc);
4387 
4388 	return (0);
4389 }
4390 
4391 static int
4392 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4393 {
4394 	struct hn_softc *sc = arg1;
4395 	int pkts;
4396 
4397 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4398 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4399 }
4400 
4401 static int
4402 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4403 {
4404 	struct hn_softc *sc = arg1;
4405 	int align;
4406 
4407 	align = sc->hn_tx_ring[0].hn_agg_align;
4408 	return (sysctl_handle_int(oidp, &align, 0, req));
4409 }
4410 
4411 static void
4412 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4413 {
4414 	if (pollhz == 0)
4415 		vmbus_chan_poll_disable(chan);
4416 	else
4417 		vmbus_chan_poll_enable(chan, pollhz);
4418 }
4419 
4420 static void
4421 hn_polling(struct hn_softc *sc, u_int pollhz)
4422 {
4423 	int nsubch = sc->hn_rx_ring_inuse - 1;
4424 
4425 	HN_LOCK_ASSERT(sc);
4426 
4427 	if (nsubch > 0) {
4428 		struct vmbus_channel **subch;
4429 		int i;
4430 
4431 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4432 		for (i = 0; i < nsubch; ++i)
4433 			hn_chan_polling(subch[i], pollhz);
4434 		vmbus_subchan_rel(subch, nsubch);
4435 	}
4436 	hn_chan_polling(sc->hn_prichan, pollhz);
4437 }
4438 
4439 static int
4440 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4441 {
4442 	struct hn_softc *sc = arg1;
4443 	int pollhz, error;
4444 
4445 	pollhz = sc->hn_pollhz;
4446 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4447 	if (error || req->newptr == NULL)
4448 		return (error);
4449 
4450 	if (pollhz != 0 &&
4451 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4452 		return (EINVAL);
4453 
4454 	HN_LOCK(sc);
4455 	if (sc->hn_pollhz != pollhz) {
4456 		sc->hn_pollhz = pollhz;
4457 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4458 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4459 			hn_polling(sc, sc->hn_pollhz);
4460 	}
4461 	HN_UNLOCK(sc);
4462 
4463 	return (0);
4464 }
4465 
4466 static int
4467 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4468 {
4469 	struct hn_softc *sc = arg1;
4470 	char verstr[16];
4471 
4472 	snprintf(verstr, sizeof(verstr), "%u.%u",
4473 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4474 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4475 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4476 }
4477 
4478 static int
4479 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4480 {
4481 	struct hn_softc *sc = arg1;
4482 	char caps_str[128];
4483 	uint32_t caps;
4484 
4485 	HN_LOCK(sc);
4486 	caps = sc->hn_caps;
4487 	HN_UNLOCK(sc);
4488 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4489 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4490 }
4491 
4492 static int
4493 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4494 {
4495 	struct hn_softc *sc = arg1;
4496 	char assist_str[128];
4497 	uint32_t hwassist;
4498 
4499 	HN_LOCK(sc);
4500 	hwassist = sc->hn_ifp->if_hwassist;
4501 	HN_UNLOCK(sc);
4502 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4503 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4504 }
4505 
4506 static int
4507 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4508 {
4509 	struct hn_softc *sc = arg1;
4510 	char filter_str[128];
4511 	uint32_t filter;
4512 
4513 	HN_LOCK(sc);
4514 	filter = sc->hn_rx_filter;
4515 	HN_UNLOCK(sc);
4516 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4517 	    NDIS_PACKET_TYPES);
4518 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4519 }
4520 
4521 #ifndef RSS
4522 
4523 static int
4524 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4525 {
4526 	struct hn_softc *sc = arg1;
4527 	int error;
4528 
4529 	HN_LOCK(sc);
4530 
4531 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4532 	if (error || req->newptr == NULL)
4533 		goto back;
4534 
4535 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4536 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4537 		/*
4538 		 * RSS key is synchronized w/ VF's, don't allow users
4539 		 * to change it.
4540 		 */
4541 		error = EBUSY;
4542 		goto back;
4543 	}
4544 
4545 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4546 	if (error)
4547 		goto back;
4548 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4549 
4550 	if (sc->hn_rx_ring_inuse > 1) {
4551 		error = hn_rss_reconfig(sc);
4552 	} else {
4553 		/* Not RSS capable, at least for now; just save the RSS key. */
4554 		error = 0;
4555 	}
4556 back:
4557 	HN_UNLOCK(sc);
4558 	return (error);
4559 }
4560 
4561 static int
4562 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4563 {
4564 	struct hn_softc *sc = arg1;
4565 	int error;
4566 
4567 	HN_LOCK(sc);
4568 
4569 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4570 	if (error || req->newptr == NULL)
4571 		goto back;
4572 
4573 	/*
4574 	 * Don't allow RSS indirect table change, if this interface is not
4575 	 * RSS capable currently.
4576 	 */
4577 	if (sc->hn_rx_ring_inuse == 1) {
4578 		error = EOPNOTSUPP;
4579 		goto back;
4580 	}
4581 
4582 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4583 	if (error)
4584 		goto back;
4585 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4586 
4587 	hn_rss_ind_fixup(sc);
4588 	error = hn_rss_reconfig(sc);
4589 back:
4590 	HN_UNLOCK(sc);
4591 	return (error);
4592 }
4593 
4594 #endif	/* !RSS */
4595 
4596 static int
4597 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4598 {
4599 	struct hn_softc *sc = arg1;
4600 	char hash_str[128];
4601 	uint32_t hash;
4602 
4603 	HN_LOCK(sc);
4604 	hash = sc->hn_rss_hash;
4605 	HN_UNLOCK(sc);
4606 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4607 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4608 }
4609 
4610 static int
4611 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4612 {
4613 	struct hn_softc *sc = arg1;
4614 	char hash_str[128];
4615 	uint32_t hash;
4616 
4617 	HN_LOCK(sc);
4618 	hash = sc->hn_rss_hcap;
4619 	HN_UNLOCK(sc);
4620 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4621 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4622 }
4623 
4624 static int
4625 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4626 {
4627 	struct hn_softc *sc = arg1;
4628 	char hash_str[128];
4629 	uint32_t hash;
4630 
4631 	HN_LOCK(sc);
4632 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4633 	HN_UNLOCK(sc);
4634 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4635 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4636 }
4637 
4638 static int
4639 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4640 {
4641 	struct hn_softc *sc = arg1;
4642 	char vf_name[IFNAMSIZ + 1];
4643 	struct ifnet *vf_ifp;
4644 
4645 	HN_LOCK(sc);
4646 	vf_name[0] = '\0';
4647 	vf_ifp = sc->hn_vf_ifp;
4648 	if (vf_ifp != NULL)
4649 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4650 	HN_UNLOCK(sc);
4651 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4652 }
4653 
4654 static int
4655 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4656 {
4657 	struct hn_softc *sc = arg1;
4658 	char vf_name[IFNAMSIZ + 1];
4659 	struct ifnet *vf_ifp;
4660 
4661 	HN_LOCK(sc);
4662 	vf_name[0] = '\0';
4663 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4664 	if (vf_ifp != NULL)
4665 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4666 	HN_UNLOCK(sc);
4667 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4668 }
4669 
4670 static int
4671 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4672 {
4673 	struct rm_priotracker pt;
4674 	struct sbuf *sb;
4675 	int error, i;
4676 	bool first;
4677 
4678 	error = sysctl_wire_old_buffer(req, 0);
4679 	if (error != 0)
4680 		return (error);
4681 
4682 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4683 	if (sb == NULL)
4684 		return (ENOMEM);
4685 
4686 	rm_rlock(&hn_vfmap_lock, &pt);
4687 
4688 	first = true;
4689 	for (i = 0; i < hn_vfmap_size; ++i) {
4690 		struct ifnet *ifp;
4691 
4692 		if (hn_vfmap[i] == NULL)
4693 			continue;
4694 
4695 		ifp = ifnet_byindex(i);
4696 		if (ifp != NULL) {
4697 			if (first)
4698 				sbuf_printf(sb, "%s", ifp->if_xname);
4699 			else
4700 				sbuf_printf(sb, " %s", ifp->if_xname);
4701 			first = false;
4702 		}
4703 	}
4704 
4705 	rm_runlock(&hn_vfmap_lock, &pt);
4706 
4707 	error = sbuf_finish(sb);
4708 	sbuf_delete(sb);
4709 	return (error);
4710 }
4711 
4712 static int
4713 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4714 {
4715 	struct rm_priotracker pt;
4716 	struct sbuf *sb;
4717 	int error, i;
4718 	bool first;
4719 
4720 	error = sysctl_wire_old_buffer(req, 0);
4721 	if (error != 0)
4722 		return (error);
4723 
4724 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4725 	if (sb == NULL)
4726 		return (ENOMEM);
4727 
4728 	rm_rlock(&hn_vfmap_lock, &pt);
4729 
4730 	first = true;
4731 	for (i = 0; i < hn_vfmap_size; ++i) {
4732 		struct ifnet *ifp, *hn_ifp;
4733 
4734 		hn_ifp = hn_vfmap[i];
4735 		if (hn_ifp == NULL)
4736 			continue;
4737 
4738 		ifp = ifnet_byindex(i);
4739 		if (ifp != NULL) {
4740 			if (first) {
4741 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4742 				    hn_ifp->if_xname);
4743 			} else {
4744 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4745 				    hn_ifp->if_xname);
4746 			}
4747 			first = false;
4748 		}
4749 	}
4750 
4751 	rm_runlock(&hn_vfmap_lock, &pt);
4752 
4753 	error = sbuf_finish(sb);
4754 	sbuf_delete(sb);
4755 	return (error);
4756 }
4757 
4758 static int
4759 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4760 {
4761 	struct hn_softc *sc = arg1;
4762 	int error, onoff = 0;
4763 
4764 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4765 		onoff = 1;
4766 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4767 	if (error || req->newptr == NULL)
4768 		return (error);
4769 
4770 	HN_LOCK(sc);
4771 	/* NOTE: hn_vf_lock for hn_transmit() */
4772 	rm_wlock(&sc->hn_vf_lock);
4773 	if (onoff)
4774 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4775 	else
4776 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4777 	rm_wunlock(&sc->hn_vf_lock);
4778 	HN_UNLOCK(sc);
4779 
4780 	return (0);
4781 }
4782 
4783 static int
4784 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4785 {
4786 	struct hn_softc *sc = arg1;
4787 	int enabled = 0;
4788 
4789 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4790 		enabled = 1;
4791 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4792 }
4793 
4794 static int
4795 hn_check_iplen(const struct mbuf *m, int hoff)
4796 {
4797 	const struct ip *ip;
4798 	int len, iphlen, iplen;
4799 	const struct tcphdr *th;
4800 	int thoff;				/* TCP data offset */
4801 
4802 	len = hoff + sizeof(struct ip);
4803 
4804 	/* The packet must be at least the size of an IP header. */
4805 	if (m->m_pkthdr.len < len)
4806 		return IPPROTO_DONE;
4807 
4808 	/* The fixed IP header must reside completely in the first mbuf. */
4809 	if (m->m_len < len)
4810 		return IPPROTO_DONE;
4811 
4812 	ip = mtodo(m, hoff);
4813 
4814 	/* Bound check the packet's stated IP header length. */
4815 	iphlen = ip->ip_hl << 2;
4816 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4817 		return IPPROTO_DONE;
4818 
4819 	/* The full IP header must reside completely in the one mbuf. */
4820 	if (m->m_len < hoff + iphlen)
4821 		return IPPROTO_DONE;
4822 
4823 	iplen = ntohs(ip->ip_len);
4824 
4825 	/*
4826 	 * Check that the amount of data in the buffers is as
4827 	 * at least much as the IP header would have us expect.
4828 	 */
4829 	if (m->m_pkthdr.len < hoff + iplen)
4830 		return IPPROTO_DONE;
4831 
4832 	/*
4833 	 * Ignore IP fragments.
4834 	 */
4835 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4836 		return IPPROTO_DONE;
4837 
4838 	/*
4839 	 * The TCP/IP or UDP/IP header must be entirely contained within
4840 	 * the first fragment of a packet.
4841 	 */
4842 	switch (ip->ip_p) {
4843 	case IPPROTO_TCP:
4844 		if (iplen < iphlen + sizeof(struct tcphdr))
4845 			return IPPROTO_DONE;
4846 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4847 			return IPPROTO_DONE;
4848 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4849 		thoff = th->th_off << 2;
4850 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4851 			return IPPROTO_DONE;
4852 		if (m->m_len < hoff + iphlen + thoff)
4853 			return IPPROTO_DONE;
4854 		break;
4855 	case IPPROTO_UDP:
4856 		if (iplen < iphlen + sizeof(struct udphdr))
4857 			return IPPROTO_DONE;
4858 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4859 			return IPPROTO_DONE;
4860 		break;
4861 	default:
4862 		if (iplen < iphlen)
4863 			return IPPROTO_DONE;
4864 		break;
4865 	}
4866 	return ip->ip_p;
4867 }
4868 
4869 static void
4870 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4871 {
4872 	const struct ether_header *eh;
4873 	uint16_t etype;
4874 	int hoff;
4875 
4876 	hoff = sizeof(*eh);
4877 	/* Checked at the beginning of this function. */
4878 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4879 
4880 	eh = mtod(m_new, const struct ether_header *);
4881 	etype = ntohs(eh->ether_type);
4882 	if (etype == ETHERTYPE_VLAN) {
4883 		const struct ether_vlan_header *evl;
4884 
4885 		hoff = sizeof(*evl);
4886 		if (m_new->m_len < hoff)
4887 			return;
4888 		evl = mtod(m_new, const struct ether_vlan_header *);
4889 		etype = ntohs(evl->evl_proto);
4890 	}
4891 	*l3proto = etype;
4892 
4893 	if (etype == ETHERTYPE_IP)
4894 		*l4proto = hn_check_iplen(m_new, hoff);
4895 	else
4896 		*l4proto = IPPROTO_DONE;
4897 }
4898 
4899 static int
4900 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4901 {
4902 	struct sysctl_oid_list *child;
4903 	struct sysctl_ctx_list *ctx;
4904 	device_t dev = sc->hn_dev;
4905 #if defined(INET) || defined(INET6)
4906 #if __FreeBSD_version >= 1100095
4907 	int lroent_cnt;
4908 #endif
4909 #endif
4910 	int i;
4911 
4912 	/*
4913 	 * Create RXBUF for reception.
4914 	 *
4915 	 * NOTE:
4916 	 * - It is shared by all channels.
4917 	 * - A large enough buffer is allocated, certain version of NVSes
4918 	 *   may further limit the usable space.
4919 	 */
4920 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4921 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4922 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4923 	if (sc->hn_rxbuf == NULL) {
4924 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4925 		return (ENOMEM);
4926 	}
4927 
4928 	sc->hn_rx_ring_cnt = ring_cnt;
4929 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4930 
4931 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4932 	    M_DEVBUF, M_WAITOK | M_ZERO);
4933 
4934 #if defined(INET) || defined(INET6)
4935 #if __FreeBSD_version >= 1100095
4936 	lroent_cnt = hn_lro_entry_count;
4937 	if (lroent_cnt < TCP_LRO_ENTRIES)
4938 		lroent_cnt = TCP_LRO_ENTRIES;
4939 	if (bootverbose)
4940 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4941 #endif
4942 #endif	/* INET || INET6 */
4943 
4944 	ctx = device_get_sysctl_ctx(dev);
4945 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4946 
4947 	/* Create dev.hn.UNIT.rx sysctl tree */
4948 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4949 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4950 
4951 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4952 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4953 
4954 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4955 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4956 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4957 		if (rxr->hn_br == NULL) {
4958 			device_printf(dev, "allocate bufring failed\n");
4959 			return (ENOMEM);
4960 		}
4961 
4962 		if (hn_trust_hosttcp)
4963 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4964 		if (hn_trust_hostudp)
4965 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4966 		if (hn_trust_hostip)
4967 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4968 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4969 		rxr->hn_ifp = sc->hn_ifp;
4970 		if (i < sc->hn_tx_ring_cnt)
4971 			rxr->hn_txr = &sc->hn_tx_ring[i];
4972 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4973 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4974 		rxr->hn_rx_idx = i;
4975 		rxr->hn_rxbuf = sc->hn_rxbuf;
4976 
4977 		/*
4978 		 * Initialize LRO.
4979 		 */
4980 #if defined(INET) || defined(INET6)
4981 #if __FreeBSD_version >= 1100095
4982 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4983 		    hn_lro_mbufq_depth);
4984 #else
4985 		tcp_lro_init(&rxr->hn_lro);
4986 		rxr->hn_lro.ifp = sc->hn_ifp;
4987 #endif
4988 #if __FreeBSD_version >= 1100099
4989 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4990 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4991 #endif
4992 #endif	/* INET || INET6 */
4993 
4994 		if (sc->hn_rx_sysctl_tree != NULL) {
4995 			char name[16];
4996 
4997 			/*
4998 			 * Create per RX ring sysctl tree:
4999 			 * dev.hn.UNIT.rx.RINGID
5000 			 */
5001 			snprintf(name, sizeof(name), "%d", i);
5002 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5003 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5004 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5005 
5006 			if (rxr->hn_rx_sysctl_tree != NULL) {
5007 				SYSCTL_ADD_ULONG(ctx,
5008 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5009 				    OID_AUTO, "packets", CTLFLAG_RW,
5010 				    &rxr->hn_pkts, "# of packets received");
5011 				SYSCTL_ADD_ULONG(ctx,
5012 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5013 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5014 				    &rxr->hn_rss_pkts,
5015 				    "# of packets w/ RSS info received");
5016 				SYSCTL_ADD_INT(ctx,
5017 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5018 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5019 				    &rxr->hn_pktbuf_len, 0,
5020 				    "Temporary channel packet buffer length");
5021 			}
5022 		}
5023 	}
5024 
5025 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5026 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5027 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5028 #if __FreeBSD_version < 1100095
5029 	    hn_rx_stat_int_sysctl,
5030 #else
5031 	    hn_rx_stat_u64_sysctl,
5032 #endif
5033 	    "LU", "LRO queued");
5034 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5035 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5036 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5037 #if __FreeBSD_version < 1100095
5038 	    hn_rx_stat_int_sysctl,
5039 #else
5040 	    hn_rx_stat_u64_sysctl,
5041 #endif
5042 	    "LU", "LRO flushed");
5043 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5044 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5045 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5046 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5047 #if __FreeBSD_version >= 1100099
5048 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5049 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5050 	    hn_lro_lenlim_sysctl, "IU",
5051 	    "Max # of data bytes to be aggregated by LRO");
5052 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5053 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5054 	    hn_lro_ackcnt_sysctl, "I",
5055 	    "Max # of ACKs to be aggregated by LRO");
5056 #endif
5057 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5058 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5059 	    hn_trust_hcsum_sysctl, "I",
5060 	    "Trust tcp segement verification on host side, "
5061 	    "when csum info is missing");
5062 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5063 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5064 	    hn_trust_hcsum_sysctl, "I",
5065 	    "Trust udp datagram verification on host side, "
5066 	    "when csum info is missing");
5067 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5068 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5069 	    hn_trust_hcsum_sysctl, "I",
5070 	    "Trust ip packet verification on host side, "
5071 	    "when csum info is missing");
5072 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5073 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5074 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5075 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5076 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5077 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5078 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5079 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5080 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5081 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5082 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5083 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5084 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5085 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5086 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5087 	    hn_rx_stat_ulong_sysctl, "LU",
5088 	    "# of packets that we trust host's csum verification");
5089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5090 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5091 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5092 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5093 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5094 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5096 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5097 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5098 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5099 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5100 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5101 
5102 	return (0);
5103 }
5104 
5105 static void
5106 hn_destroy_rx_data(struct hn_softc *sc)
5107 {
5108 	int i;
5109 
5110 	if (sc->hn_rxbuf != NULL) {
5111 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5112 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5113 		else
5114 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5115 		sc->hn_rxbuf = NULL;
5116 	}
5117 
5118 	if (sc->hn_rx_ring_cnt == 0)
5119 		return;
5120 
5121 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5122 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5123 
5124 		if (rxr->hn_br == NULL)
5125 			continue;
5126 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5127 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5128 		} else {
5129 			device_printf(sc->hn_dev,
5130 			    "%dth channel bufring is referenced", i);
5131 		}
5132 		rxr->hn_br = NULL;
5133 
5134 #if defined(INET) || defined(INET6)
5135 		tcp_lro_free(&rxr->hn_lro);
5136 #endif
5137 		free(rxr->hn_pktbuf, M_DEVBUF);
5138 	}
5139 	free(sc->hn_rx_ring, M_DEVBUF);
5140 	sc->hn_rx_ring = NULL;
5141 
5142 	sc->hn_rx_ring_cnt = 0;
5143 	sc->hn_rx_ring_inuse = 0;
5144 }
5145 
5146 static int
5147 hn_tx_ring_create(struct hn_softc *sc, int id)
5148 {
5149 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5150 	device_t dev = sc->hn_dev;
5151 	bus_dma_tag_t parent_dtag;
5152 	int error, i;
5153 
5154 	txr->hn_sc = sc;
5155 	txr->hn_tx_idx = id;
5156 
5157 #ifndef HN_USE_TXDESC_BUFRING
5158 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5159 #endif
5160 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5161 
5162 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5163 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5164 	    M_DEVBUF, M_WAITOK | M_ZERO);
5165 #ifndef HN_USE_TXDESC_BUFRING
5166 	SLIST_INIT(&txr->hn_txlist);
5167 #else
5168 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5169 	    M_WAITOK, &txr->hn_tx_lock);
5170 #endif
5171 
5172 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5173 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5174 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5175 	} else {
5176 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5177 	}
5178 
5179 #ifdef HN_IFSTART_SUPPORT
5180 	if (hn_use_if_start) {
5181 		txr->hn_txeof = hn_start_txeof;
5182 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5183 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5184 	} else
5185 #endif
5186 	{
5187 		int br_depth;
5188 
5189 		txr->hn_txeof = hn_xmit_txeof;
5190 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5191 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5192 
5193 		br_depth = hn_get_txswq_depth(txr);
5194 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5195 		    M_WAITOK, &txr->hn_tx_lock);
5196 	}
5197 
5198 	txr->hn_direct_tx_size = hn_direct_tx_size;
5199 
5200 	/*
5201 	 * Always schedule transmission instead of trying to do direct
5202 	 * transmission.  This one gives the best performance so far.
5203 	 */
5204 	txr->hn_sched_tx = 1;
5205 
5206 	parent_dtag = bus_get_dma_tag(dev);
5207 
5208 	/* DMA tag for RNDIS packet messages. */
5209 	error = bus_dma_tag_create(parent_dtag, /* parent */
5210 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5211 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5212 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5213 	    BUS_SPACE_MAXADDR,		/* highaddr */
5214 	    NULL, NULL,			/* filter, filterarg */
5215 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5216 	    1,				/* nsegments */
5217 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5218 	    0,				/* flags */
5219 	    NULL,			/* lockfunc */
5220 	    NULL,			/* lockfuncarg */
5221 	    &txr->hn_tx_rndis_dtag);
5222 	if (error) {
5223 		device_printf(dev, "failed to create rndis dmatag\n");
5224 		return error;
5225 	}
5226 
5227 	/* DMA tag for data. */
5228 	error = bus_dma_tag_create(parent_dtag, /* parent */
5229 	    1,				/* alignment */
5230 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5231 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5232 	    BUS_SPACE_MAXADDR,		/* highaddr */
5233 	    NULL, NULL,			/* filter, filterarg */
5234 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5235 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5236 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5237 	    0,				/* flags */
5238 	    NULL,			/* lockfunc */
5239 	    NULL,			/* lockfuncarg */
5240 	    &txr->hn_tx_data_dtag);
5241 	if (error) {
5242 		device_printf(dev, "failed to create data dmatag\n");
5243 		return error;
5244 	}
5245 
5246 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5247 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5248 
5249 		txd->txr = txr;
5250 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5251 		STAILQ_INIT(&txd->agg_list);
5252 
5253 		/*
5254 		 * Allocate and load RNDIS packet message.
5255 		 */
5256         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5257 		    (void **)&txd->rndis_pkt,
5258 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5259 		    &txd->rndis_pkt_dmap);
5260 		if (error) {
5261 			device_printf(dev,
5262 			    "failed to allocate rndis_packet_msg, %d\n", i);
5263 			return error;
5264 		}
5265 
5266 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5267 		    txd->rndis_pkt_dmap,
5268 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5269 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5270 		    BUS_DMA_NOWAIT);
5271 		if (error) {
5272 			device_printf(dev,
5273 			    "failed to load rndis_packet_msg, %d\n", i);
5274 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5275 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5276 			return error;
5277 		}
5278 
5279 		/* DMA map for TX data. */
5280 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5281 		    &txd->data_dmap);
5282 		if (error) {
5283 			device_printf(dev,
5284 			    "failed to allocate tx data dmamap\n");
5285 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5286 			    txd->rndis_pkt_dmap);
5287 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5288 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5289 			return error;
5290 		}
5291 
5292 		/* All set, put it to list */
5293 		txd->flags |= HN_TXD_FLAG_ONLIST;
5294 #ifndef HN_USE_TXDESC_BUFRING
5295 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5296 #else
5297 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5298 #endif
5299 	}
5300 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5301 
5302 	if (sc->hn_tx_sysctl_tree != NULL) {
5303 		struct sysctl_oid_list *child;
5304 		struct sysctl_ctx_list *ctx;
5305 		char name[16];
5306 
5307 		/*
5308 		 * Create per TX ring sysctl tree:
5309 		 * dev.hn.UNIT.tx.RINGID
5310 		 */
5311 		ctx = device_get_sysctl_ctx(dev);
5312 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5313 
5314 		snprintf(name, sizeof(name), "%d", id);
5315 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5316 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5317 
5318 		if (txr->hn_tx_sysctl_tree != NULL) {
5319 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5320 
5321 #ifdef HN_DEBUG
5322 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5323 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5324 			    "# of available TX descs");
5325 #endif
5326 #ifdef HN_IFSTART_SUPPORT
5327 			if (!hn_use_if_start)
5328 #endif
5329 			{
5330 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5331 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5332 				    "over active");
5333 			}
5334 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5335 			    CTLFLAG_RW, &txr->hn_pkts,
5336 			    "# of packets transmitted");
5337 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5338 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5339 		}
5340 	}
5341 
5342 	return 0;
5343 }
5344 
5345 static void
5346 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5347 {
5348 	struct hn_tx_ring *txr = txd->txr;
5349 
5350 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5351 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5352 
5353 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5354 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5355 	    txd->rndis_pkt_dmap);
5356 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5357 }
5358 
5359 static void
5360 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5361 {
5362 
5363 	KASSERT(txd->refs == 0 || txd->refs == 1,
5364 	    ("invalid txd refs %d", txd->refs));
5365 
5366 	/* Aggregated txds will be freed by their aggregating txd. */
5367 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5368 		int freed;
5369 
5370 		freed = hn_txdesc_put(txr, txd);
5371 		KASSERT(freed, ("can't free txdesc"));
5372 	}
5373 }
5374 
5375 static void
5376 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5377 {
5378 	int i;
5379 
5380 	if (txr->hn_txdesc == NULL)
5381 		return;
5382 
5383 	/*
5384 	 * NOTE:
5385 	 * Because the freeing of aggregated txds will be deferred
5386 	 * to the aggregating txd, two passes are used here:
5387 	 * - The first pass GCes any pending txds.  This GC is necessary,
5388 	 *   since if the channels are revoked, hypervisor will not
5389 	 *   deliver send-done for all pending txds.
5390 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5391 	 *   were freed.
5392 	 */
5393 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5394 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5395 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5396 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5397 
5398 	if (txr->hn_tx_data_dtag != NULL)
5399 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5400 	if (txr->hn_tx_rndis_dtag != NULL)
5401 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5402 
5403 #ifdef HN_USE_TXDESC_BUFRING
5404 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5405 #endif
5406 
5407 	free(txr->hn_txdesc, M_DEVBUF);
5408 	txr->hn_txdesc = NULL;
5409 
5410 	if (txr->hn_mbuf_br != NULL)
5411 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5412 
5413 #ifndef HN_USE_TXDESC_BUFRING
5414 	mtx_destroy(&txr->hn_txlist_spin);
5415 #endif
5416 	mtx_destroy(&txr->hn_tx_lock);
5417 }
5418 
5419 static int
5420 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5421 {
5422 	struct sysctl_oid_list *child;
5423 	struct sysctl_ctx_list *ctx;
5424 	int i;
5425 
5426 	/*
5427 	 * Create TXBUF for chimney sending.
5428 	 *
5429 	 * NOTE: It is shared by all channels.
5430 	 */
5431 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5432 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5433 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5434 	if (sc->hn_chim == NULL) {
5435 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5436 		return (ENOMEM);
5437 	}
5438 
5439 	sc->hn_tx_ring_cnt = ring_cnt;
5440 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5441 
5442 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5443 	    M_DEVBUF, M_WAITOK | M_ZERO);
5444 
5445 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5446 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5447 
5448 	/* Create dev.hn.UNIT.tx sysctl tree */
5449 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5450 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5451 
5452 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5453 		int error;
5454 
5455 		error = hn_tx_ring_create(sc, i);
5456 		if (error)
5457 			return error;
5458 	}
5459 
5460 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5461 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5462 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5463 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5464 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5465 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5466 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5467 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5468 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5469 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5470 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5471 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5472 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5473 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5474 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5475 	    hn_tx_stat_ulong_sysctl, "LU",
5476 	    "# of packet transmission aggregation flush failure");
5477 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5478 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5479 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5480 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5482 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5483 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5484 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5485 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5486 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5487 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5488 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5489 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5490 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5491 	    "# of total TX descs");
5492 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5493 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5494 	    "Chimney send packet size upper boundary");
5495 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5496 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5497 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5498 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5499 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5500 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5501 	    hn_tx_conf_int_sysctl, "I",
5502 	    "Size of the packet for direct transmission");
5503 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5504 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5505 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5506 	    hn_tx_conf_int_sysctl, "I",
5507 	    "Always schedule transmission "
5508 	    "instead of doing direct transmission");
5509 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5510 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5511 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5512 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5513 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5514 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5515 	    "Applied packet transmission aggregation size");
5516 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5517 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5518 	    hn_txagg_pktmax_sysctl, "I",
5519 	    "Applied packet transmission aggregation packets");
5520 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5521 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5522 	    hn_txagg_align_sysctl, "I",
5523 	    "Applied packet transmission aggregation alignment");
5524 
5525 	return 0;
5526 }
5527 
5528 static void
5529 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5530 {
5531 	int i;
5532 
5533 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5534 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5535 }
5536 
5537 static void
5538 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5539 {
5540 	struct ifnet *ifp = sc->hn_ifp;
5541 	u_int hw_tsomax;
5542 	int tso_minlen;
5543 
5544 	HN_LOCK_ASSERT(sc);
5545 
5546 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5547 		return;
5548 
5549 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5550 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5551 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5552 
5553 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5554 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5555 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5556 
5557 	if (tso_maxlen < tso_minlen)
5558 		tso_maxlen = tso_minlen;
5559 	else if (tso_maxlen > IP_MAXPACKET)
5560 		tso_maxlen = IP_MAXPACKET;
5561 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5562 		tso_maxlen = sc->hn_ndis_tso_szmax;
5563 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5564 
5565 	if (hn_xpnt_vf_isready(sc)) {
5566 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5567 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5568 	}
5569 	ifp->if_hw_tsomax = hw_tsomax;
5570 	if (bootverbose)
5571 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5572 }
5573 
5574 static void
5575 hn_fixup_tx_data(struct hn_softc *sc)
5576 {
5577 	uint64_t csum_assist;
5578 	int i;
5579 
5580 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5581 	if (hn_tx_chimney_size > 0 &&
5582 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5583 		hn_set_chim_size(sc, hn_tx_chimney_size);
5584 
5585 	csum_assist = 0;
5586 	if (sc->hn_caps & HN_CAP_IPCS)
5587 		csum_assist |= CSUM_IP;
5588 	if (sc->hn_caps & HN_CAP_TCP4CS)
5589 		csum_assist |= CSUM_IP_TCP;
5590 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5591 		csum_assist |= CSUM_IP_UDP;
5592 	if (sc->hn_caps & HN_CAP_TCP6CS)
5593 		csum_assist |= CSUM_IP6_TCP;
5594 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5595 		csum_assist |= CSUM_IP6_UDP;
5596 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5597 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5598 
5599 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5600 		/*
5601 		 * Support HASHVAL pktinfo on TX path.
5602 		 */
5603 		if (bootverbose)
5604 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5605 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5606 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5607 	}
5608 }
5609 
5610 static void
5611 hn_fixup_rx_data(struct hn_softc *sc)
5612 {
5613 
5614 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5615 		int i;
5616 
5617 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5618 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5619 	}
5620 }
5621 
5622 static void
5623 hn_destroy_tx_data(struct hn_softc *sc)
5624 {
5625 	int i;
5626 
5627 	if (sc->hn_chim != NULL) {
5628 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5629 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5630 		} else {
5631 			device_printf(sc->hn_dev,
5632 			    "chimney sending buffer is referenced");
5633 		}
5634 		sc->hn_chim = NULL;
5635 	}
5636 
5637 	if (sc->hn_tx_ring_cnt == 0)
5638 		return;
5639 
5640 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5641 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5642 
5643 	free(sc->hn_tx_ring, M_DEVBUF);
5644 	sc->hn_tx_ring = NULL;
5645 
5646 	sc->hn_tx_ring_cnt = 0;
5647 	sc->hn_tx_ring_inuse = 0;
5648 }
5649 
5650 #ifdef HN_IFSTART_SUPPORT
5651 
5652 static void
5653 hn_start_taskfunc(void *xtxr, int pending __unused)
5654 {
5655 	struct hn_tx_ring *txr = xtxr;
5656 
5657 	mtx_lock(&txr->hn_tx_lock);
5658 	hn_start_locked(txr, 0);
5659 	mtx_unlock(&txr->hn_tx_lock);
5660 }
5661 
5662 static int
5663 hn_start_locked(struct hn_tx_ring *txr, int len)
5664 {
5665 	struct hn_softc *sc = txr->hn_sc;
5666 	struct ifnet *ifp = sc->hn_ifp;
5667 	int sched = 0;
5668 
5669 	KASSERT(hn_use_if_start,
5670 	    ("hn_start_locked is called, when if_start is disabled"));
5671 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5672 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5673 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5674 
5675 	if (__predict_false(txr->hn_suspended))
5676 		return (0);
5677 
5678 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5679 	    IFF_DRV_RUNNING)
5680 		return (0);
5681 
5682 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5683 		struct hn_txdesc *txd;
5684 		struct mbuf *m_head;
5685 		int error;
5686 
5687 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5688 		if (m_head == NULL)
5689 			break;
5690 
5691 		if (len > 0 && m_head->m_pkthdr.len > len) {
5692 			/*
5693 			 * This sending could be time consuming; let callers
5694 			 * dispatch this packet sending (and sending of any
5695 			 * following up packets) to tx taskqueue.
5696 			 */
5697 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5698 			sched = 1;
5699 			break;
5700 		}
5701 
5702 #if defined(INET6) || defined(INET)
5703 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5704 			m_head = hn_tso_fixup(m_head);
5705 			if (__predict_false(m_head == NULL)) {
5706 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5707 				continue;
5708 			}
5709 		} else if (m_head->m_pkthdr.csum_flags &
5710 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5711 			m_head = hn_set_hlen(m_head);
5712 			if (__predict_false(m_head == NULL)) {
5713 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5714 				continue;
5715 			}
5716 		}
5717 #endif
5718 
5719 		txd = hn_txdesc_get(txr);
5720 		if (txd == NULL) {
5721 			txr->hn_no_txdescs++;
5722 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5723 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5724 			break;
5725 		}
5726 
5727 		error = hn_encap(ifp, txr, txd, &m_head);
5728 		if (error) {
5729 			/* Both txd and m_head are freed */
5730 			KASSERT(txr->hn_agg_txd == NULL,
5731 			    ("encap failed w/ pending aggregating txdesc"));
5732 			continue;
5733 		}
5734 
5735 		if (txr->hn_agg_pktleft == 0) {
5736 			if (txr->hn_agg_txd != NULL) {
5737 				KASSERT(m_head == NULL,
5738 				    ("pending mbuf for aggregating txdesc"));
5739 				error = hn_flush_txagg(ifp, txr);
5740 				if (__predict_false(error)) {
5741 					atomic_set_int(&ifp->if_drv_flags,
5742 					    IFF_DRV_OACTIVE);
5743 					break;
5744 				}
5745 			} else {
5746 				KASSERT(m_head != NULL, ("mbuf was freed"));
5747 				error = hn_txpkt(ifp, txr, txd);
5748 				if (__predict_false(error)) {
5749 					/* txd is freed, but m_head is not */
5750 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5751 					atomic_set_int(&ifp->if_drv_flags,
5752 					    IFF_DRV_OACTIVE);
5753 					break;
5754 				}
5755 			}
5756 		}
5757 #ifdef INVARIANTS
5758 		else {
5759 			KASSERT(txr->hn_agg_txd != NULL,
5760 			    ("no aggregating txdesc"));
5761 			KASSERT(m_head == NULL,
5762 			    ("pending mbuf for aggregating txdesc"));
5763 		}
5764 #endif
5765 	}
5766 
5767 	/* Flush pending aggerated transmission. */
5768 	if (txr->hn_agg_txd != NULL)
5769 		hn_flush_txagg(ifp, txr);
5770 	return (sched);
5771 }
5772 
5773 static void
5774 hn_start(struct ifnet *ifp)
5775 {
5776 	struct hn_softc *sc = ifp->if_softc;
5777 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5778 
5779 	if (txr->hn_sched_tx)
5780 		goto do_sched;
5781 
5782 	if (mtx_trylock(&txr->hn_tx_lock)) {
5783 		int sched;
5784 
5785 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5786 		mtx_unlock(&txr->hn_tx_lock);
5787 		if (!sched)
5788 			return;
5789 	}
5790 do_sched:
5791 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5792 }
5793 
5794 static void
5795 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5796 {
5797 	struct hn_tx_ring *txr = xtxr;
5798 
5799 	mtx_lock(&txr->hn_tx_lock);
5800 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5801 	hn_start_locked(txr, 0);
5802 	mtx_unlock(&txr->hn_tx_lock);
5803 }
5804 
5805 static void
5806 hn_start_txeof(struct hn_tx_ring *txr)
5807 {
5808 	struct hn_softc *sc = txr->hn_sc;
5809 	struct ifnet *ifp = sc->hn_ifp;
5810 
5811 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5812 
5813 	if (txr->hn_sched_tx)
5814 		goto do_sched;
5815 
5816 	if (mtx_trylock(&txr->hn_tx_lock)) {
5817 		int sched;
5818 
5819 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5820 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5821 		mtx_unlock(&txr->hn_tx_lock);
5822 		if (sched) {
5823 			taskqueue_enqueue(txr->hn_tx_taskq,
5824 			    &txr->hn_tx_task);
5825 		}
5826 	} else {
5827 do_sched:
5828 		/*
5829 		 * Release the OACTIVE earlier, with the hope, that
5830 		 * others could catch up.  The task will clear the
5831 		 * flag again with the hn_tx_lock to avoid possible
5832 		 * races.
5833 		 */
5834 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5835 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5836 	}
5837 }
5838 
5839 #endif	/* HN_IFSTART_SUPPORT */
5840 
5841 static int
5842 hn_xmit(struct hn_tx_ring *txr, int len)
5843 {
5844 	struct hn_softc *sc = txr->hn_sc;
5845 	struct ifnet *ifp = sc->hn_ifp;
5846 	struct mbuf *m_head;
5847 	int sched = 0;
5848 
5849 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5850 #ifdef HN_IFSTART_SUPPORT
5851 	KASSERT(hn_use_if_start == 0,
5852 	    ("hn_xmit is called, when if_start is enabled"));
5853 #endif
5854 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5855 
5856 	if (__predict_false(txr->hn_suspended))
5857 		return (0);
5858 
5859 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5860 		return (0);
5861 
5862 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5863 		struct hn_txdesc *txd;
5864 		int error;
5865 
5866 		if (len > 0 && m_head->m_pkthdr.len > len) {
5867 			/*
5868 			 * This sending could be time consuming; let callers
5869 			 * dispatch this packet sending (and sending of any
5870 			 * following up packets) to tx taskqueue.
5871 			 */
5872 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5873 			sched = 1;
5874 			break;
5875 		}
5876 
5877 		txd = hn_txdesc_get(txr);
5878 		if (txd == NULL) {
5879 			txr->hn_no_txdescs++;
5880 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5881 			txr->hn_oactive = 1;
5882 			break;
5883 		}
5884 
5885 		error = hn_encap(ifp, txr, txd, &m_head);
5886 		if (error) {
5887 			/* Both txd and m_head are freed; discard */
5888 			KASSERT(txr->hn_agg_txd == NULL,
5889 			    ("encap failed w/ pending aggregating txdesc"));
5890 			drbr_advance(ifp, txr->hn_mbuf_br);
5891 			continue;
5892 		}
5893 
5894 		if (txr->hn_agg_pktleft == 0) {
5895 			if (txr->hn_agg_txd != NULL) {
5896 				KASSERT(m_head == NULL,
5897 				    ("pending mbuf for aggregating txdesc"));
5898 				error = hn_flush_txagg(ifp, txr);
5899 				if (__predict_false(error)) {
5900 					txr->hn_oactive = 1;
5901 					break;
5902 				}
5903 			} else {
5904 				KASSERT(m_head != NULL, ("mbuf was freed"));
5905 				error = hn_txpkt(ifp, txr, txd);
5906 				if (__predict_false(error)) {
5907 					/* txd is freed, but m_head is not */
5908 					drbr_putback(ifp, txr->hn_mbuf_br,
5909 					    m_head);
5910 					txr->hn_oactive = 1;
5911 					break;
5912 				}
5913 			}
5914 		}
5915 #ifdef INVARIANTS
5916 		else {
5917 			KASSERT(txr->hn_agg_txd != NULL,
5918 			    ("no aggregating txdesc"));
5919 			KASSERT(m_head == NULL,
5920 			    ("pending mbuf for aggregating txdesc"));
5921 		}
5922 #endif
5923 
5924 		/* Sent */
5925 		drbr_advance(ifp, txr->hn_mbuf_br);
5926 	}
5927 
5928 	/* Flush pending aggerated transmission. */
5929 	if (txr->hn_agg_txd != NULL)
5930 		hn_flush_txagg(ifp, txr);
5931 	return (sched);
5932 }
5933 
5934 static int
5935 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5936 {
5937 	struct hn_softc *sc = ifp->if_softc;
5938 	struct hn_tx_ring *txr;
5939 	int error, idx = 0;
5940 
5941 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5942 		struct rm_priotracker pt;
5943 
5944 		rm_rlock(&sc->hn_vf_lock, &pt);
5945 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5946 			struct mbuf *m_bpf = NULL;
5947 			int obytes, omcast;
5948 
5949 			obytes = m->m_pkthdr.len;
5950 			omcast = (m->m_flags & M_MCAST) != 0;
5951 
5952 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5953 				if (bpf_peers_present(ifp->if_bpf)) {
5954 					m_bpf = m_copypacket(m, M_NOWAIT);
5955 					if (m_bpf == NULL) {
5956 						/*
5957 						 * Failed to grab a shallow
5958 						 * copy; tap now.
5959 						 */
5960 						ETHER_BPF_MTAP(ifp, m);
5961 					}
5962 				}
5963 			} else {
5964 				ETHER_BPF_MTAP(ifp, m);
5965 			}
5966 
5967 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5968 			rm_runlock(&sc->hn_vf_lock, &pt);
5969 
5970 			if (m_bpf != NULL) {
5971 				if (!error)
5972 					ETHER_BPF_MTAP(ifp, m_bpf);
5973 				m_freem(m_bpf);
5974 			}
5975 
5976 			if (error == ENOBUFS) {
5977 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5978 			} else if (error) {
5979 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5980 			} else {
5981 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5982 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5983 				if (omcast) {
5984 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5985 					    omcast);
5986 				}
5987 			}
5988 			return (error);
5989 		}
5990 		rm_runlock(&sc->hn_vf_lock, &pt);
5991 	}
5992 
5993 #if defined(INET6) || defined(INET)
5994 	/*
5995 	 * Perform TSO packet header fixup or get l2/l3 header length now,
5996 	 * since packet headers should be cache-hot.
5997 	 */
5998 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5999 		m = hn_tso_fixup(m);
6000 		if (__predict_false(m == NULL)) {
6001 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6002 			return EIO;
6003 		}
6004 	} else if (m->m_pkthdr.csum_flags &
6005 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6006 		m = hn_set_hlen(m);
6007 		if (__predict_false(m == NULL)) {
6008 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6009 			return EIO;
6010 		}
6011 	}
6012 #endif
6013 
6014 	/*
6015 	 * Select the TX ring based on flowid
6016 	 */
6017 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6018 #ifdef RSS
6019 		uint32_t bid;
6020 
6021 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6022 		    &bid) == 0)
6023 			idx = bid % sc->hn_tx_ring_inuse;
6024 		else
6025 #endif
6026 		{
6027 #if defined(INET6) || defined(INET)
6028 			int tcpsyn = 0;
6029 
6030 			if (m->m_pkthdr.len < 128 &&
6031 			    (m->m_pkthdr.csum_flags &
6032 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6033 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6034 				m = hn_check_tcpsyn(m, &tcpsyn);
6035 				if (__predict_false(m == NULL)) {
6036 					if_inc_counter(ifp,
6037 					    IFCOUNTER_OERRORS, 1);
6038 					return (EIO);
6039 				}
6040 			}
6041 #else
6042 			const int tcpsyn = 0;
6043 #endif
6044 			if (tcpsyn)
6045 				idx = 0;
6046 			else
6047 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6048 		}
6049 	}
6050 	txr = &sc->hn_tx_ring[idx];
6051 
6052 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6053 	if (error) {
6054 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6055 		return error;
6056 	}
6057 
6058 	if (txr->hn_oactive)
6059 		return 0;
6060 
6061 	if (txr->hn_sched_tx)
6062 		goto do_sched;
6063 
6064 	if (mtx_trylock(&txr->hn_tx_lock)) {
6065 		int sched;
6066 
6067 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6068 		mtx_unlock(&txr->hn_tx_lock);
6069 		if (!sched)
6070 			return 0;
6071 	}
6072 do_sched:
6073 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6074 	return 0;
6075 }
6076 
6077 static void
6078 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6079 {
6080 	struct mbuf *m;
6081 
6082 	mtx_lock(&txr->hn_tx_lock);
6083 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6084 		m_freem(m);
6085 	mtx_unlock(&txr->hn_tx_lock);
6086 }
6087 
6088 static void
6089 hn_xmit_qflush(struct ifnet *ifp)
6090 {
6091 	struct hn_softc *sc = ifp->if_softc;
6092 	struct rm_priotracker pt;
6093 	int i;
6094 
6095 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6096 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6097 	if_qflush(ifp);
6098 
6099 	rm_rlock(&sc->hn_vf_lock, &pt);
6100 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6101 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6102 	rm_runlock(&sc->hn_vf_lock, &pt);
6103 }
6104 
6105 static void
6106 hn_xmit_txeof(struct hn_tx_ring *txr)
6107 {
6108 
6109 	if (txr->hn_sched_tx)
6110 		goto do_sched;
6111 
6112 	if (mtx_trylock(&txr->hn_tx_lock)) {
6113 		int sched;
6114 
6115 		txr->hn_oactive = 0;
6116 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6117 		mtx_unlock(&txr->hn_tx_lock);
6118 		if (sched) {
6119 			taskqueue_enqueue(txr->hn_tx_taskq,
6120 			    &txr->hn_tx_task);
6121 		}
6122 	} else {
6123 do_sched:
6124 		/*
6125 		 * Release the oactive earlier, with the hope, that
6126 		 * others could catch up.  The task will clear the
6127 		 * oactive again with the hn_tx_lock to avoid possible
6128 		 * races.
6129 		 */
6130 		txr->hn_oactive = 0;
6131 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6132 	}
6133 }
6134 
6135 static void
6136 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6137 {
6138 	struct hn_tx_ring *txr = xtxr;
6139 
6140 	mtx_lock(&txr->hn_tx_lock);
6141 	hn_xmit(txr, 0);
6142 	mtx_unlock(&txr->hn_tx_lock);
6143 }
6144 
6145 static void
6146 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6147 {
6148 	struct hn_tx_ring *txr = xtxr;
6149 
6150 	mtx_lock(&txr->hn_tx_lock);
6151 	txr->hn_oactive = 0;
6152 	hn_xmit(txr, 0);
6153 	mtx_unlock(&txr->hn_tx_lock);
6154 }
6155 
6156 static int
6157 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6158 {
6159 	struct vmbus_chan_br cbr;
6160 	struct hn_rx_ring *rxr;
6161 	struct hn_tx_ring *txr = NULL;
6162 	int idx, error;
6163 
6164 	idx = vmbus_chan_subidx(chan);
6165 
6166 	/*
6167 	 * Link this channel to RX/TX ring.
6168 	 */
6169 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6170 	    ("invalid channel index %d, should > 0 && < %d",
6171 	     idx, sc->hn_rx_ring_inuse));
6172 	rxr = &sc->hn_rx_ring[idx];
6173 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6174 	    ("RX ring %d already attached", idx));
6175 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6176 	rxr->hn_chan = chan;
6177 
6178 	if (bootverbose) {
6179 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6180 		    idx, vmbus_chan_id(chan));
6181 	}
6182 
6183 	if (idx < sc->hn_tx_ring_inuse) {
6184 		txr = &sc->hn_tx_ring[idx];
6185 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6186 		    ("TX ring %d already attached", idx));
6187 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6188 
6189 		txr->hn_chan = chan;
6190 		if (bootverbose) {
6191 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6192 			    idx, vmbus_chan_id(chan));
6193 		}
6194 	}
6195 
6196 	/* Bind this channel to a proper CPU. */
6197 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6198 
6199 	/*
6200 	 * Open this channel
6201 	 */
6202 	cbr.cbr = rxr->hn_br;
6203 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6204 	cbr.cbr_txsz = HN_TXBR_SIZE;
6205 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6206 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6207 	if (error) {
6208 		if (error == EISCONN) {
6209 			if_printf(sc->hn_ifp, "bufring is connected after "
6210 			    "chan%u open failure\n", vmbus_chan_id(chan));
6211 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6212 		} else {
6213 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6214 			    vmbus_chan_id(chan), error);
6215 		}
6216 	}
6217 	return (error);
6218 }
6219 
6220 static void
6221 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6222 {
6223 	struct hn_rx_ring *rxr;
6224 	int idx, error;
6225 
6226 	idx = vmbus_chan_subidx(chan);
6227 
6228 	/*
6229 	 * Link this channel to RX/TX ring.
6230 	 */
6231 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6232 	    ("invalid channel index %d, should > 0 && < %d",
6233 	     idx, sc->hn_rx_ring_inuse));
6234 	rxr = &sc->hn_rx_ring[idx];
6235 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6236 	    ("RX ring %d is not attached", idx));
6237 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6238 
6239 	if (idx < sc->hn_tx_ring_inuse) {
6240 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6241 
6242 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6243 		    ("TX ring %d is not attached attached", idx));
6244 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6245 	}
6246 
6247 	/*
6248 	 * Close this channel.
6249 	 *
6250 	 * NOTE:
6251 	 * Channel closing does _not_ destroy the target channel.
6252 	 */
6253 	error = vmbus_chan_close_direct(chan);
6254 	if (error == EISCONN) {
6255 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6256 		    "after being closed\n", vmbus_chan_id(chan));
6257 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6258 	} else if (error) {
6259 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6260 		    vmbus_chan_id(chan), error);
6261 	}
6262 }
6263 
6264 static int
6265 hn_attach_subchans(struct hn_softc *sc)
6266 {
6267 	struct vmbus_channel **subchans;
6268 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6269 	int i, error = 0;
6270 
6271 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6272 
6273 	/* Attach the sub-channels. */
6274 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6275 	for (i = 0; i < subchan_cnt; ++i) {
6276 		int error1;
6277 
6278 		error1 = hn_chan_attach(sc, subchans[i]);
6279 		if (error1) {
6280 			error = error1;
6281 			/* Move on; all channels will be detached later. */
6282 		}
6283 	}
6284 	vmbus_subchan_rel(subchans, subchan_cnt);
6285 
6286 	if (error) {
6287 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6288 	} else {
6289 		if (bootverbose) {
6290 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6291 			    subchan_cnt);
6292 		}
6293 	}
6294 	return (error);
6295 }
6296 
6297 static void
6298 hn_detach_allchans(struct hn_softc *sc)
6299 {
6300 	struct vmbus_channel **subchans;
6301 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6302 	int i;
6303 
6304 	if (subchan_cnt == 0)
6305 		goto back;
6306 
6307 	/* Detach the sub-channels. */
6308 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6309 	for (i = 0; i < subchan_cnt; ++i)
6310 		hn_chan_detach(sc, subchans[i]);
6311 	vmbus_subchan_rel(subchans, subchan_cnt);
6312 
6313 back:
6314 	/*
6315 	 * Detach the primary channel, _after_ all sub-channels
6316 	 * are detached.
6317 	 */
6318 	hn_chan_detach(sc, sc->hn_prichan);
6319 
6320 	/* Wait for sub-channels to be destroyed, if any. */
6321 	vmbus_subchan_drain(sc->hn_prichan);
6322 
6323 #ifdef INVARIANTS
6324 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6325 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6326 		    HN_RX_FLAG_ATTACHED) == 0,
6327 		    ("%dth RX ring is still attached", i));
6328 	}
6329 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6330 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6331 		    HN_TX_FLAG_ATTACHED) == 0,
6332 		    ("%dth TX ring is still attached", i));
6333 	}
6334 #endif
6335 }
6336 
6337 static int
6338 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6339 {
6340 	struct vmbus_channel **subchans;
6341 	int nchan, rxr_cnt, error;
6342 
6343 	nchan = *nsubch + 1;
6344 	if (nchan == 1) {
6345 		/*
6346 		 * Multiple RX/TX rings are not requested.
6347 		 */
6348 		*nsubch = 0;
6349 		return (0);
6350 	}
6351 
6352 	/*
6353 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6354 	 * table entries.
6355 	 */
6356 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6357 	if (error) {
6358 		/* No RSS; this is benign. */
6359 		*nsubch = 0;
6360 		return (0);
6361 	}
6362 	if (bootverbose) {
6363 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6364 		    rxr_cnt, nchan);
6365 	}
6366 
6367 	if (nchan > rxr_cnt)
6368 		nchan = rxr_cnt;
6369 	if (nchan == 1) {
6370 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6371 		*nsubch = 0;
6372 		return (0);
6373 	}
6374 
6375 	/*
6376 	 * Allocate sub-channels from NVS.
6377 	 */
6378 	*nsubch = nchan - 1;
6379 	error = hn_nvs_alloc_subchans(sc, nsubch);
6380 	if (error || *nsubch == 0) {
6381 		/* Failed to allocate sub-channels. */
6382 		*nsubch = 0;
6383 		return (0);
6384 	}
6385 
6386 	/*
6387 	 * Wait for all sub-channels to become ready before moving on.
6388 	 */
6389 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6390 	vmbus_subchan_rel(subchans, *nsubch);
6391 	return (0);
6392 }
6393 
6394 static bool
6395 hn_synth_attachable(const struct hn_softc *sc)
6396 {
6397 	int i;
6398 
6399 	if (sc->hn_flags & HN_FLAG_ERRORS)
6400 		return (false);
6401 
6402 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6403 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6404 
6405 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6406 			return (false);
6407 	}
6408 	return (true);
6409 }
6410 
/*
 * Force the RX filter to zero after a successful RNDIS initialization.
 *
 * NOTE:
 * On certain versions of Hyper-V, under certain conditions, the RNDIS
 * rxfilter on the hypervisor side is _not_ zero once RNDIS has been
 * initialized successfully.  That violates the RNDIS API contract and
 * breaks the assumptions of the code that follows.  So the rxfilter is
 * cleared explicitly, then any packets that sneaked through, and the
 * interrupt taskqueues scheduled because of them, are drained on the
 * given number of channels.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	/* Clear the rxfilter first, then drain what got through. */
	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
6431 
6432 static int
6433 hn_synth_attach(struct hn_softc *sc, int mtu)
6434 {
6435 #define ATTACHED_NVS		0x0002
6436 #define ATTACHED_RNDIS		0x0004
6437 
6438 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6439 	int error, nsubch, nchan = 1, i, rndis_inited;
6440 	uint32_t old_caps, attached = 0;
6441 
6442 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6443 	    ("synthetic parts were attached"));
6444 
6445 	if (!hn_synth_attachable(sc))
6446 		return (ENXIO);
6447 
6448 	/* Save capabilities for later verification. */
6449 	old_caps = sc->hn_caps;
6450 	sc->hn_caps = 0;
6451 
6452 	/* Clear RSS stuffs. */
6453 	sc->hn_rss_ind_size = 0;
6454 	sc->hn_rss_hash = 0;
6455 	sc->hn_rss_hcap = 0;
6456 
6457 	/*
6458 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6459 	 */
6460 	error = hn_chan_attach(sc, sc->hn_prichan);
6461 	if (error)
6462 		goto failed;
6463 
6464 	/*
6465 	 * Attach NVS.
6466 	 */
6467 	error = hn_nvs_attach(sc, mtu);
6468 	if (error)
6469 		goto failed;
6470 	attached |= ATTACHED_NVS;
6471 
6472 	/*
6473 	 * Attach RNDIS _after_ NVS is attached.
6474 	 */
6475 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6476 	if (rndis_inited)
6477 		attached |= ATTACHED_RNDIS;
6478 	if (error)
6479 		goto failed;
6480 
6481 	/*
6482 	 * Make sure capabilities are not changed.
6483 	 */
6484 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6485 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6486 		    old_caps, sc->hn_caps);
6487 		error = ENXIO;
6488 		goto failed;
6489 	}
6490 
6491 	/*
6492 	 * Allocate sub-channels for multi-TX/RX rings.
6493 	 *
6494 	 * NOTE:
6495 	 * The # of RX rings that can be used is equivalent to the # of
6496 	 * channels to be requested.
6497 	 */
6498 	nsubch = sc->hn_rx_ring_cnt - 1;
6499 	error = hn_synth_alloc_subchans(sc, &nsubch);
6500 	if (error)
6501 		goto failed;
6502 	/* NOTE: _Full_ synthetic parts detach is required now. */
6503 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6504 
6505 	/*
6506 	 * Set the # of TX/RX rings that could be used according to
6507 	 * the # of channels that NVS offered.
6508 	 */
6509 	nchan = nsubch + 1;
6510 	hn_set_ring_inuse(sc, nchan);
6511 	if (nchan == 1) {
6512 		/* Only the primary channel can be used; done */
6513 		goto back;
6514 	}
6515 
6516 	/*
6517 	 * Attach the sub-channels.
6518 	 *
6519 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6520 	 */
6521 	error = hn_attach_subchans(sc);
6522 	if (error)
6523 		goto failed;
6524 
6525 	/*
6526 	 * Configure RSS key and indirect table _after_ all sub-channels
6527 	 * are attached.
6528 	 */
6529 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6530 		/*
6531 		 * RSS key is not set yet; set it to the default RSS key.
6532 		 */
6533 		if (bootverbose)
6534 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6535 #ifdef RSS
6536 		rss_getkey(rss->rss_key);
6537 #else
6538 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6539 #endif
6540 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6541 	}
6542 
6543 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6544 		/*
6545 		 * RSS indirect table is not set yet; set it up in round-
6546 		 * robin fashion.
6547 		 */
6548 		if (bootverbose) {
6549 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6550 			    "table\n");
6551 		}
6552 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6553 			uint32_t subidx;
6554 
6555 #ifdef RSS
6556 			subidx = rss_get_indirection_to_bucket(i);
6557 #else
6558 			subidx = i;
6559 #endif
6560 			rss->rss_ind[i] = subidx % nchan;
6561 		}
6562 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6563 	} else {
6564 		/*
6565 		 * # of usable channels may be changed, so we have to
6566 		 * make sure that all entries in RSS indirect table
6567 		 * are valid.
6568 		 *
6569 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6570 		 */
6571 		hn_rss_ind_fixup(sc);
6572 	}
6573 
6574 	sc->hn_rss_hash = sc->hn_rss_hcap;
6575 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6576 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6577 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6578 		hn_vf_rss_fixup(sc, false);
6579 	}
6580 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6581 	if (error)
6582 		goto failed;
6583 back:
6584 	/*
6585 	 * Fixup transmission aggregation setup.
6586 	 */
6587 	hn_set_txagg(sc);
6588 	hn_rndis_init_fixat(sc, nchan);
6589 	return (0);
6590 
6591 failed:
6592 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6593 		hn_rndis_init_fixat(sc, nchan);
6594 		hn_synth_detach(sc);
6595 	} else {
6596 		if (attached & ATTACHED_RNDIS) {
6597 			hn_rndis_init_fixat(sc, nchan);
6598 			hn_rndis_detach(sc);
6599 		}
6600 		if (attached & ATTACHED_NVS)
6601 			hn_nvs_detach(sc);
6602 		hn_chan_detach(sc, sc->hn_prichan);
6603 		/* Restore old capabilities. */
6604 		sc->hn_caps = old_caps;
6605 	}
6606 	return (error);
6607 
6608 #undef ATTACHED_RNDIS
6609 #undef ATTACHED_NVS
6610 }
6611 
6612 /*
6613  * NOTE:
6614  * The interface must have been suspended though hn_suspend(), before
6615  * this function get called.
6616  */
6617 static void
6618 hn_synth_detach(struct hn_softc *sc)
6619 {
6620 
6621 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6622 	    ("synthetic parts were not attached"));
6623 
6624 	/* Detach the RNDIS first. */
6625 	hn_rndis_detach(sc);
6626 
6627 	/* Detach NVS. */
6628 	hn_nvs_detach(sc);
6629 
6630 	/* Detach all of the channels. */
6631 	hn_detach_allchans(sc);
6632 
6633 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6634 		/*
6635 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6636 		 */
6637 		int error;
6638 
6639 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6640 		    sc->hn_rxbuf_gpadl);
6641 		if (error) {
6642 			if_printf(sc->hn_ifp,
6643 			    "rxbuf gpadl disconn failed: %d\n", error);
6644 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6645 		}
6646 		sc->hn_rxbuf_gpadl = 0;
6647 	}
6648 
6649 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6650 		/*
6651 		 * Host is post-Win2016, disconnect chimney sending buffer from
6652 		 * primary channel here.
6653 		 */
6654 		int error;
6655 
6656 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6657 		    sc->hn_chim_gpadl);
6658 		if (error) {
6659 			if_printf(sc->hn_ifp,
6660 			    "chim gpadl disconn failed: %d\n", error);
6661 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6662 		}
6663 		sc->hn_chim_gpadl = 0;
6664 	}
6665 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6666 }
6667 
6668 static void
6669 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6670 {
6671 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6672 	    ("invalid ring count %d", ring_cnt));
6673 
6674 	if (sc->hn_tx_ring_cnt > ring_cnt)
6675 		sc->hn_tx_ring_inuse = ring_cnt;
6676 	else
6677 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6678 	sc->hn_rx_ring_inuse = ring_cnt;
6679 
6680 #ifdef RSS
6681 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6682 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6683 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6684 		    rss_getnumbuckets());
6685 	}
6686 #endif
6687 
6688 	if (bootverbose) {
6689 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6690 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6691 	}
6692 }
6693 
6694 static void
6695 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6696 {
6697 
6698 	/*
6699 	 * NOTE:
6700 	 * The TX bufring will not be drained by the hypervisor,
6701 	 * if the primary channel is revoked.
6702 	 */
6703 	while (!vmbus_chan_rx_empty(chan) ||
6704 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6705 	     !vmbus_chan_tx_empty(chan)))
6706 		pause("waitch", 1);
6707 	vmbus_chan_intr_drain(chan);
6708 }
6709 
6710 static void
6711 hn_disable_rx(struct hn_softc *sc)
6712 {
6713 
6714 	/*
6715 	 * Disable RX by clearing RX filter forcefully.
6716 	 */
6717 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6718 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6719 
6720 	/*
6721 	 * Give RNDIS enough time to flush all pending data packets.
6722 	 */
6723 	pause("waitrx", (200 * hz) / 1000);
6724 }
6725 
6726 /*
6727  * NOTE:
6728  * RX/TX _must_ have been suspended/disabled, before this function
6729  * is called.
6730  */
6731 static void
6732 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6733 {
6734 	struct vmbus_channel **subch = NULL;
6735 	int nsubch;
6736 
6737 	/*
6738 	 * Drain RX/TX bufrings and interrupts.
6739 	 */
6740 	nsubch = nchan - 1;
6741 	if (nsubch > 0)
6742 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6743 
6744 	if (subch != NULL) {
6745 		int i;
6746 
6747 		for (i = 0; i < nsubch; ++i)
6748 			hn_chan_drain(sc, subch[i]);
6749 	}
6750 	hn_chan_drain(sc, sc->hn_prichan);
6751 
6752 	if (subch != NULL)
6753 		vmbus_subchan_rel(subch, nsubch);
6754 }
6755 
6756 static void
6757 hn_suspend_data(struct hn_softc *sc)
6758 {
6759 	struct hn_tx_ring *txr;
6760 	int i;
6761 
6762 	HN_LOCK_ASSERT(sc);
6763 
6764 	/*
6765 	 * Suspend TX.
6766 	 */
6767 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6768 		txr = &sc->hn_tx_ring[i];
6769 
6770 		mtx_lock(&txr->hn_tx_lock);
6771 		txr->hn_suspended = 1;
6772 		mtx_unlock(&txr->hn_tx_lock);
6773 		/* No one is able send more packets now. */
6774 
6775 		/*
6776 		 * Wait for all pending sends to finish.
6777 		 *
6778 		 * NOTE:
6779 		 * We will _not_ receive all pending send-done, if the
6780 		 * primary channel is revoked.
6781 		 */
6782 		while (hn_tx_ring_pending(txr) &&
6783 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6784 			pause("hnwtx", 1 /* 1 tick */);
6785 	}
6786 
6787 	/*
6788 	 * Disable RX.
6789 	 */
6790 	hn_disable_rx(sc);
6791 
6792 	/*
6793 	 * Drain RX/TX.
6794 	 */
6795 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6796 
6797 	/*
6798 	 * Drain any pending TX tasks.
6799 	 *
6800 	 * NOTE:
6801 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6802 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6803 	 */
6804 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6805 		txr = &sc->hn_tx_ring[i];
6806 
6807 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6808 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6809 	}
6810 }
6811 
6812 static void
6813 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6814 {
6815 
6816 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6817 }
6818 
6819 static void
6820 hn_suspend_mgmt(struct hn_softc *sc)
6821 {
6822 	struct task task;
6823 
6824 	HN_LOCK_ASSERT(sc);
6825 
6826 	/*
6827 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6828 	 * through hn_mgmt_taskq.
6829 	 */
6830 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6831 	vmbus_chan_run_task(sc->hn_prichan, &task);
6832 
6833 	/*
6834 	 * Make sure that all pending management tasks are completed.
6835 	 */
6836 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6837 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6838 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6839 }
6840 
6841 static void
6842 hn_suspend(struct hn_softc *sc)
6843 {
6844 
6845 	/* Disable polling. */
6846 	hn_polling(sc, 0);
6847 
6848 	/*
6849 	 * If the non-transparent mode VF is activated, the synthetic
6850 	 * device is receiving packets, so the data path of the
6851 	 * synthetic device must be suspended.
6852 	 */
6853 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6854 	    (sc->hn_flags & HN_FLAG_RXVF))
6855 		hn_suspend_data(sc);
6856 	hn_suspend_mgmt(sc);
6857 }
6858 
6859 static void
6860 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6861 {
6862 	int i;
6863 
6864 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6865 	    ("invalid TX ring count %d", tx_ring_cnt));
6866 
6867 	for (i = 0; i < tx_ring_cnt; ++i) {
6868 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6869 
6870 		mtx_lock(&txr->hn_tx_lock);
6871 		txr->hn_suspended = 0;
6872 		mtx_unlock(&txr->hn_tx_lock);
6873 	}
6874 }
6875 
6876 static void
6877 hn_resume_data(struct hn_softc *sc)
6878 {
6879 	int i;
6880 
6881 	HN_LOCK_ASSERT(sc);
6882 
6883 	/*
6884 	 * Re-enable RX.
6885 	 */
6886 	hn_rxfilter_config(sc);
6887 
6888 	/*
6889 	 * Make sure to clear suspend status on "all" TX rings,
6890 	 * since hn_tx_ring_inuse can be changed after
6891 	 * hn_suspend_data().
6892 	 */
6893 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6894 
6895 #ifdef HN_IFSTART_SUPPORT
6896 	if (!hn_use_if_start)
6897 #endif
6898 	{
6899 		/*
6900 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6901 		 * reduced.
6902 		 */
6903 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6904 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6905 	}
6906 
6907 	/*
6908 	 * Kick start TX.
6909 	 */
6910 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6911 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6912 
6913 		/*
6914 		 * Use txeof task, so that any pending oactive can be
6915 		 * cleared properly.
6916 		 */
6917 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6918 	}
6919 }
6920 
6921 static void
6922 hn_resume_mgmt(struct hn_softc *sc)
6923 {
6924 
6925 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6926 
6927 	/*
6928 	 * Kick off network change detection, if it was pending.
6929 	 * If no network change was pending, start link status
6930 	 * checks, which is more lightweight than network change
6931 	 * detection.
6932 	 */
6933 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6934 		hn_change_network(sc);
6935 	else
6936 		hn_update_link_status(sc);
6937 }
6938 
6939 static void
6940 hn_resume(struct hn_softc *sc)
6941 {
6942 
6943 	/*
6944 	 * If the non-transparent mode VF is activated, the synthetic
6945 	 * device have to receive packets, so the data path of the
6946 	 * synthetic device must be resumed.
6947 	 */
6948 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6949 	    (sc->hn_flags & HN_FLAG_RXVF))
6950 		hn_resume_data(sc);
6951 
6952 	/*
6953 	 * Don't resume link status change if VF is attached/activated.
6954 	 * - In the non-transparent VF mode, the synthetic device marks
6955 	 *   link down until the VF is deactivated; i.e. VF is down.
6956 	 * - In transparent VF mode, VF's media status is used until
6957 	 *   the VF is detached.
6958 	 */
6959 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6960 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6961 		hn_resume_mgmt(sc);
6962 
6963 	/*
6964 	 * Re-enable polling if this interface is running and
6965 	 * the polling is requested.
6966 	 */
6967 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6968 		hn_polling(sc, sc->hn_pollhz);
6969 }
6970 
6971 static void
6972 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6973 {
6974 	const struct rndis_status_msg *msg;
6975 	int ofs;
6976 
6977 	if (dlen < sizeof(*msg)) {
6978 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6979 		return;
6980 	}
6981 	msg = data;
6982 
6983 	switch (msg->rm_status) {
6984 	case RNDIS_STATUS_MEDIA_CONNECT:
6985 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6986 		hn_update_link_status(sc);
6987 		break;
6988 
6989 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6990 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6991 		/* Not really useful; ignore. */
6992 		break;
6993 
6994 	case RNDIS_STATUS_NETWORK_CHANGE:
6995 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6996 		if (dlen < ofs + msg->rm_stbuflen ||
6997 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6998 			if_printf(sc->hn_ifp, "network changed\n");
6999 		} else {
7000 			uint32_t change;
7001 
7002 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7003 			    sizeof(change));
7004 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7005 			    change);
7006 		}
7007 		hn_change_network(sc);
7008 		break;
7009 
7010 	default:
7011 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7012 		    msg->rm_status);
7013 		break;
7014 	}
7015 }
7016 
7017 static int
7018 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7019 {
7020 	const struct rndis_pktinfo *pi = info_data;
7021 	uint32_t mask = 0;
7022 
7023 	while (info_dlen != 0) {
7024 		const void *data;
7025 		uint32_t dlen;
7026 
7027 		if (__predict_false(info_dlen < sizeof(*pi)))
7028 			return (EINVAL);
7029 		if (__predict_false(info_dlen < pi->rm_size))
7030 			return (EINVAL);
7031 		info_dlen -= pi->rm_size;
7032 
7033 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7034 			return (EINVAL);
7035 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7036 			return (EINVAL);
7037 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7038 		data = pi->rm_data;
7039 
7040 		switch (pi->rm_type) {
7041 		case NDIS_PKTINFO_TYPE_VLAN:
7042 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
7043 				return (EINVAL);
7044 			info->vlan_info = *((const uint32_t *)data);
7045 			mask |= HN_RXINFO_VLAN;
7046 			break;
7047 
7048 		case NDIS_PKTINFO_TYPE_CSUM:
7049 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
7050 				return (EINVAL);
7051 			info->csum_info = *((const uint32_t *)data);
7052 			mask |= HN_RXINFO_CSUM;
7053 			break;
7054 
7055 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7056 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
7057 				return (EINVAL);
7058 			info->hash_value = *((const uint32_t *)data);
7059 			mask |= HN_RXINFO_HASHVAL;
7060 			break;
7061 
7062 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
7063 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
7064 				return (EINVAL);
7065 			info->hash_info = *((const uint32_t *)data);
7066 			mask |= HN_RXINFO_HASHINF;
7067 			break;
7068 
7069 		default:
7070 			goto next;
7071 		}
7072 
7073 		if (mask == HN_RXINFO_ALL) {
7074 			/* All found; done */
7075 			break;
7076 		}
7077 next:
7078 		pi = (const struct rndis_pktinfo *)
7079 		    ((const uint8_t *)pi + pi->rm_size);
7080 	}
7081 
7082 	/*
7083 	 * Final fixup.
7084 	 * - If there is no hash value, invalidate the hash info.
7085 	 */
7086 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7087 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7088 	return (0);
7089 }
7090 
/*
 * Tell whether the range [off, off + len) overlaps the range
 * [check_off, check_off + check_len).
 *
 * Two ranges starting at the same offset are always reported as
 * overlapping, whatever their lengths are.
 */
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{
	bool disjoint = false;

	if (off < check_off) {
		/* Disjoint, if the first range ends before the second. */
		disjoint = __predict_true(off + len <= check_off);
	} else if (off > check_off) {
		/* Disjoint, if the second range ends before the first. */
		disjoint = __predict_true(check_off + check_len <= off);
	}
	return (!disjoint);
}
7104 
7105 static void
7106 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7107 {
7108 	const struct rndis_packet_msg *pkt;
7109 	struct hn_rxinfo info;
7110 	int data_off, pktinfo_off, data_len, pktinfo_len;
7111 
7112 	/*
7113 	 * Check length.
7114 	 */
7115 	if (__predict_false(dlen < sizeof(*pkt))) {
7116 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7117 		return;
7118 	}
7119 	pkt = data;
7120 
7121 	if (__predict_false(dlen < pkt->rm_len)) {
7122 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7123 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7124 		return;
7125 	}
7126 	if (__predict_false(pkt->rm_len <
7127 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7128 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7129 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7130 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7131 		    pkt->rm_pktinfolen);
7132 		return;
7133 	}
7134 	if (__predict_false(pkt->rm_datalen == 0)) {
7135 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7136 		return;
7137 	}
7138 
7139 	/*
7140 	 * Check offests.
7141 	 */
7142 #define IS_OFFSET_INVALID(ofs)			\
7143 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7144 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7145 
7146 	/* XXX Hyper-V does not meet data offset alignment requirement */
7147 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7148 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7149 		    "data offset %u\n", pkt->rm_dataoffset);
7150 		return;
7151 	}
7152 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7153 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7154 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7155 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7156 		return;
7157 	}
7158 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7159 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7160 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7161 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7162 		return;
7163 	}
7164 
7165 #undef IS_OFFSET_INVALID
7166 
7167 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7168 	data_len = pkt->rm_datalen;
7169 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7170 	pktinfo_len = pkt->rm_pktinfolen;
7171 
7172 	/*
7173 	 * Check OOB coverage.
7174 	 */
7175 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7176 		int oob_off, oob_len;
7177 
7178 		if_printf(rxr->hn_ifp, "got oobdata\n");
7179 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7180 		oob_len = pkt->rm_oobdatalen;
7181 
7182 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7183 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7184 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7185 			    pkt->rm_len, oob_off, oob_len);
7186 			return;
7187 		}
7188 
7189 		/*
7190 		 * Check against data.
7191 		 */
7192 		if (hn_rndis_check_overlap(oob_off, oob_len,
7193 		    data_off, data_len)) {
7194 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7195 			    "oob overlaps data, oob abs %d len %d, "
7196 			    "data abs %d len %d\n",
7197 			    oob_off, oob_len, data_off, data_len);
7198 			return;
7199 		}
7200 
7201 		/*
7202 		 * Check against pktinfo.
7203 		 */
7204 		if (pktinfo_len != 0 &&
7205 		    hn_rndis_check_overlap(oob_off, oob_len,
7206 		    pktinfo_off, pktinfo_len)) {
7207 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7208 			    "oob overlaps pktinfo, oob abs %d len %d, "
7209 			    "pktinfo abs %d len %d\n",
7210 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7211 			return;
7212 		}
7213 	}
7214 
7215 	/*
7216 	 * Check per-packet-info coverage and find useful per-packet-info.
7217 	 */
7218 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7219 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7220 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7221 	if (__predict_true(pktinfo_len != 0)) {
7222 		bool overlap;
7223 		int error;
7224 
7225 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7226 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7227 			    "pktinfo overflow, msglen %u, "
7228 			    "pktinfo abs %d len %d\n",
7229 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7230 			return;
7231 		}
7232 
7233 		/*
7234 		 * Check packet info coverage.
7235 		 */
7236 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7237 		    data_off, data_len);
7238 		if (__predict_false(overlap)) {
7239 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7240 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7241 			    "data abs %d len %d\n",
7242 			    pktinfo_off, pktinfo_len, data_off, data_len);
7243 			return;
7244 		}
7245 
7246 		/*
7247 		 * Find useful per-packet-info.
7248 		 */
7249 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7250 		    pktinfo_len, &info);
7251 		if (__predict_false(error)) {
7252 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7253 			    "pktinfo\n");
7254 			return;
7255 		}
7256 	}
7257 
7258 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7259 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7260 		    "data overflow, msglen %u, data abs %d len %d\n",
7261 		    pkt->rm_len, data_off, data_len);
7262 		return;
7263 	}
7264 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7265 }
7266 
7267 static __inline void
7268 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7269 {
7270 	const struct rndis_msghdr *hdr;
7271 
7272 	if (__predict_false(dlen < sizeof(*hdr))) {
7273 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7274 		return;
7275 	}
7276 	hdr = data;
7277 
7278 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7279 		/* Hot data path. */
7280 		hn_rndis_rx_data(rxr, data, dlen);
7281 		/* Done! */
7282 		return;
7283 	}
7284 
7285 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7286 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7287 	else
7288 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7289 }
7290 
7291 static void
7292 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7293 {
7294 	const struct hn_nvs_hdr *hdr;
7295 
7296 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7297 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7298 		return;
7299 	}
7300 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7301 
7302 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7303 		/* Useless; ignore */
7304 		return;
7305 	}
7306 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7307 }
7308 
7309 static void
7310 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7311     const struct vmbus_chanpkt_hdr *pkt)
7312 {
7313 	struct hn_nvs_sendctx *sndc;
7314 
7315 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7316 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7317 	    VMBUS_CHANPKT_DATALEN(pkt));
7318 	/*
7319 	 * NOTE:
7320 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7321 	 * its callback.
7322 	 */
7323 }
7324 
7325 static void
7326 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7327     const struct vmbus_chanpkt_hdr *pkthdr)
7328 {
7329 	const struct vmbus_chanpkt_rxbuf *pkt;
7330 	const struct hn_nvs_hdr *nvs_hdr;
7331 	int count, i, hlen;
7332 
7333 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7334 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7335 		return;
7336 	}
7337 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7338 
7339 	/* Make sure that this is a RNDIS message. */
7340 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7341 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7342 		    nvs_hdr->nvs_type);
7343 		return;
7344 	}
7345 
7346 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7347 	if (__predict_false(hlen < sizeof(*pkt))) {
7348 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7349 		return;
7350 	}
7351 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7352 
7353 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7354 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7355 		    pkt->cp_rxbuf_id);
7356 		return;
7357 	}
7358 
7359 	count = pkt->cp_rxbuf_cnt;
7360 	if (__predict_false(hlen <
7361 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7362 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7363 		return;
7364 	}
7365 
7366 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7367 	for (i = 0; i < count; ++i) {
7368 		int ofs, len;
7369 
7370 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7371 		len = pkt->cp_rxbuf[i].rb_len;
7372 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7373 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7374 			    "ofs %d, len %d\n", i, ofs, len);
7375 			continue;
7376 		}
7377 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7378 	}
7379 
7380 	/*
7381 	 * Ack the consumed RXBUF associated w/ this channel packet,
7382 	 * so that this RXBUF can be recycled by the hypervisor.
7383 	 */
7384 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7385 }
7386 
7387 static void
7388 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7389     uint64_t tid)
7390 {
7391 	struct hn_nvs_rndis_ack ack;
7392 	int retries, error;
7393 
7394 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7395 	ack.nvs_status = HN_NVS_STATUS_OK;
7396 
7397 	retries = 0;
7398 again:
7399 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7400 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7401 	if (__predict_false(error == EAGAIN)) {
7402 		/*
7403 		 * NOTE:
7404 		 * This should _not_ happen in real world, since the
7405 		 * consumption of the TX bufring from the TX path is
7406 		 * controlled.
7407 		 */
7408 		if (rxr->hn_ack_failed == 0)
7409 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7410 		rxr->hn_ack_failed++;
7411 		retries++;
7412 		if (retries < 10) {
7413 			DELAY(100);
7414 			goto again;
7415 		}
7416 		/* RXBUF leaks! */
7417 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7418 	}
7419 }
7420 
7421 static void
7422 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7423 {
7424 	struct hn_rx_ring *rxr = xrxr;
7425 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7426 
7427 	for (;;) {
7428 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7429 		int error, pktlen;
7430 
7431 		pktlen = rxr->hn_pktbuf_len;
7432 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7433 		if (__predict_false(error == ENOBUFS)) {
7434 			void *nbuf;
7435 			int nlen;
7436 
7437 			/*
7438 			 * Expand channel packet buffer.
7439 			 *
7440 			 * XXX
7441 			 * Use M_WAITOK here, since allocation failure
7442 			 * is fatal.
7443 			 */
7444 			nlen = rxr->hn_pktbuf_len * 2;
7445 			while (nlen < pktlen)
7446 				nlen *= 2;
7447 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7448 
7449 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7450 			    rxr->hn_pktbuf_len, nlen);
7451 
7452 			free(rxr->hn_pktbuf, M_DEVBUF);
7453 			rxr->hn_pktbuf = nbuf;
7454 			rxr->hn_pktbuf_len = nlen;
7455 			/* Retry! */
7456 			continue;
7457 		} else if (__predict_false(error == EAGAIN)) {
7458 			/* No more channel packets; done! */
7459 			break;
7460 		}
7461 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7462 
7463 		switch (pkt->cph_type) {
7464 		case VMBUS_CHANPKT_TYPE_COMP:
7465 			hn_nvs_handle_comp(sc, chan, pkt);
7466 			break;
7467 
7468 		case VMBUS_CHANPKT_TYPE_RXBUF:
7469 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7470 			break;
7471 
7472 		case VMBUS_CHANPKT_TYPE_INBAND:
7473 			hn_nvs_handle_notify(sc, pkt);
7474 			break;
7475 
7476 		default:
7477 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7478 			    pkt->cph_type);
7479 			break;
7480 		}
7481 	}
7482 	hn_chan_rollup(rxr, rxr->hn_txr);
7483 }
7484 
7485 static void
7486 hn_sysinit(void *arg __unused)
7487 {
7488 	int i;
7489 
7490 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7491 
7492 #ifdef HN_IFSTART_SUPPORT
7493 	/*
7494 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7495 	 * mainly due to the IFF_DRV_OACTIVE flag.
7496 	 */
7497 	if (hn_xpnt_vf && hn_use_if_start) {
7498 		hn_use_if_start = 0;
7499 		printf("hn: tranparent VF mode, if_transmit will be used, "
7500 		    "instead of if_start\n");
7501 	}
7502 #endif
7503 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7504 		printf("hn: invalid transparent VF attach routing "
7505 		    "wait timeout %d, reset to %d\n",
7506 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7507 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7508 	}
7509 
7510 	/*
7511 	 * Initialize VF map.
7512 	 */
7513 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7514 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7515 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7516 	    M_WAITOK | M_ZERO);
7517 
7518 	/*
7519 	 * Fix the # of TX taskqueues.
7520 	 */
7521 	if (hn_tx_taskq_cnt <= 0)
7522 		hn_tx_taskq_cnt = 1;
7523 	else if (hn_tx_taskq_cnt > mp_ncpus)
7524 		hn_tx_taskq_cnt = mp_ncpus;
7525 
7526 	/*
7527 	 * Fix the TX taskqueue mode.
7528 	 */
7529 	switch (hn_tx_taskq_mode) {
7530 	case HN_TX_TASKQ_M_INDEP:
7531 	case HN_TX_TASKQ_M_GLOBAL:
7532 	case HN_TX_TASKQ_M_EVTTQ:
7533 		break;
7534 	default:
7535 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7536 		break;
7537 	}
7538 
7539 	if (vm_guest != VM_GUEST_HV)
7540 		return;
7541 
7542 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7543 		return;
7544 
7545 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7546 	    M_DEVBUF, M_WAITOK);
7547 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7548 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7549 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7550 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7551 		    "hn tx%d", i);
7552 	}
7553 }
7554 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7555 
7556 static void
7557 hn_sysuninit(void *arg __unused)
7558 {
7559 
7560 	if (hn_tx_taskque != NULL) {
7561 		int i;
7562 
7563 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7564 			taskqueue_free(hn_tx_taskque[i]);
7565 		free(hn_tx_taskque, M_DEVBUF);
7566 	}
7567 
7568 	if (hn_vfmap != NULL)
7569 		free(hn_vfmap, M_DEVBUF);
7570 	rm_destroy(&hn_vfmap_lock);
7571 
7572 	counter_u64_free(hn_udpcs_fixup);
7573 }
7574 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7575