xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 9f27341c336aa12f6c7163c17e646e76c813b689)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/bus.h>
66 #include <sys/counter.h>
67 #include <sys/kernel.h>
68 #include <sys/limits.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/queue.h>
73 #include <sys/lock.h>
74 #include <sys/proc.h>
75 #include <sys/rmlock.h>
76 #include <sys/sbuf.h>
77 #include <sys/sched.h>
78 #include <sys/smp.h>
79 #include <sys/socket.h>
80 #include <sys/sockio.h>
81 #include <sys/sx.h>
82 #include <sys/sysctl.h>
83 #include <sys/taskqueue.h>
84 #include <sys/buf_ring.h>
85 #include <sys/eventhandler.h>
86 #include <sys/epoch.h>
87 
88 #include <machine/atomic.h>
89 #include <machine/in_cksum.h>
90 
91 #include <net/bpf.h>
92 #include <net/ethernet.h>
93 #include <net/if.h>
94 #include <net/if_dl.h>
95 #include <net/if_media.h>
96 #include <net/if_types.h>
97 #include <net/if_var.h>
98 #include <net/rndis.h>
99 #ifdef RSS
100 #include <net/rss_config.h>
101 #endif
102 
103 #include <netinet/in_systm.h>
104 #include <netinet/in.h>
105 #include <netinet/ip.h>
106 #include <netinet/ip6.h>
107 #include <netinet/tcp.h>
108 #include <netinet/tcp_lro.h>
109 #include <netinet/udp.h>
110 
111 #include <dev/hyperv/include/hyperv.h>
112 #include <dev/hyperv/include/hyperv_busdma.h>
113 #include <dev/hyperv/include/vmbus.h>
114 #include <dev/hyperv/include/vmbus_xact.h>
115 
116 #include <dev/hyperv/netvsc/ndis.h>
117 #include <dev/hyperv/netvsc/if_hnreg.h>
118 #include <dev/hyperv/netvsc/if_hnvar.h>
119 #include <dev/hyperv/netvsc/hn_nvs.h>
120 #include <dev/hyperv/netvsc/hn_rndis.h>
121 
122 #include "vmbus_if.h"
123 
124 #define HN_IFSTART_SUPPORT
125 
126 #define HN_RING_CNT_DEF_MAX		8
127 
128 #define HN_VFMAP_SIZE_DEF		8
129 
130 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
131 
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT			512
134 
135 #define HN_RNDIS_PKT_LEN					\
136 	(sizeof(struct rndis_packet_msg) +			\
137 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
138 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
139 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
141 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
142 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
143 
144 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
145 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
146 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
147 /* -1 for RNDIS packet message */
148 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
149 
150 #define HN_DIRECT_TX_SIZE_DEF		128
151 
152 #define HN_EARLY_TXEOF_THRESH		8
153 
154 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
155 
156 #define HN_LROENT_CNT_DEF		128
157 
158 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
159 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
160 /* YYY 2*MTU is a bit rough, but should be good enough. */
161 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
162 
163 #define HN_LRO_ACKCNT_DEF		1
164 
165 #define HN_LOCK_INIT(sc)		\
166 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
167 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
168 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
169 #define HN_LOCK(sc)					\
170 do {							\
171 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
172 		/* Relinquish cpu to avoid deadlock */	\
173 		sched_relinquish(curthread);		\
174 		DELAY(1000);				\
175 	}						\
176 } while (0)
177 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
178 
179 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
180 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
181 #define HN_CSUM_IP_HWASSIST(sc)		\
182 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
183 #define HN_CSUM_IP6_HWASSIST(sc)	\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
185 
186 #define HN_PKTSIZE_MIN(align)		\
187 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
188 	    HN_RNDIS_PKT_LEN, (align))
189 #define HN_PKTSIZE(m, align)		\
190 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
191 
192 #ifdef RSS
193 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
194 #else
195 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
196 #endif
197 
198 struct hn_txdesc {
199 #ifndef HN_USE_TXDESC_BUFRING
200 	SLIST_ENTRY(hn_txdesc)		link;
201 #endif
202 	STAILQ_ENTRY(hn_txdesc)		agg_link;
203 
204 	/* Aggregated txdescs, in sending order. */
205 	STAILQ_HEAD(, hn_txdesc)	agg_list;
206 
207 	/* The oldest packet, if transmission aggregation happens. */
208 	struct mbuf			*m;
209 	struct hn_tx_ring		*txr;
210 	int				refs;
211 	uint32_t			flags;	/* HN_TXD_FLAG_ */
212 	struct hn_nvs_sendctx		send_ctx;
213 	uint32_t			chim_index;
214 	int				chim_size;
215 
216 	bus_dmamap_t			data_dmap;
217 
218 	bus_addr_t			rndis_pkt_paddr;
219 	struct rndis_packet_msg		*rndis_pkt;
220 	bus_dmamap_t			rndis_pkt_dmap;
221 };
222 
223 #define HN_TXD_FLAG_ONLIST		0x0001
224 #define HN_TXD_FLAG_DMAMAP		0x0002
225 #define HN_TXD_FLAG_ONAGG		0x0004
226 
227 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
228 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
229 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
230 
231 struct packet_info_id {
232 	uint8_t				ver;
233 	uint8_t				flag;
234 	uint16_t			pkt_id;
235 };
236 
237 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
238 
239 
240 struct hn_rxinfo {
241 	const uint32_t			*vlan_info;
242 	const uint32_t			*csum_info;
243 	const uint32_t			*hash_info;
244 	const uint32_t			*hash_value;
245 	const struct packet_info_id	*pktinfo_id;
246 };
247 
248 struct hn_rxvf_setarg {
249 	struct hn_rx_ring	*rxr;
250 	if_t			vf_ifp;
251 };
252 
253 #define HN_RXINFO_VLAN			0x0001
254 #define HN_RXINFO_CSUM			0x0002
255 #define HN_RXINFO_HASHINF		0x0004
256 #define HN_RXINFO_HASHVAL		0x0008
257 #define HN_RXINFO_PKTINFO_ID		0x0010
258 #define HN_RXINFO_ALL			\
259 	(HN_RXINFO_VLAN |		\
260 	 HN_RXINFO_CSUM |		\
261 	 HN_RXINFO_HASHINF |		\
262 	 HN_RXINFO_HASHVAL |		\
263 	 HN_RXINFO_PKTINFO_ID)
264 
265 static int			hn_probe(device_t);
266 static int			hn_attach(device_t);
267 static int			hn_detach(device_t);
268 static int			hn_shutdown(device_t);
269 static void			hn_chan_callback(struct vmbus_channel *,
270 				    void *);
271 
272 static void			hn_init(void *);
273 static int			hn_ioctl(if_t, u_long, caddr_t);
274 #ifdef HN_IFSTART_SUPPORT
275 static void			hn_start(if_t);
276 #endif
277 static int			hn_transmit(if_t, struct mbuf *);
278 static void			hn_xmit_qflush(if_t);
279 static int			hn_ifmedia_upd(if_t);
280 static void			hn_ifmedia_sts(if_t,
281 				    struct ifmediareq *);
282 
283 static void			hn_ifnet_event(void *, if_t, int);
284 static void			hn_ifaddr_event(void *, if_t);
285 static void			hn_ifnet_attevent(void *, if_t);
286 static void			hn_ifnet_detevent(void *, if_t);
287 static void			hn_ifnet_lnkevent(void *, if_t, int);
288 
289 static bool			hn_ismyvf(const struct hn_softc *,
290 				    const if_t);
291 static void			hn_rxvf_change(struct hn_softc *,
292 				    if_t, bool);
293 static void			hn_rxvf_set(struct hn_softc *, if_t);
294 static void			hn_rxvf_set_task(void *, int);
295 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
296 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
297 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
298 				    struct ifreq *);
299 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
300 static bool			hn_xpnt_vf_isready(struct hn_softc *);
301 static void			hn_xpnt_vf_setready(struct hn_softc *);
302 static void			hn_xpnt_vf_init_taskfunc(void *, int);
303 static void			hn_xpnt_vf_init(struct hn_softc *);
304 static void			hn_xpnt_vf_setenable(struct hn_softc *);
305 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
306 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
307 static void			hn_vf_rss_restore(struct hn_softc *);
308 
309 static int			hn_rndis_rxinfo(const void *, int,
310 				    struct hn_rxinfo *);
311 static void			hn_rndis_rx_data(struct hn_rx_ring *,
312 				    const void *, int);
313 static void			hn_rndis_rx_status(struct hn_softc *,
314 				    const void *, int);
315 static void			hn_rndis_init_fixat(struct hn_softc *, int);
316 
317 static void			hn_nvs_handle_notify(struct hn_softc *,
318 				    const struct vmbus_chanpkt_hdr *);
319 static void			hn_nvs_handle_comp(struct hn_softc *,
320 				    struct vmbus_channel *,
321 				    const struct vmbus_chanpkt_hdr *);
322 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
323 				    struct vmbus_channel *,
324 				    const struct vmbus_chanpkt_hdr *);
325 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
326 				    struct vmbus_channel *, uint64_t);
327 
328 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
340 #ifndef RSS
341 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
342 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
343 #endif
344 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
345 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
346 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
358 
359 static void			hn_stop(struct hn_softc *, bool);
360 static void			hn_init_locked(struct hn_softc *);
361 static int			hn_chan_attach(struct hn_softc *,
362 				    struct vmbus_channel *);
363 static void			hn_chan_detach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static int			hn_attach_subchans(struct hn_softc *);
366 static void			hn_detach_allchans(struct hn_softc *);
367 static void			hn_chan_rollup(struct hn_rx_ring *,
368 				    struct hn_tx_ring *);
369 static void			hn_set_ring_inuse(struct hn_softc *, int);
370 static int			hn_synth_attach(struct hn_softc *, int);
371 static void			hn_synth_detach(struct hn_softc *);
372 static int			hn_synth_alloc_subchans(struct hn_softc *,
373 				    int *);
374 static bool			hn_synth_attachable(const struct hn_softc *);
375 static void			hn_suspend(struct hn_softc *);
376 static void			hn_suspend_data(struct hn_softc *);
377 static void			hn_suspend_mgmt(struct hn_softc *);
378 static void			hn_resume(struct hn_softc *);
379 static void			hn_resume_data(struct hn_softc *);
380 static void			hn_resume_mgmt(struct hn_softc *);
381 static void			hn_suspend_mgmt_taskfunc(void *, int);
382 static void			hn_chan_drain(struct hn_softc *,
383 				    struct vmbus_channel *);
384 static void			hn_disable_rx(struct hn_softc *);
385 static void			hn_drain_rxtx(struct hn_softc *, int);
386 static void			hn_polling(struct hn_softc *, u_int);
387 static void			hn_chan_polling(struct vmbus_channel *, u_int);
388 static void			hn_mtu_change_fixup(struct hn_softc *);
389 
390 static void			hn_update_link_status(struct hn_softc *);
391 static void			hn_change_network(struct hn_softc *);
392 static void			hn_link_taskfunc(void *, int);
393 static void			hn_netchg_init_taskfunc(void *, int);
394 static void			hn_netchg_status_taskfunc(void *, int);
395 static void			hn_link_status(struct hn_softc *);
396 
397 static int			hn_create_rx_data(struct hn_softc *, int);
398 static void			hn_destroy_rx_data(struct hn_softc *);
399 static int			hn_check_iplen(const struct mbuf *, int);
400 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
401 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
402 static int			hn_rxfilter_config(struct hn_softc *);
403 static int			hn_rss_reconfig(struct hn_softc *);
404 static void			hn_rss_ind_fixup(struct hn_softc *);
405 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
406 static int			hn_rxpkt(struct hn_rx_ring *);
407 static uint32_t			hn_rss_type_fromndis(uint32_t);
408 static uint32_t			hn_rss_type_tondis(uint32_t);
409 
410 static int			hn_tx_ring_create(struct hn_softc *, int);
411 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
412 static int			hn_create_tx_data(struct hn_softc *, int);
413 static void			hn_fixup_tx_data(struct hn_softc *);
414 static void			hn_fixup_rx_data(struct hn_softc *);
415 static void			hn_destroy_tx_data(struct hn_softc *);
416 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
417 static void			hn_txdesc_gc(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_encap(if_t, struct hn_tx_ring *,
420 				    struct hn_txdesc *, struct mbuf **);
421 static int			hn_txpkt(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *);
423 static void			hn_set_chim_size(struct hn_softc *, int);
424 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
425 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
426 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
427 static void			hn_resume_tx(struct hn_softc *, int);
428 static void			hn_set_txagg(struct hn_softc *);
429 static void			*hn_try_txagg(if_t,
430 				    struct hn_tx_ring *, struct hn_txdesc *,
431 				    int);
432 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
433 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
434 				    struct hn_softc *, struct vmbus_channel *,
435 				    const void *, int);
436 static int			hn_txpkt_sglist(struct hn_tx_ring *,
437 				    struct hn_txdesc *);
438 static int			hn_txpkt_chim(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_xmit(struct hn_tx_ring *, int);
441 static void			hn_xmit_taskfunc(void *, int);
442 static void			hn_xmit_txeof(struct hn_tx_ring *);
443 static void			hn_xmit_txeof_taskfunc(void *, int);
444 #ifdef HN_IFSTART_SUPPORT
445 static int			hn_start_locked(struct hn_tx_ring *, int);
446 static void			hn_start_taskfunc(void *, int);
447 static void			hn_start_txeof(struct hn_tx_ring *);
448 static void			hn_start_txeof_taskfunc(void *, int);
449 #endif
450 
451 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
452 
453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
454     "Hyper-V network interface");
455 
456 /* Trust tcp segment verification on host side. */
457 static int			hn_trust_hosttcp = 1;
458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
459     &hn_trust_hosttcp, 0,
460     "Trust tcp segment verification on host side, "
461     "when csum info is missing (global setting)");
462 
463 /* Trust udp datagrams verification on host side. */
464 static int			hn_trust_hostudp = 1;
465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
466     &hn_trust_hostudp, 0,
467     "Trust udp datagram verification on host side, "
468     "when csum info is missing (global setting)");
469 
470 /* Trust ip packets verification on host side. */
471 static int			hn_trust_hostip = 1;
472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
473     &hn_trust_hostip, 0,
474     "Trust ip packet verification on host side, "
475     "when csum info is missing (global setting)");
476 
477 /*
478  * Offload UDP/IPv4 checksum.
479  */
480 static int			hn_enable_udp4cs = 1;
481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
482     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
483 
484 /*
485  * Offload UDP/IPv6 checksum.
486  */
487 static int			hn_enable_udp6cs = 1;
488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
489     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
490 
491 /* Stats. */
492 static counter_u64_t		hn_udpcs_fixup;
493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
494     &hn_udpcs_fixup, "# of UDP checksum fixup");
495 
496 /*
497  * See hn_set_hlen().
498  *
499  * This value is for Azure.  For Hyper-V, set this above
500  * 65536 to disable UDP datagram checksum fixup.
501  */
502 static int			hn_udpcs_fixup_mtu = 1420;
503 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
504     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
505 
506 /* Limit TSO burst size */
507 static int			hn_tso_maxlen = IP_MAXPACKET;
508 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
509     &hn_tso_maxlen, 0, "TSO burst limit");
510 
511 /* Limit chimney send size */
512 static int			hn_tx_chimney_size = 0;
513 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
514     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
515 
516 /* Limit the size of packet for direct transmission */
517 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
518 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
519     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
520 
521 /* # of LRO entries per RX ring */
522 #if defined(INET) || defined(INET6)
523 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
524 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
525     &hn_lro_entry_count, 0, "LRO entry count");
526 #endif
527 
528 static int			hn_tx_taskq_cnt = 1;
529 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
530     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
531 
532 #define HN_TX_TASKQ_M_INDEP	0
533 #define HN_TX_TASKQ_M_GLOBAL	1
534 #define HN_TX_TASKQ_M_EVTTQ	2
535 
536 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
537 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
538     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
539     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
540 
541 #ifndef HN_USE_TXDESC_BUFRING
542 static int			hn_use_txdesc_bufring = 0;
543 #else
544 static int			hn_use_txdesc_bufring = 1;
545 #endif
546 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
547     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
548 
549 #ifdef HN_IFSTART_SUPPORT
550 /* Use ifnet.if_start instead of ifnet.if_transmit */
551 static int			hn_use_if_start = 0;
552 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
553     &hn_use_if_start, 0, "Use if_start TX method");
554 #endif
555 
556 /* # of channels to use */
557 static int			hn_chan_cnt = 0;
558 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
559     &hn_chan_cnt, 0,
560     "# of channels to use; each channel has one RX ring and one TX ring");
561 
562 /* # of transmit rings to use */
563 static int			hn_tx_ring_cnt = 0;
564 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
565     &hn_tx_ring_cnt, 0, "# of TX rings to use");
566 
567 /* Software TX ring deptch */
568 static int			hn_tx_swq_depth = 0;
569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
570     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
571 
572 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
573 static u_int			hn_lro_mbufq_depth = 0;
574 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
575     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
576 
577 /* Packet transmission aggregation size limit */
578 static int			hn_tx_agg_size = -1;
579 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
580     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
581 
582 /* Packet transmission aggregation count limit */
583 static int			hn_tx_agg_pkts = -1;
584 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
585     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
586 
587 /* VF list */
588 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
589     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
590     hn_vflist_sysctl, "A",
591     "VF list");
592 
593 /* VF mapping */
594 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
595     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
596     hn_vfmap_sysctl, "A",
597     "VF mapping");
598 
599 /* Transparent VF */
600 static int			hn_xpnt_vf = 1;
601 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
602     &hn_xpnt_vf, 0, "Transparent VF mod");
603 
604 /* Accurate BPF support for Transparent VF */
605 static int			hn_xpnt_vf_accbpf = 0;
606 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
607     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
608 
609 /* Extra wait for transparent VF attach routing; unit seconds. */
610 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
611 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
612     &hn_xpnt_vf_attwait, 0,
613     "Extra wait for transparent VF attach routing; unit: seconds");
614 
615 static u_int			hn_cpu_index;	/* next CPU for channel */
616 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
617 
618 static struct rmlock		hn_vfmap_lock;
619 static int			hn_vfmap_size;
620 static if_t			*hn_vfmap;
621 
622 #ifndef RSS
623 static const uint8_t
624 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
625 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
626 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
627 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
628 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
629 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
630 };
631 #endif	/* !RSS */
632 
633 static const struct hyperv_guid	hn_guid = {
634 	.hv_guid = {
635 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
636 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
637 };
638 
639 static device_method_t hn_methods[] = {
640 	/* Device interface */
641 	DEVMETHOD(device_probe,		hn_probe),
642 	DEVMETHOD(device_attach,	hn_attach),
643 	DEVMETHOD(device_detach,	hn_detach),
644 	DEVMETHOD(device_shutdown,	hn_shutdown),
645 	DEVMETHOD_END
646 };
647 
648 static driver_t hn_driver = {
649 	"hn",
650 	hn_methods,
651 	sizeof(struct hn_softc)
652 };
653 
654 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
655 MODULE_VERSION(hn, 1);
656 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
657 
658 static void
659 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
660 {
661 	int i;
662 
663 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
664 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
665 }
666 
667 static int
668 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
669 {
670 
671 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
672 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
673 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
674 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
675 }
676 
677 static int
678 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
679 {
680 	struct hn_nvs_rndis rndis;
681 
682 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
683 	    txd->chim_size > 0, ("invalid rndis chim txd"));
684 
685 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
686 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
687 	rndis.nvs_chim_idx = txd->chim_index;
688 	rndis.nvs_chim_sz = txd->chim_size;
689 
690 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
691 	    &rndis, sizeof(rndis), &txd->send_ctx));
692 }
693 
694 static __inline uint32_t
695 hn_chim_alloc(struct hn_softc *sc)
696 {
697 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
698 	u_long *bmap = sc->hn_chim_bmap;
699 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
700 
701 	for (i = 0; i < bmap_cnt; ++i) {
702 		int idx;
703 
704 		idx = ffsl(~bmap[i]);
705 		if (idx == 0)
706 			continue;
707 
708 		--idx; /* ffsl is 1-based */
709 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
710 		    ("invalid i %d and idx %d", i, idx));
711 
712 		if (atomic_testandset_long(&bmap[i], idx))
713 			continue;
714 
715 		ret = i * LONG_BIT + idx;
716 		break;
717 	}
718 	return (ret);
719 }
720 
721 static __inline void
722 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
723 {
724 	u_long mask;
725 	uint32_t idx;
726 
727 	idx = chim_idx / LONG_BIT;
728 	KASSERT(idx < sc->hn_chim_bmap_cnt,
729 	    ("invalid chimney index 0x%x", chim_idx));
730 
731 	mask = 1UL << (chim_idx % LONG_BIT);
732 	KASSERT(sc->hn_chim_bmap[idx] & mask,
733 	    ("index bitmap 0x%lx, chimney index %u, "
734 	     "bitmap idx %d, bitmask 0x%lx",
735 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
736 
737 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
738 }
739 
740 #if defined(INET6) || defined(INET)
741 
742 #define PULLUP_HDR(m, len)				\
743 do {							\
744 	if (__predict_false((m)->m_len < (len))) {	\
745 		(m) = m_pullup((m), (len));		\
746 		if ((m) == NULL)			\
747 			return (NULL);			\
748 	}						\
749 } while (0)
750 
751 /*
752  * NOTE: If this function failed, the m_head would be freed.
753  */
754 static __inline struct mbuf *
755 hn_tso_fixup(struct mbuf *m_head)
756 {
757 	struct ether_vlan_header *evl;
758 	struct tcphdr *th;
759 	int ehlen;
760 
761 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
762 
763 	PULLUP_HDR(m_head, sizeof(*evl));
764 	evl = mtod(m_head, struct ether_vlan_header *);
765 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
766 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
767 	else
768 		ehlen = ETHER_HDR_LEN;
769 	m_head->m_pkthdr.l2hlen = ehlen;
770 
771 #ifdef INET
772 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
773 		struct ip *ip;
774 		int iphlen;
775 
776 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
777 		ip = mtodo(m_head, ehlen);
778 		iphlen = ip->ip_hl << 2;
779 		m_head->m_pkthdr.l3hlen = iphlen;
780 
781 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
782 		th = mtodo(m_head, ehlen + iphlen);
783 
784 		ip->ip_len = 0;
785 		ip->ip_sum = 0;
786 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
787 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
788 	}
789 #endif
790 #if defined(INET6) && defined(INET)
791 	else
792 #endif
793 #ifdef INET6
794 	{
795 		struct ip6_hdr *ip6;
796 
797 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
798 		ip6 = mtodo(m_head, ehlen);
799 		if (ip6->ip6_nxt != IPPROTO_TCP) {
800 			m_freem(m_head);
801 			return (NULL);
802 		}
803 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
804 
805 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
806 		th = mtodo(m_head, ehlen + sizeof(*ip6));
807 
808 		ip6->ip6_plen = 0;
809 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
810 	}
811 #endif
812 	return (m_head);
813 }
814 
815 /*
816  * NOTE: If this function failed, the m_head would be freed.
817  */
818 static __inline struct mbuf *
819 hn_set_hlen(struct mbuf *m_head)
820 {
821 	const struct ether_vlan_header *evl;
822 	int ehlen;
823 
824 	PULLUP_HDR(m_head, sizeof(*evl));
825 	evl = mtod(m_head, const struct ether_vlan_header *);
826 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
827 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
828 	else
829 		ehlen = ETHER_HDR_LEN;
830 	m_head->m_pkthdr.l2hlen = ehlen;
831 
832 #ifdef INET
833 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
834 		const struct ip *ip;
835 		int iphlen;
836 
837 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
838 		ip = mtodo(m_head, ehlen);
839 		iphlen = ip->ip_hl << 2;
840 		m_head->m_pkthdr.l3hlen = iphlen;
841 
842 		/*
843 		 * UDP checksum offload does not work in Azure, if the
844 		 * following conditions meet:
845 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
846 		 * - IP_DF is not set in the IP hdr.
847 		 *
848 		 * Fallback to software checksum for these UDP datagrams.
849 		 */
850 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
851 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
852 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
853 			uint16_t off = ehlen + iphlen;
854 
855 			counter_u64_add(hn_udpcs_fixup, 1);
856 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
857 			*(uint16_t *)(m_head->m_data + off +
858                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
859 			    m_head, m_head->m_pkthdr.len, off);
860 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
861 		}
862 	}
863 #endif
864 #if defined(INET6) && defined(INET)
865 	else
866 #endif
867 #ifdef INET6
868 	{
869 		const struct ip6_hdr *ip6;
870 
871 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
872 		ip6 = mtodo(m_head, ehlen);
873 		if (ip6->ip6_nxt != IPPROTO_TCP &&
874 		    ip6->ip6_nxt != IPPROTO_UDP) {
875 			m_freem(m_head);
876 			return (NULL);
877 		}
878 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
879 	}
880 #endif
881 	return (m_head);
882 }
883 
884 /*
885  * NOTE: If this function failed, the m_head would be freed.
886  */
887 static __inline struct mbuf *
888 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
889 {
890 	const struct tcphdr *th;
891 	int ehlen, iphlen;
892 
893 	*tcpsyn = 0;
894 	ehlen = m_head->m_pkthdr.l2hlen;
895 	iphlen = m_head->m_pkthdr.l3hlen;
896 
897 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
898 	th = mtodo(m_head, ehlen + iphlen);
899 	if (th->th_flags & TH_SYN)
900 		*tcpsyn = 1;
901 	return (m_head);
902 }
903 
904 #undef PULLUP_HDR
905 
906 #endif	/* INET6 || INET */
907 
908 static int
909 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
910 {
911 	int error = 0;
912 
913 	HN_LOCK_ASSERT(sc);
914 
915 	if (sc->hn_rx_filter != filter) {
916 		error = hn_rndis_set_rxfilter(sc, filter);
917 		if (!error)
918 			sc->hn_rx_filter = filter;
919 	}
920 	return (error);
921 }
922 
923 static int
924 hn_rxfilter_config(struct hn_softc *sc)
925 {
926 	if_t ifp = sc->hn_ifp;
927 	uint32_t filter;
928 
929 	HN_LOCK_ASSERT(sc);
930 
931 	/*
932 	 * If the non-transparent mode VF is activated, we don't know how
933 	 * its RX filter is configured, so stick the synthetic device in
934 	 * the promiscous mode.
935 	 */
936 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
937 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
938 	} else {
939 		filter = NDIS_PACKET_TYPE_DIRECTED;
940 		if (if_getflags(ifp) & IFF_BROADCAST)
941 			filter |= NDIS_PACKET_TYPE_BROADCAST;
942 		/* TODO: support multicast list */
943 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
944 		    !if_maddr_empty(ifp))
945 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
946 	}
947 	return (hn_set_rxfilter(sc, filter));
948 }
949 
950 static void
951 hn_set_txagg(struct hn_softc *sc)
952 {
953 	uint32_t size, pkts;
954 	int i;
955 
956 	/*
957 	 * Setup aggregation size.
958 	 */
959 	if (sc->hn_agg_size < 0)
960 		size = UINT32_MAX;
961 	else
962 		size = sc->hn_agg_size;
963 
964 	if (sc->hn_rndis_agg_size < size)
965 		size = sc->hn_rndis_agg_size;
966 
967 	/* NOTE: We only aggregate packets using chimney sending buffers. */
968 	if (size > (uint32_t)sc->hn_chim_szmax)
969 		size = sc->hn_chim_szmax;
970 
971 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
972 		/* Disable */
973 		size = 0;
974 		pkts = 0;
975 		goto done;
976 	}
977 
978 	/* NOTE: Type of the per TX ring setting is 'int'. */
979 	if (size > INT_MAX)
980 		size = INT_MAX;
981 
982 	/*
983 	 * Setup aggregation packet count.
984 	 */
985 	if (sc->hn_agg_pkts < 0)
986 		pkts = UINT32_MAX;
987 	else
988 		pkts = sc->hn_agg_pkts;
989 
990 	if (sc->hn_rndis_agg_pkts < pkts)
991 		pkts = sc->hn_rndis_agg_pkts;
992 
993 	if (pkts <= 1) {
994 		/* Disable */
995 		size = 0;
996 		pkts = 0;
997 		goto done;
998 	}
999 
1000 	/* NOTE: Type of the per TX ring setting is 'short'. */
1001 	if (pkts > SHRT_MAX)
1002 		pkts = SHRT_MAX;
1003 
1004 done:
1005 	/* NOTE: Type of the per TX ring setting is 'short'. */
1006 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1007 		/* Disable */
1008 		size = 0;
1009 		pkts = 0;
1010 	}
1011 
1012 	if (bootverbose) {
1013 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1014 		    size, pkts, sc->hn_rndis_agg_align);
1015 	}
1016 
1017 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1018 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1019 
1020 		mtx_lock(&txr->hn_tx_lock);
1021 		txr->hn_agg_szmax = size;
1022 		txr->hn_agg_pktmax = pkts;
1023 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1024 		mtx_unlock(&txr->hn_tx_lock);
1025 	}
1026 }
1027 
1028 static int
1029 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1030 {
1031 
1032 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1033 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1034 		return txr->hn_txdesc_cnt;
1035 	return hn_tx_swq_depth;
1036 }
1037 
1038 static int
1039 hn_rss_reconfig(struct hn_softc *sc)
1040 {
1041 	int error;
1042 
1043 	HN_LOCK_ASSERT(sc);
1044 
1045 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1046 		return (ENXIO);
1047 
1048 	/*
1049 	 * Disable RSS first.
1050 	 *
1051 	 * NOTE:
1052 	 * Direct reconfiguration by setting the UNCHG flags does
1053 	 * _not_ work properly.
1054 	 */
1055 	if (bootverbose)
1056 		if_printf(sc->hn_ifp, "disable RSS\n");
1057 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1058 	if (error) {
1059 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1060 		return (error);
1061 	}
1062 
1063 	/*
1064 	 * Reenable the RSS w/ the updated RSS key or indirect
1065 	 * table.
1066 	 */
1067 	if (bootverbose)
1068 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1069 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1070 	if (error) {
1071 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1072 		return (error);
1073 	}
1074 	return (0);
1075 }
1076 
1077 static void
1078 hn_rss_ind_fixup(struct hn_softc *sc)
1079 {
1080 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1081 	int i, nchan;
1082 
1083 	nchan = sc->hn_rx_ring_inuse;
1084 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1085 
1086 	/*
1087 	 * Check indirect table to make sure that all channels in it
1088 	 * can be used.
1089 	 */
1090 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1091 		if (rss->rss_ind[i] >= nchan) {
1092 			if_printf(sc->hn_ifp,
1093 			    "RSS indirect table %d fixup: %u -> %d\n",
1094 			    i, rss->rss_ind[i], nchan - 1);
1095 			rss->rss_ind[i] = nchan - 1;
1096 		}
1097 	}
1098 }
1099 
1100 static int
1101 hn_ifmedia_upd(if_t ifp __unused)
1102 {
1103 
1104 	return EOPNOTSUPP;
1105 }
1106 
1107 static void
1108 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1109 {
1110 	struct hn_softc *sc = if_getsoftc(ifp);
1111 
1112 	ifmr->ifm_status = IFM_AVALID;
1113 	ifmr->ifm_active = IFM_ETHER;
1114 
1115 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1116 		ifmr->ifm_active |= IFM_NONE;
1117 		return;
1118 	}
1119 	ifmr->ifm_status |= IFM_ACTIVE;
1120 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1121 }
1122 
1123 static void
1124 hn_rxvf_set_task(void *xarg, int pending __unused)
1125 {
1126 	struct hn_rxvf_setarg *arg = xarg;
1127 
1128 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1129 }
1130 
1131 static void
1132 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1133 {
1134 	struct hn_rx_ring *rxr;
1135 	struct hn_rxvf_setarg arg;
1136 	struct task task;
1137 	int i;
1138 
1139 	HN_LOCK_ASSERT(sc);
1140 
1141 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1142 
1143 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1144 		rxr = &sc->hn_rx_ring[i];
1145 
1146 		if (i < sc->hn_rx_ring_inuse) {
1147 			arg.rxr = rxr;
1148 			arg.vf_ifp = vf_ifp;
1149 			vmbus_chan_run_task(rxr->hn_chan, &task);
1150 		} else {
1151 			rxr->hn_rxvf_ifp = vf_ifp;
1152 		}
1153 	}
1154 }
1155 
1156 static bool
1157 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1158 {
1159 	if_t hn_ifp;
1160 
1161 	hn_ifp = sc->hn_ifp;
1162 
1163 	if (ifp == hn_ifp)
1164 		return (false);
1165 
1166 	if (if_getalloctype(ifp) != IFT_ETHER)
1167 		return (false);
1168 
1169 	/* Ignore lagg/vlan interfaces */
1170 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1171 	    strcmp(if_getdname(ifp), "vlan") == 0)
1172 		return (false);
1173 
1174 	/*
1175 	 * During detach events if_getifaddr(ifp) might be NULL.
1176 	 * Make sure the bcmp() below doesn't panic on that:
1177 	 */
1178 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1179 		return (false);
1180 
1181 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1182 		return (false);
1183 
1184 	return (true);
1185 }
1186 
1187 static void
1188 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1189 {
1190 	if_t hn_ifp;
1191 
1192 	HN_LOCK(sc);
1193 
1194 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1195 		goto out;
1196 
1197 	if (!hn_ismyvf(sc, ifp))
1198 		goto out;
1199 	hn_ifp = sc->hn_ifp;
1200 
1201 	if (rxvf) {
1202 		if (sc->hn_flags & HN_FLAG_RXVF)
1203 			goto out;
1204 
1205 		sc->hn_flags |= HN_FLAG_RXVF;
1206 		hn_rxfilter_config(sc);
1207 	} else {
1208 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1209 			goto out;
1210 
1211 		sc->hn_flags &= ~HN_FLAG_RXVF;
1212 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1213 			hn_rxfilter_config(sc);
1214 		else
1215 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1216 	}
1217 
1218 	hn_nvs_set_datapath(sc,
1219 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1220 
1221 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1222 
1223 	if (rxvf) {
1224 		hn_vf_rss_fixup(sc, true);
1225 		hn_suspend_mgmt(sc);
1226 		sc->hn_link_flags &=
1227 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1228 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1229 	} else {
1230 		hn_vf_rss_restore(sc);
1231 		hn_resume_mgmt(sc);
1232 	}
1233 
1234 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1235 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1236 
1237 	if (bootverbose) {
1238 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1239 		    rxvf ? "to" : "from", if_name(ifp));
1240 	}
1241 out:
1242 	HN_UNLOCK(sc);
1243 }
1244 
1245 static void
1246 hn_ifnet_event(void *arg, if_t ifp, int event)
1247 {
1248 
1249 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1250 		return;
1251 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1252 }
1253 
1254 static void
1255 hn_ifaddr_event(void *arg, if_t ifp)
1256 {
1257 
1258 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1259 }
1260 
1261 static int
1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1263 {
1264 	if_t ifp, vf_ifp;
1265 	uint64_t tmp;
1266 	int error;
1267 
1268 	HN_LOCK_ASSERT(sc);
1269 	ifp = sc->hn_ifp;
1270 	vf_ifp = sc->hn_vf_ifp;
1271 
1272 	/*
1273 	 * Fix up requested capabilities w/ supported capabilities,
1274 	 * since the supported capabilities could have been changed.
1275 	 */
1276 	ifr->ifr_reqcap &= if_getcapabilities(ifp);
1277 	/* Pass SIOCSIFCAP to VF. */
1278 	error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread);
1279 
1280 	/*
1281 	 * NOTE:
1282 	 * The error will be propagated to the callers, however, it
1283 	 * is _not_ useful here.
1284 	 */
1285 
1286 	/*
1287 	 * Merge VF's enabled capabilities.
1288 	 */
1289 	if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp));
1290 
1291 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc);
1292 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
1293 		if_sethwassistbits(ifp, tmp, 0);
1294 	else
1295 		if_sethwassistbits(ifp, 0, tmp);
1296 
1297 	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc);
1298 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
1299 		if_sethwassistbits(ifp, tmp, 0);
1300 	else
1301 		if_sethwassistbits(ifp, 0, tmp);
1302 
1303 	tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO;
1304 	if (if_getcapenable(ifp) & IFCAP_TSO4)
1305 		if_sethwassistbits(ifp, tmp, 0);
1306 	else
1307 		if_sethwassistbits(ifp, 0, tmp);
1308 
1309 	tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO;
1310 	if (if_getcapenable(ifp) & IFCAP_TSO6)
1311 		if_sethwassistbits(ifp, tmp, 0);
1312 	else
1313 		if_sethwassistbits(ifp, 0, tmp);
1314 
1315 	return (error);
1316 }
1317 
1318 static int
1319 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1320 {
1321 	if_t vf_ifp;
1322 	struct ifreq ifr;
1323 
1324 	HN_LOCK_ASSERT(sc);
1325 	vf_ifp = sc->hn_vf_ifp;
1326 
1327 	memset(&ifr, 0, sizeof(ifr));
1328 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1329 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1330 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1331 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1332 }
1333 
1334 static void
1335 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1336 {
1337 	if_t ifp = sc->hn_ifp;
1338 	int allmulti = 0;
1339 
1340 	HN_LOCK_ASSERT(sc);
1341 
1342 	/* XXX vlan(4) style mcast addr maintenance */
1343 	if (!if_maddr_empty(ifp))
1344 		allmulti = IFF_ALLMULTI;
1345 
1346 	/* Always set the VF's if_flags */
1347 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1348 }
1349 
1350 static void
1351 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1352 {
1353 	struct rm_priotracker pt;
1354 	if_t hn_ifp = NULL;
1355 	struct mbuf *mn;
1356 
1357 	/*
1358 	 * XXX racy, if hn(4) ever detached.
1359 	 */
1360 	rm_rlock(&hn_vfmap_lock, &pt);
1361 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1362 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1363 	rm_runlock(&hn_vfmap_lock, &pt);
1364 
1365 	if (hn_ifp != NULL) {
1366 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1367 			/*
1368 			 * Allow tapping on the VF.
1369 			 */
1370 			ETHER_BPF_MTAP(vf_ifp, mn);
1371 
1372 			/*
1373 			 * Update VF stats.
1374 			 */
1375 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1376 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1377 				    mn->m_pkthdr.len);
1378 			}
1379 			/*
1380 			 * XXX IFCOUNTER_IMCAST
1381 			 * This stat updating is kinda invasive, since it
1382 			 * requires two checks on the mbuf: the length check
1383 			 * and the ethernet header check.  As of this write,
1384 			 * all multicast packets go directly to hn(4), which
1385 			 * makes imcast stat updating in the VF a try in vian.
1386 			 */
1387 
1388 			/*
1389 			 * Fix up rcvif and increase hn(4)'s ipackets.
1390 			 */
1391 			mn->m_pkthdr.rcvif = hn_ifp;
1392 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1393 		}
1394 		/*
1395 		 * Go through hn(4)'s if_input.
1396 		 */
1397 		if_input(hn_ifp, m);
1398 	} else {
1399 		/*
1400 		 * In the middle of the transition; free this
1401 		 * mbuf chain.
1402 		 */
1403 		while (m != NULL) {
1404 			mn = m->m_nextpkt;
1405 			m->m_nextpkt = NULL;
1406 			m_freem(m);
1407 			m = mn;
1408 		}
1409 	}
1410 }
1411 
1412 static void
1413 hn_mtu_change_fixup(struct hn_softc *sc)
1414 {
1415 	if_t ifp;
1416 
1417 	HN_LOCK_ASSERT(sc);
1418 	ifp = sc->hn_ifp;
1419 
1420 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1421 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1422 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1423 }
1424 
1425 static uint32_t
1426 hn_rss_type_fromndis(uint32_t rss_hash)
1427 {
1428 	uint32_t types = 0;
1429 
1430 	if (rss_hash & NDIS_HASH_IPV4)
1431 		types |= RSS_TYPE_IPV4;
1432 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1433 		types |= RSS_TYPE_TCP_IPV4;
1434 	if (rss_hash & NDIS_HASH_IPV6)
1435 		types |= RSS_TYPE_IPV6;
1436 	if (rss_hash & NDIS_HASH_IPV6_EX)
1437 		types |= RSS_TYPE_IPV6_EX;
1438 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1439 		types |= RSS_TYPE_TCP_IPV6;
1440 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1441 		types |= RSS_TYPE_TCP_IPV6_EX;
1442 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1443 		types |= RSS_TYPE_UDP_IPV4;
1444 	return (types);
1445 }
1446 
1447 static uint32_t
1448 hn_rss_type_tondis(uint32_t types)
1449 {
1450 	uint32_t rss_hash = 0;
1451 
1452 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1453 	    ("UDP6 and UDP6EX are not supported"));
1454 
1455 	if (types & RSS_TYPE_IPV4)
1456 		rss_hash |= NDIS_HASH_IPV4;
1457 	if (types & RSS_TYPE_TCP_IPV4)
1458 		rss_hash |= NDIS_HASH_TCP_IPV4;
1459 	if (types & RSS_TYPE_IPV6)
1460 		rss_hash |= NDIS_HASH_IPV6;
1461 	if (types & RSS_TYPE_IPV6_EX)
1462 		rss_hash |= NDIS_HASH_IPV6_EX;
1463 	if (types & RSS_TYPE_TCP_IPV6)
1464 		rss_hash |= NDIS_HASH_TCP_IPV6;
1465 	if (types & RSS_TYPE_TCP_IPV6_EX)
1466 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1467 	if (types & RSS_TYPE_UDP_IPV4)
1468 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1469 	return (rss_hash);
1470 }
1471 
1472 static void
1473 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1474 {
1475 	int i;
1476 
1477 	HN_LOCK_ASSERT(sc);
1478 
1479 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1480 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1481 }
1482 
1483 static void
1484 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1485 {
1486 	if_t ifp, vf_ifp;
1487 	struct ifrsshash ifrh;
1488 	struct ifrsskey ifrk;
1489 	int error;
1490 	uint32_t my_types, diff_types, mbuf_types = 0;
1491 
1492 	HN_LOCK_ASSERT(sc);
1493 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1494 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1495 
1496 	if (sc->hn_rx_ring_inuse == 1) {
1497 		/* No RSS on synthetic parts; done. */
1498 		return;
1499 	}
1500 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1501 		/* Synthetic parts do not support Toeplitz; done. */
1502 		return;
1503 	}
1504 
1505 	ifp = sc->hn_ifp;
1506 	vf_ifp = sc->hn_vf_ifp;
1507 
1508 	/*
1509 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1510 	 * supported.
1511 	 */
1512 	memset(&ifrk, 0, sizeof(ifrk));
1513 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1514 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1515 	if (error) {
1516 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1517 		    if_name(vf_ifp), error);
1518 		goto done;
1519 	}
1520 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1521 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1522 		    if_name(vf_ifp), ifrk.ifrk_func);
1523 		goto done;
1524 	}
1525 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1526 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1527 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1528 		goto done;
1529 	}
1530 
1531 	/*
1532 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1533 	 */
1534 	memset(&ifrh, 0, sizeof(ifrh));
1535 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1536 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1537 	if (error) {
1538 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1539 		    if_name(vf_ifp), error);
1540 		goto done;
1541 	}
1542 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1543 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1544 		    if_name(vf_ifp), ifrh.ifrh_func);
1545 		goto done;
1546 	}
1547 
1548 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1549 	if ((ifrh.ifrh_types & my_types) == 0) {
1550 		/* This disables RSS; ignore it then */
1551 		if_printf(ifp, "%s intersection of RSS types failed.  "
1552 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1553 		    ifrh.ifrh_types, my_types);
1554 		goto done;
1555 	}
1556 
1557 	diff_types = my_types ^ ifrh.ifrh_types;
1558 	my_types &= ifrh.ifrh_types;
1559 	mbuf_types = my_types;
1560 
1561 	/*
1562 	 * Detect RSS hash value/type confliction.
1563 	 *
1564 	 * NOTE:
1565 	 * We don't disable the hash type, but stop delivery the hash
1566 	 * value/type through mbufs on RX path.
1567 	 *
1568 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1569 	 * hash is delivered with type of TCP_IPV4.  This means if
1570 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1571 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1572 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1573 	 * here.
1574 	 */
1575 	if ((my_types & RSS_TYPE_IPV4) &&
1576 	    (diff_types & ifrh.ifrh_types &
1577 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1578 		/* Conflict; disable IPV4 hash type/value delivery. */
1579 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1580 		mbuf_types &= ~RSS_TYPE_IPV4;
1581 	}
1582 	if ((my_types & RSS_TYPE_IPV6) &&
1583 	    (diff_types & ifrh.ifrh_types &
1584 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1585 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1586 	      RSS_TYPE_IPV6_EX))) {
1587 		/* Conflict; disable IPV6 hash type/value delivery. */
1588 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1589 		mbuf_types &= ~RSS_TYPE_IPV6;
1590 	}
1591 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1592 	    (diff_types & ifrh.ifrh_types &
1593 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1594 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1595 	      RSS_TYPE_IPV6))) {
1596 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1597 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1598 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1599 	}
1600 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1601 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1602 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1603 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1604 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1605 	}
1606 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1607 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1608 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1609 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1610 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1611 	}
1612 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1613 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1614 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1615 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1616 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1617 	}
1618 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1619 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1620 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1621 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1622 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1623 	}
1624 
1625 	/*
1626 	 * Indirect table does not matter.
1627 	 */
1628 
1629 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1630 	    hn_rss_type_tondis(my_types);
1631 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1632 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1633 
1634 	if (reconf) {
1635 		error = hn_rss_reconfig(sc);
1636 		if (error) {
1637 			/* XXX roll-back? */
1638 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1639 			/* XXX keep going. */
1640 		}
1641 	}
1642 done:
1643 	/* Hash deliverability for mbufs. */
1644 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1645 }
1646 
1647 static void
1648 hn_vf_rss_restore(struct hn_softc *sc)
1649 {
1650 
1651 	HN_LOCK_ASSERT(sc);
1652 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1653 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1654 
1655 	if (sc->hn_rx_ring_inuse == 1)
1656 		goto done;
1657 
1658 	/*
1659 	 * Restore hash types.  Key does _not_ matter.
1660 	 */
1661 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1662 		int error;
1663 
1664 		sc->hn_rss_hash = sc->hn_rss_hcap;
1665 		error = hn_rss_reconfig(sc);
1666 		if (error) {
1667 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1668 			    error);
1669 			/* XXX keep going. */
1670 		}
1671 	}
1672 done:
1673 	/* Hash deliverability for mbufs. */
1674 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1675 }
1676 
1677 static void
1678 hn_xpnt_vf_setready(struct hn_softc *sc)
1679 {
1680 	if_t ifp, vf_ifp;
1681 	struct ifreq ifr;
1682 
1683 	HN_LOCK_ASSERT(sc);
1684 	ifp = sc->hn_ifp;
1685 	vf_ifp = sc->hn_vf_ifp;
1686 
1687 	/*
1688 	 * Mark the VF ready.
1689 	 */
1690 	sc->hn_vf_rdytick = 0;
1691 
1692 	/*
1693 	 * Save information for restoration.
1694 	 */
1695 	sc->hn_saved_caps = if_getcapabilities(ifp);
1696 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1697 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1698 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1699 
1700 	/*
1701 	 * Intersect supported/enabled capabilities.
1702 	 *
1703 	 * NOTE:
1704 	 * if_hwassist is not changed here.
1705 	 */
1706 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1707 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1708 
1709 	/*
1710 	 * Fix TSO settings.
1711 	 */
1712 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1713 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1714 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1715 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1716 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1717 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1718 
1719 	/*
1720 	 * Change VF's enabled capabilities.
1721 	 */
1722 	memset(&ifr, 0, sizeof(ifr));
1723 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1724 	ifr.ifr_reqcap = if_getcapenable(ifp);
1725 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1726 
1727 	if (if_getmtu(ifp) != ETHERMTU) {
1728 		int error;
1729 
1730 		/*
1731 		 * Change VF's MTU.
1732 		 */
1733 		memset(&ifr, 0, sizeof(ifr));
1734 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1735 		ifr.ifr_mtu = if_getmtu(ifp);
1736 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1737 		if (error) {
1738 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1739 			    if_name(vf_ifp), if_getmtu(ifp));
1740 			if (if_getmtu(ifp) > ETHERMTU) {
1741 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1742 
1743 				/*
1744 				 * XXX
1745 				 * No need to adjust the synthetic parts' MTU;
1746 				 * failure of the adjustment will cause us
1747 				 * infinite headache.
1748 				 */
1749 				if_setmtu(ifp, ETHERMTU);
1750 				hn_mtu_change_fixup(sc);
1751 			}
1752 		}
1753 	}
1754 }
1755 
1756 static bool
1757 hn_xpnt_vf_isready(struct hn_softc *sc)
1758 {
1759 
1760 	HN_LOCK_ASSERT(sc);
1761 
1762 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1763 		return (false);
1764 
1765 	if (sc->hn_vf_rdytick == 0)
1766 		return (true);
1767 
1768 	if (sc->hn_vf_rdytick > ticks)
1769 		return (false);
1770 
1771 	/* Mark VF as ready. */
1772 	hn_xpnt_vf_setready(sc);
1773 	return (true);
1774 }
1775 
1776 static void
1777 hn_xpnt_vf_setenable(struct hn_softc *sc)
1778 {
1779 	int i;
1780 
1781 	HN_LOCK_ASSERT(sc);
1782 
1783 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1784 	rm_wlock(&sc->hn_vf_lock);
1785 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1786 	rm_wunlock(&sc->hn_vf_lock);
1787 
1788 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1789 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1790 }
1791 
1792 static void
1793 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1794 {
1795 	int i;
1796 
1797 	HN_LOCK_ASSERT(sc);
1798 
1799 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1800 	rm_wlock(&sc->hn_vf_lock);
1801 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1802 	if (clear_vf)
1803 		sc->hn_vf_ifp = NULL;
1804 	rm_wunlock(&sc->hn_vf_lock);
1805 
1806 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1807 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1808 }
1809 
1810 static void
1811 hn_xpnt_vf_init(struct hn_softc *sc)
1812 {
1813 	int error;
1814 
1815 	HN_LOCK_ASSERT(sc);
1816 
1817 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1818 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1819 
1820 	if (bootverbose) {
1821 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1822 		    if_name(sc->hn_vf_ifp));
1823 	}
1824 
1825 	/*
1826 	 * Bring the VF up.
1827 	 */
1828 	hn_xpnt_vf_saveifflags(sc);
1829 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1830 	error = hn_xpnt_vf_iocsetflags(sc);
1831 	if (error) {
1832 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1833 		    if_name(sc->hn_vf_ifp), error);
1834 		return;
1835 	}
1836 
1837 	/*
1838 	 * NOTE:
1839 	 * Datapath setting must happen _after_ bringing the VF up.
1840 	 */
1841 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1842 
1843 	/*
1844 	 * NOTE:
1845 	 * Fixup RSS related bits _after_ the VF is brought up, since
1846 	 * many VFs generate RSS key during it's initialization.
1847 	 */
1848 	hn_vf_rss_fixup(sc, true);
1849 
1850 	/* Mark transparent mode VF as enabled. */
1851 	hn_xpnt_vf_setenable(sc);
1852 }
1853 
1854 static void
1855 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1856 {
1857 	struct hn_softc *sc = xsc;
1858 
1859 	HN_LOCK(sc);
1860 
1861 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1862 		goto done;
1863 	if (sc->hn_vf_ifp == NULL)
1864 		goto done;
1865 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1866 		goto done;
1867 
1868 	if (sc->hn_vf_rdytick != 0) {
1869 		/* Mark VF as ready. */
1870 		hn_xpnt_vf_setready(sc);
1871 	}
1872 
1873 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1874 		/*
1875 		 * Delayed VF initialization.
1876 		 */
1877 		if (bootverbose) {
1878 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1879 			    if_name(sc->hn_vf_ifp));
1880 		}
1881 		hn_xpnt_vf_init(sc);
1882 	}
1883 done:
1884 	HN_UNLOCK(sc);
1885 }
1886 
1887 static void
1888 hn_ifnet_attevent(void *xsc, if_t ifp)
1889 {
1890 	struct hn_softc *sc = xsc;
1891 
1892 	HN_LOCK(sc);
1893 
1894 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1895 		goto done;
1896 
1897 	if (!hn_ismyvf(sc, ifp))
1898 		goto done;
1899 
1900 	if (sc->hn_vf_ifp != NULL) {
1901 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1902 		    if_name(sc->hn_vf_ifp));
1903 		goto done;
1904 	}
1905 
1906 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1907 		/*
1908 		 * ifnet.if_start is _not_ supported by transparent
1909 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1910 		 */
1911 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1912 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1913 
1914 		goto done;
1915 	}
1916 
1917 	rm_wlock(&hn_vfmap_lock);
1918 
1919 	if (if_getindex(ifp) >= hn_vfmap_size) {
1920 		if_t *newmap;
1921 		int newsize;
1922 
1923 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1924 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1925 		    M_WAITOK | M_ZERO);
1926 
1927 		memcpy(newmap, hn_vfmap,
1928 		    sizeof(if_t) * hn_vfmap_size);
1929 		free(hn_vfmap, M_DEVBUF);
1930 		hn_vfmap = newmap;
1931 		hn_vfmap_size = newsize;
1932 	}
1933 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1934 	    ("%s: ifindex %d was mapped to %s",
1935 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1936 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1937 
1938 	rm_wunlock(&hn_vfmap_lock);
1939 
1940 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1941 	rm_wlock(&sc->hn_vf_lock);
1942 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1943 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1944 	sc->hn_vf_ifp = ifp;
1945 	rm_wunlock(&sc->hn_vf_lock);
1946 
1947 	if (hn_xpnt_vf) {
1948 		int wait_ticks;
1949 
1950 		/*
1951 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1952 		 * Save vf_ifp's current if_input for later restoration.
1953 		 */
1954 		sc->hn_vf_input = if_getinputfn(ifp);
1955 		if_setinputfn(ifp, hn_xpnt_vf_input);
1956 
1957 		/*
1958 		 * Stop link status management; use the VF's.
1959 		 */
1960 		hn_suspend_mgmt(sc);
1961 
1962 		/*
1963 		 * Give VF sometime to complete its attach routing.
1964 		 */
1965 		wait_ticks = hn_xpnt_vf_attwait * hz;
1966 		sc->hn_vf_rdytick = ticks + wait_ticks;
1967 
1968 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1969 		    wait_ticks);
1970 	}
1971 done:
1972 	HN_UNLOCK(sc);
1973 }
1974 
1975 static void
1976 hn_ifnet_detevent(void *xsc, if_t ifp)
1977 {
1978 	struct hn_softc *sc = xsc;
1979 
1980 	HN_LOCK(sc);
1981 
1982 	if (sc->hn_vf_ifp == NULL)
1983 		goto done;
1984 
1985 	if (!hn_ismyvf(sc, ifp))
1986 		goto done;
1987 
1988 	if (hn_xpnt_vf) {
1989 		/*
1990 		 * Make sure that the delayed initialization is not running.
1991 		 *
1992 		 * NOTE:
1993 		 * - This lock _must_ be released, since the hn_vf_init task
1994 		 *   will try holding this lock.
1995 		 * - It is safe to release this lock here, since the
1996 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1997 		 *
1998 		 * XXX racy, if hn(4) ever detached.
1999 		 */
2000 		HN_UNLOCK(sc);
2001 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2002 		HN_LOCK(sc);
2003 
2004 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2005 		    if_name(sc->hn_ifp)));
2006 		if_setinputfn(ifp, sc->hn_vf_input);
2007 		sc->hn_vf_input = NULL;
2008 
2009 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2010 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2011 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2012 
2013 		if (sc->hn_vf_rdytick == 0) {
2014 			/*
2015 			 * The VF was ready; restore some settings.
2016 			 */
2017 			if_setcapabilities(ifp, sc->hn_saved_caps);
2018 			/*
2019 			 * NOTE:
2020 			 * There is _no_ need to fixup if_capenable and
2021 			 * if_hwassist, since the if_capabilities before
2022 			 * restoration was an intersection of the VF's
2023 			 * if_capabilites and the synthetic device's
2024 			 * if_capabilites.
2025 			 */
2026 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
2027 			if_sethwtsomaxsegcount(sc->hn_ifp,
2028 			    sc->hn_saved_tsosegcnt);
2029 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
2030 		}
2031 
2032 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2033 			/*
2034 			 * Restore RSS settings.
2035 			 */
2036 			hn_vf_rss_restore(sc);
2037 
2038 			/*
2039 			 * Resume link status management, which was suspended
2040 			 * by hn_ifnet_attevent().
2041 			 */
2042 			hn_resume_mgmt(sc);
2043 		}
2044 	}
2045 
2046 	/* Mark transparent mode VF as disabled. */
2047 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2048 
2049 	rm_wlock(&hn_vfmap_lock);
2050 
2051 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2052 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2053 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2054 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2055 		    ("%s: ifindex %d was mapped to %s",
2056 		     if_name(ifp), if_getindex(ifp),
2057 		     if_name(hn_vfmap[if_getindex(ifp)])));
2058 		hn_vfmap[if_getindex(ifp)] = NULL;
2059 	}
2060 
2061 	rm_wunlock(&hn_vfmap_lock);
2062 done:
2063 	HN_UNLOCK(sc);
2064 }
2065 
2066 static void
2067 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2068 {
2069 	struct hn_softc *sc = xsc;
2070 
2071 	if (sc->hn_vf_ifp == ifp)
2072 		if_link_state_change(sc->hn_ifp, link_state);
2073 }
2074 
2075 static int
2076 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2077 {
2078 	struct hn_softc *sc = arg1;
2079 	unsigned int tsomax;
2080 	int error;
2081 
2082 	tsomax = if_gethwtsomax(sc->hn_ifp);
2083 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2084 	return error;
2085 }
2086 
2087 static int
2088 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2089 {
2090 	struct hn_softc *sc = arg1;
2091 	unsigned int tsomaxsegcnt;
2092 	int error;
2093 
2094 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2095 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2096 	return error;
2097 }
2098 
2099 static int
2100 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2101 {
2102 	struct hn_softc *sc = arg1;
2103 	unsigned int tsomaxsegsz;
2104 	int error;
2105 
2106 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2107 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2108 	return error;
2109 }
2110 
2111 static int
2112 hn_probe(device_t dev)
2113 {
2114 
2115 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2116 		device_set_desc(dev, "Hyper-V Network Interface");
2117 		return BUS_PROBE_DEFAULT;
2118 	}
2119 	return ENXIO;
2120 }
2121 
2122 static int
2123 hn_attach(device_t dev)
2124 {
2125 	struct hn_softc *sc = device_get_softc(dev);
2126 	struct sysctl_oid_list *child;
2127 	struct sysctl_ctx_list *ctx;
2128 	uint8_t eaddr[ETHER_ADDR_LEN];
2129 	if_t ifp = NULL;
2130 	int error, ring_cnt, tx_ring_cnt;
2131 	uint32_t mtu;
2132 
2133 	sc->hn_dev = dev;
2134 	sc->hn_prichan = vmbus_get_channel(dev);
2135 	HN_LOCK_INIT(sc);
2136 	rm_init(&sc->hn_vf_lock, "hnvf");
2137 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2138 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2139 
2140 	/*
2141 	 * Initialize these tunables once.
2142 	 */
2143 	sc->hn_agg_size = hn_tx_agg_size;
2144 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2145 
2146 	/*
2147 	 * Setup taskqueue for transmission.
2148 	 */
2149 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2150 		int i;
2151 
2152 		sc->hn_tx_taskqs =
2153 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2154 		    M_DEVBUF, M_WAITOK);
2155 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2156 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2157 			    M_WAITOK, taskqueue_thread_enqueue,
2158 			    &sc->hn_tx_taskqs[i]);
2159 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2160 			    "%s tx%d", device_get_nameunit(dev), i);
2161 		}
2162 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2163 		sc->hn_tx_taskqs = hn_tx_taskque;
2164 	}
2165 
2166 	/*
2167 	 * Setup taskqueue for mangement tasks, e.g. link status.
2168 	 */
2169 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2170 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2171 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2172 	    device_get_nameunit(dev));
2173 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2174 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2175 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2176 	    hn_netchg_status_taskfunc, sc);
2177 
2178 	if (hn_xpnt_vf) {
2179 		/*
2180 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2181 		 */
2182 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2183 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2184 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2185 		    device_get_nameunit(dev));
2186 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2187 		    hn_xpnt_vf_init_taskfunc, sc);
2188 	}
2189 
2190 	/*
2191 	 * Allocate ifnet and setup its name earlier, so that if_printf
2192 	 * can be used by functions, which will be called after
2193 	 * ether_ifattach().
2194 	 */
2195 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2196 	if_setsoftc(ifp, sc);
2197 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2198 
2199 	/*
2200 	 * Initialize ifmedia earlier so that it can be unconditionally
2201 	 * destroyed, if error happened later on.
2202 	 */
2203 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2204 
2205 	/*
2206 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2207 	 * to use (tx_ring_cnt).
2208 	 *
2209 	 * NOTE:
2210 	 * The # of RX rings to use is same as the # of channels to use.
2211 	 */
2212 	ring_cnt = hn_chan_cnt;
2213 	if (ring_cnt <= 0) {
2214 		/* Default */
2215 		ring_cnt = mp_ncpus;
2216 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2217 			ring_cnt = HN_RING_CNT_DEF_MAX;
2218 	} else if (ring_cnt > mp_ncpus) {
2219 		ring_cnt = mp_ncpus;
2220 	}
2221 #ifdef RSS
2222 	if (ring_cnt > rss_getnumbuckets())
2223 		ring_cnt = rss_getnumbuckets();
2224 #endif
2225 
2226 	tx_ring_cnt = hn_tx_ring_cnt;
2227 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2228 		tx_ring_cnt = ring_cnt;
2229 #ifdef HN_IFSTART_SUPPORT
2230 	if (hn_use_if_start) {
2231 		/* ifnet.if_start only needs one TX ring. */
2232 		tx_ring_cnt = 1;
2233 	}
2234 #endif
2235 
2236 	/*
2237 	 * Set the leader CPU for channels.
2238 	 */
2239 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2240 
2241 	/*
2242 	 * Create enough TX/RX rings, even if only limited number of
2243 	 * channels can be allocated.
2244 	 */
2245 	error = hn_create_tx_data(sc, tx_ring_cnt);
2246 	if (error)
2247 		goto failed;
2248 	error = hn_create_rx_data(sc, ring_cnt);
2249 	if (error)
2250 		goto failed;
2251 
2252 	/*
2253 	 * Create transaction context for NVS and RNDIS transactions.
2254 	 */
2255 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2256 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2257 	if (sc->hn_xact == NULL) {
2258 		error = ENXIO;
2259 		goto failed;
2260 	}
2261 
2262 	/*
2263 	 * Install orphan handler for the revocation of this device's
2264 	 * primary channel.
2265 	 *
2266 	 * NOTE:
2267 	 * The processing order is critical here:
2268 	 * Install the orphan handler, _before_ testing whether this
2269 	 * device's primary channel has been revoked or not.
2270 	 */
2271 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2272 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2273 		error = ENXIO;
2274 		goto failed;
2275 	}
2276 
2277 	/*
2278 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2279 	 */
2280 	error = hn_synth_attach(sc, ETHERMTU);
2281 	if (error)
2282 		goto failed;
2283 
2284 	error = hn_rndis_get_eaddr(sc, eaddr);
2285 	if (error)
2286 		goto failed;
2287 
2288 	error = hn_rndis_get_mtu(sc, &mtu);
2289 	if (error)
2290 		mtu = ETHERMTU;
2291 	else if (bootverbose)
2292 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2293 
2294 	if (sc->hn_rx_ring_inuse > 1) {
2295 		/*
2296 		 * Reduce TCP segment aggregation limit for multiple
2297 		 * RX rings to increase ACK timeliness.
2298 		 */
2299 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2300 	}
2301 
2302 	/*
2303 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2304 	 */
2305 	hn_fixup_tx_data(sc);
2306 	hn_fixup_rx_data(sc);
2307 
2308 	ctx = device_get_sysctl_ctx(dev);
2309 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2310 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2311 	    &sc->hn_nvs_ver, 0, "NVS version");
2312 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2313 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2314 	    hn_ndis_version_sysctl, "A", "NDIS version");
2315 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2316 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2317 	    hn_caps_sysctl, "A", "capabilities");
2318 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2319 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2320 	    hn_hwassist_sysctl, "A", "hwassist");
2321 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2322 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2323 	    "IU", "max TSO size");
2324 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2325 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2326 	    "IU", "max # of TSO segments");
2327 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2328 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2329 	    "IU", "max size of TSO segment");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2331 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_rxfilter_sysctl, "A", "rxfilter");
2333 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2334 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2335 	    hn_rss_hash_sysctl, "A", "RSS hash");
2336 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2337 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2338 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2339 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2340 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2341 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2342 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2343 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2344 #ifndef RSS
2345 	/*
2346 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2347 	 */
2348 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2349 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2350 	    hn_rss_key_sysctl, "IU", "RSS key");
2351 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2352 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2354 #endif
2355 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2356 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2357 	    "RNDIS offered packet transmission aggregation size limit");
2358 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2359 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2360 	    "RNDIS offered packet transmission aggregation count limit");
2361 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2362 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2363 	    "RNDIS packet transmission aggregation alignment");
2364 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2365 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2366 	    hn_txagg_size_sysctl, "I",
2367 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2368 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2369 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2370 	    hn_txagg_pkts_sysctl, "I",
2371 	    "Packet transmission aggregation packets, "
2372 	    "0 -- disable, -1 -- auto");
2373 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2374 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2375 	    hn_polling_sysctl, "I",
2376 	    "Polling frequency: [100,1000000], 0 disable polling");
2377 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2378 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2379 	    hn_vf_sysctl, "A", "Virtual Function's name");
2380 	if (!hn_xpnt_vf) {
2381 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2382 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2383 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2384 	} else {
2385 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2386 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2387 		    hn_xpnt_vf_enabled_sysctl, "I",
2388 		    "Transparent VF enabled");
2389 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2390 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2391 		    hn_xpnt_vf_accbpf_sysctl, "I",
2392 		    "Accurate BPF for transparent VF");
2393 	}
2394 
2395 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2396 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2397 	    "switch to rsc");
2398 
2399 	/*
2400 	 * Setup the ifmedia, which has been initialized earlier.
2401 	 */
2402 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2403 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2404 	/* XXX ifmedia_set really should do this for us */
2405 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2406 
2407 	/*
2408 	 * Setup the ifnet for this interface.
2409 	 */
2410 
2411 	if_setbaudrate(ifp, IF_Gbps(10));
2412 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2413 	if_setioctlfn(ifp, hn_ioctl);
2414 	if_setinitfn(ifp, hn_init);
2415 #ifdef HN_IFSTART_SUPPORT
2416 	if (hn_use_if_start) {
2417 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2418 
2419 		if_setstartfn(ifp, hn_start);
2420 		if_setsendqlen(ifp, qdepth);
2421 		if_setsendqready(ifp);
2422 	} else
2423 #endif
2424 	{
2425 		if_settransmitfn(ifp, hn_transmit);
2426 		if_setqflushfn(ifp, hn_xmit_qflush);
2427 	}
2428 
2429 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2430 #ifdef foo
2431 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2432 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2433 #endif
2434 	if (sc->hn_caps & HN_CAP_VLAN) {
2435 		/* XXX not sure about VLAN_MTU. */
2436 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2437 	}
2438 
2439 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2440 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2441 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2442 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2443 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2444 	if (sc->hn_caps & HN_CAP_TSO4) {
2445 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2446 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2447 	}
2448 	if (sc->hn_caps & HN_CAP_TSO6) {
2449 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2450 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2451 	}
2452 
2453 	/* Enable all available capabilities by default. */
2454 	if_setcapenable(ifp, if_getcapabilities(ifp));
2455 
2456 	/*
2457 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2458 	 * be enabled through SIOCSIFCAP.
2459 	 */
2460 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2461 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2462 
2463 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2464 		/*
2465 		 * Lock hn_set_tso_maxsize() to simplify its
2466 		 * internal logic.
2467 		 */
2468 		HN_LOCK(sc);
2469 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2470 		HN_UNLOCK(sc);
2471 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2472 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2473 	}
2474 
2475 	ether_ifattach(ifp, eaddr);
2476 
2477 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2478 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2479 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2480 	}
2481 	if (mtu < ETHERMTU) {
2482 
2483 		if_setmtu(ifp, mtu);
2484 	}
2485 
2486 	/* Inform the upper layer about the long frame support. */
2487 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2488 
2489 	/*
2490 	 * Kick off link status check.
2491 	 */
2492 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2493 	hn_update_link_status(sc);
2494 
2495 	if (!hn_xpnt_vf) {
2496 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2497 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2498 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2499 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2500 	} else {
2501 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2502 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2503 	}
2504 
2505 	/*
2506 	 * NOTE:
2507 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2508 	 * since interface's LLADDR is needed; interface LLADDR is not
2509 	 * available when ifnet_arrival event is triggered.
2510 	 */
2511 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2512 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2513 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2514 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2515 
2516 	return (0);
2517 failed:
2518 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2519 		hn_synth_detach(sc);
2520 	hn_detach(dev);
2521 	return (error);
2522 }
2523 
2524 static int
2525 hn_detach(device_t dev)
2526 {
2527 	struct hn_softc *sc = device_get_softc(dev);
2528 	if_t ifp = sc->hn_ifp, vf_ifp;
2529 
2530 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2531 		/*
2532 		 * In case that the vmbus missed the orphan handler
2533 		 * installation.
2534 		 */
2535 		vmbus_xact_ctx_orphan(sc->hn_xact);
2536 	}
2537 
2538 	if (sc->hn_ifaddr_evthand != NULL)
2539 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2540 	if (sc->hn_ifnet_evthand != NULL)
2541 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2542 	if (sc->hn_ifnet_atthand != NULL) {
2543 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2544 		    sc->hn_ifnet_atthand);
2545 	}
2546 	if (sc->hn_ifnet_dethand != NULL) {
2547 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2548 		    sc->hn_ifnet_dethand);
2549 	}
2550 	if (sc->hn_ifnet_lnkhand != NULL)
2551 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2552 
2553 	vf_ifp = sc->hn_vf_ifp;
2554 	__compiler_membar();
2555 	if (vf_ifp != NULL)
2556 		hn_ifnet_detevent(sc, vf_ifp);
2557 
2558 	if (device_is_attached(dev)) {
2559 		HN_LOCK(sc);
2560 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2561 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2562 				hn_stop(sc, true);
2563 			/*
2564 			 * NOTE:
2565 			 * hn_stop() only suspends data, so managment
2566 			 * stuffs have to be suspended manually here.
2567 			 */
2568 			hn_suspend_mgmt(sc);
2569 			hn_synth_detach(sc);
2570 		}
2571 		HN_UNLOCK(sc);
2572 		ether_ifdetach(ifp);
2573 	}
2574 
2575 	ifmedia_removeall(&sc->hn_media);
2576 	hn_destroy_rx_data(sc);
2577 	hn_destroy_tx_data(sc);
2578 
2579 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2580 		int i;
2581 
2582 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2583 			taskqueue_free(sc->hn_tx_taskqs[i]);
2584 		free(sc->hn_tx_taskqs, M_DEVBUF);
2585 	}
2586 	taskqueue_free(sc->hn_mgmt_taskq0);
2587 	if (sc->hn_vf_taskq != NULL)
2588 		taskqueue_free(sc->hn_vf_taskq);
2589 
2590 	if (sc->hn_xact != NULL) {
2591 		/*
2592 		 * Uninstall the orphan handler _before_ the xact is
2593 		 * destructed.
2594 		 */
2595 		vmbus_chan_unset_orphan(sc->hn_prichan);
2596 		vmbus_xact_ctx_destroy(sc->hn_xact);
2597 	}
2598 
2599 	if_free(ifp);
2600 
2601 	HN_LOCK_DESTROY(sc);
2602 	rm_destroy(&sc->hn_vf_lock);
2603 	return (0);
2604 }
2605 
2606 static int
2607 hn_shutdown(device_t dev)
2608 {
2609 
2610 	return (0);
2611 }
2612 
2613 static void
2614 hn_link_status(struct hn_softc *sc)
2615 {
2616 	uint32_t link_status;
2617 	int error;
2618 
2619 	error = hn_rndis_get_linkstatus(sc, &link_status);
2620 	if (error) {
2621 		/* XXX what to do? */
2622 		return;
2623 	}
2624 
2625 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2626 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2627 	else
2628 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2629 	if_link_state_change(sc->hn_ifp,
2630 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2631 	    LINK_STATE_UP : LINK_STATE_DOWN);
2632 }
2633 
2634 static void
2635 hn_link_taskfunc(void *xsc, int pending __unused)
2636 {
2637 	struct hn_softc *sc = xsc;
2638 
2639 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2640 		return;
2641 	hn_link_status(sc);
2642 }
2643 
2644 static void
2645 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2646 {
2647 	struct hn_softc *sc = xsc;
2648 
2649 	/* Prevent any link status checks from running. */
2650 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2651 
2652 	/*
2653 	 * Fake up a [link down --> link up] state change; 5 seconds
2654 	 * delay is used, which closely simulates miibus reaction
2655 	 * upon link down event.
2656 	 */
2657 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2658 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2659 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2660 	    &sc->hn_netchg_status, 5 * hz);
2661 }
2662 
2663 static void
2664 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2665 {
2666 	struct hn_softc *sc = xsc;
2667 
2668 	/* Re-allow link status checks. */
2669 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2670 	hn_link_status(sc);
2671 }
2672 
2673 static void
2674 hn_update_link_status(struct hn_softc *sc)
2675 {
2676 
2677 	if (sc->hn_mgmt_taskq != NULL)
2678 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2679 }
2680 
2681 static void
2682 hn_change_network(struct hn_softc *sc)
2683 {
2684 
2685 	if (sc->hn_mgmt_taskq != NULL)
2686 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2687 }
2688 
2689 static __inline int
2690 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2691     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2692 {
2693 	struct mbuf *m = *m_head;
2694 	int error;
2695 
2696 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2697 
2698 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2699 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2700 	if (error == EFBIG) {
2701 		struct mbuf *m_new;
2702 
2703 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2704 		if (m_new == NULL)
2705 			return ENOBUFS;
2706 		else
2707 			*m_head = m = m_new;
2708 		txr->hn_tx_collapsed++;
2709 
2710 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2711 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2712 	}
2713 	if (!error) {
2714 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2715 		    BUS_DMASYNC_PREWRITE);
2716 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2717 	}
2718 	return error;
2719 }
2720 
2721 static __inline int
2722 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2723 {
2724 
2725 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2726 	    ("put an onlist txd %#x", txd->flags));
2727 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2728 	    ("put an onagg txd %#x", txd->flags));
2729 
2730 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2731 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2732 		return 0;
2733 
2734 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2735 		struct hn_txdesc *tmp_txd;
2736 
2737 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2738 			int freed __diagused;
2739 
2740 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2741 			    ("resursive aggregation on aggregated txdesc"));
2742 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2743 			    ("not aggregated txdesc"));
2744 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2745 			    ("aggregated txdesc uses dmamap"));
2746 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2747 			    ("aggregated txdesc consumes "
2748 			     "chimney sending buffer"));
2749 			KASSERT(tmp_txd->chim_size == 0,
2750 			    ("aggregated txdesc has non-zero "
2751 			     "chimney sending size"));
2752 
2753 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2754 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2755 			freed = hn_txdesc_put(txr, tmp_txd);
2756 			KASSERT(freed, ("failed to free aggregated txdesc"));
2757 		}
2758 	}
2759 
2760 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2761 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2762 		    ("chim txd uses dmamap"));
2763 		hn_chim_free(txr->hn_sc, txd->chim_index);
2764 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2765 		txd->chim_size = 0;
2766 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2767 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2768 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2769 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2770 		    txd->data_dmap);
2771 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2772 	}
2773 
2774 	if (txd->m != NULL) {
2775 		m_freem(txd->m);
2776 		txd->m = NULL;
2777 	}
2778 
2779 	txd->flags |= HN_TXD_FLAG_ONLIST;
2780 #ifndef HN_USE_TXDESC_BUFRING
2781 	mtx_lock_spin(&txr->hn_txlist_spin);
2782 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2783 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2784 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2785 	txr->hn_txdesc_avail++;
2786 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2787 	mtx_unlock_spin(&txr->hn_txlist_spin);
2788 #else	/* HN_USE_TXDESC_BUFRING */
2789 #ifdef HN_DEBUG
2790 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2791 #endif
2792 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2793 #endif	/* !HN_USE_TXDESC_BUFRING */
2794 
2795 	return 1;
2796 }
2797 
2798 static __inline struct hn_txdesc *
2799 hn_txdesc_get(struct hn_tx_ring *txr)
2800 {
2801 	struct hn_txdesc *txd;
2802 
2803 #ifndef HN_USE_TXDESC_BUFRING
2804 	mtx_lock_spin(&txr->hn_txlist_spin);
2805 	txd = SLIST_FIRST(&txr->hn_txlist);
2806 	if (txd != NULL) {
2807 		KASSERT(txr->hn_txdesc_avail > 0,
2808 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2809 		txr->hn_txdesc_avail--;
2810 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2811 	}
2812 	mtx_unlock_spin(&txr->hn_txlist_spin);
2813 #else
2814 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2815 #endif
2816 
2817 	if (txd != NULL) {
2818 #ifdef HN_USE_TXDESC_BUFRING
2819 #ifdef HN_DEBUG
2820 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2821 #endif
2822 #endif	/* HN_USE_TXDESC_BUFRING */
2823 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2824 		    STAILQ_EMPTY(&txd->agg_list) &&
2825 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2826 		    txd->chim_size == 0 &&
2827 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2828 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2829 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2830 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2831 		txd->refs = 1;
2832 	}
2833 	return txd;
2834 }
2835 
2836 static __inline void
2837 hn_txdesc_hold(struct hn_txdesc *txd)
2838 {
2839 
2840 	/* 0->1 transition will never work */
2841 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2842 	atomic_add_int(&txd->refs, 1);
2843 }
2844 
2845 static __inline void
2846 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2847 {
2848 
2849 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2850 	    ("recursive aggregation on aggregating txdesc"));
2851 
2852 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2853 	    ("already aggregated"));
2854 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2855 	    ("recursive aggregation on to-be-aggregated txdesc"));
2856 
2857 	txd->flags |= HN_TXD_FLAG_ONAGG;
2858 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2859 }
2860 
2861 static bool
2862 hn_tx_ring_pending(struct hn_tx_ring *txr)
2863 {
2864 	bool pending = false;
2865 
2866 #ifndef HN_USE_TXDESC_BUFRING
2867 	mtx_lock_spin(&txr->hn_txlist_spin);
2868 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2869 		pending = true;
2870 	mtx_unlock_spin(&txr->hn_txlist_spin);
2871 #else
2872 	if (!buf_ring_full(txr->hn_txdesc_br))
2873 		pending = true;
2874 #endif
2875 	return (pending);
2876 }
2877 
2878 static __inline void
2879 hn_txeof(struct hn_tx_ring *txr)
2880 {
2881 	txr->hn_has_txeof = 0;
2882 	txr->hn_txeof(txr);
2883 }
2884 
2885 static void
2886 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2887     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2888 {
2889 	struct hn_txdesc *txd = sndc->hn_cbarg;
2890 	struct hn_tx_ring *txr;
2891 
2892 	txr = txd->txr;
2893 	KASSERT(txr->hn_chan == chan,
2894 	    ("channel mismatch, on chan%u, should be chan%u",
2895 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2896 
2897 	txr->hn_has_txeof = 1;
2898 	hn_txdesc_put(txr, txd);
2899 
2900 	++txr->hn_txdone_cnt;
2901 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2902 		txr->hn_txdone_cnt = 0;
2903 		if (txr->hn_oactive)
2904 			hn_txeof(txr);
2905 	}
2906 }
2907 
2908 static void
2909 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2910 {
2911 #if defined(INET) || defined(INET6)
2912 	struct epoch_tracker et;
2913 
2914 	NET_EPOCH_ENTER(et);
2915 	tcp_lro_flush_all(&rxr->hn_lro);
2916 	NET_EPOCH_EXIT(et);
2917 #endif
2918 
2919 	/*
2920 	 * NOTE:
2921 	 * 'txr' could be NULL, if multiple channels and
2922 	 * ifnet.if_start method are enabled.
2923 	 */
2924 	if (txr == NULL || !txr->hn_has_txeof)
2925 		return;
2926 
2927 	txr->hn_txdone_cnt = 0;
2928 	hn_txeof(txr);
2929 }
2930 
2931 static __inline uint32_t
2932 hn_rndis_pktmsg_offset(uint32_t ofs)
2933 {
2934 
2935 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2936 	    ("invalid RNDIS packet msg offset %u", ofs));
2937 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2938 }
2939 
2940 static __inline void *
2941 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2942     size_t pi_dlen, uint32_t pi_type)
2943 {
2944 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2945 	struct rndis_pktinfo *pi;
2946 
2947 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2948 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2949 
2950 	/*
2951 	 * Per-packet-info does not move; it only grows.
2952 	 *
2953 	 * NOTE:
2954 	 * rm_pktinfooffset in this phase counts from the beginning
2955 	 * of rndis_packet_msg.
2956 	 */
2957 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2958 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2959 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2960 	    pkt->rm_pktinfolen);
2961 	pkt->rm_pktinfolen += pi_size;
2962 
2963 	pi->rm_size = pi_size;
2964 	pi->rm_type = pi_type;
2965 	pi->rm_internal = 0;
2966 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2967 
2968 	return (pi->rm_data);
2969 }
2970 
2971 static __inline int
2972 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2973 {
2974 	struct hn_txdesc *txd;
2975 	struct mbuf *m;
2976 	int error, pkts;
2977 
2978 	txd = txr->hn_agg_txd;
2979 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2980 
2981 	/*
2982 	 * Since hn_txpkt() will reset this temporary stat, save
2983 	 * it now, so that oerrors can be updated properly, if
2984 	 * hn_txpkt() ever fails.
2985 	 */
2986 	pkts = txr->hn_stat_pkts;
2987 
2988 	/*
2989 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2990 	 * failure, save it for later freeing, if hn_txpkt() ever
2991 	 * fails.
2992 	 */
2993 	m = txd->m;
2994 	error = hn_txpkt(ifp, txr, txd);
2995 	if (__predict_false(error)) {
2996 		/* txd is freed, but m is not. */
2997 		m_freem(m);
2998 
2999 		txr->hn_flush_failed++;
3000 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
3001 	}
3002 
3003 	/* Reset all aggregation states. */
3004 	txr->hn_agg_txd = NULL;
3005 	txr->hn_agg_szleft = 0;
3006 	txr->hn_agg_pktleft = 0;
3007 	txr->hn_agg_prevpkt = NULL;
3008 
3009 	return (error);
3010 }
3011 
3012 static void *
3013 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3014     int pktsize)
3015 {
3016 	void *chim;
3017 
3018 	if (txr->hn_agg_txd != NULL) {
3019 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
3020 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
3021 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
3022 			int olen;
3023 
3024 			/*
3025 			 * Update the previous RNDIS packet's total length,
3026 			 * it can be increased due to the mandatory alignment
3027 			 * padding for this RNDIS packet.  And update the
3028 			 * aggregating txdesc's chimney sending buffer size
3029 			 * accordingly.
3030 			 *
3031 			 * XXX
3032 			 * Zero-out the padding, as required by the RNDIS spec.
3033 			 */
3034 			olen = pkt->rm_len;
3035 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3036 			agg_txd->chim_size += pkt->rm_len - olen;
3037 
3038 			/* Link this txdesc to the parent. */
3039 			hn_txdesc_agg(agg_txd, txd);
3040 
3041 			chim = (uint8_t *)pkt + pkt->rm_len;
3042 			/* Save the current packet for later fixup. */
3043 			txr->hn_agg_prevpkt = chim;
3044 
3045 			txr->hn_agg_pktleft--;
3046 			txr->hn_agg_szleft -= pktsize;
3047 			if (txr->hn_agg_szleft <=
3048 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3049 				/*
3050 				 * Probably can't aggregate more packets,
3051 				 * flush this aggregating txdesc proactively.
3052 				 */
3053 				txr->hn_agg_pktleft = 0;
3054 			}
3055 			/* Done! */
3056 			return (chim);
3057 		}
3058 		hn_flush_txagg(ifp, txr);
3059 	}
3060 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3061 
3062 	txr->hn_tx_chimney_tried++;
3063 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3064 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3065 		return (NULL);
3066 	txr->hn_tx_chimney++;
3067 
3068 	chim = txr->hn_sc->hn_chim +
3069 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3070 
3071 	if (txr->hn_agg_pktmax > 1 &&
3072 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3073 		txr->hn_agg_txd = txd;
3074 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3075 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3076 		txr->hn_agg_prevpkt = chim;
3077 	}
3078 	return (chim);
3079 }
3080 
3081 /*
3082  * NOTE:
3083  * If this function fails, then both txd and m_head0 will be freed.
3084  */
3085 static int
3086 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3087     struct mbuf **m_head0)
3088 {
3089 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3090 	int error, nsegs, i;
3091 	struct mbuf *m_head = *m_head0;
3092 	struct rndis_packet_msg *pkt;
3093 	uint32_t *pi_data;
3094 	void *chim = NULL;
3095 	int pkt_hlen, pkt_size;
3096 
3097 	pkt = txd->rndis_pkt;
3098 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3099 	if (pkt_size < txr->hn_chim_size) {
3100 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3101 		if (chim != NULL)
3102 			pkt = chim;
3103 	} else {
3104 		if (txr->hn_agg_txd != NULL)
3105 			hn_flush_txagg(ifp, txr);
3106 	}
3107 
3108 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3109 	pkt->rm_len = m_head->m_pkthdr.len;
3110 	pkt->rm_dataoffset = 0;
3111 	pkt->rm_datalen = m_head->m_pkthdr.len;
3112 	pkt->rm_oobdataoffset = 0;
3113 	pkt->rm_oobdatalen = 0;
3114 	pkt->rm_oobdataelements = 0;
3115 	pkt->rm_pktinfooffset = sizeof(*pkt);
3116 	pkt->rm_pktinfolen = 0;
3117 	pkt->rm_vchandle = 0;
3118 	pkt->rm_reserved = 0;
3119 
3120 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3121 		/*
3122 		 * Set the hash value for this packet.
3123 		 */
3124 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3125 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3126 
3127 		if (M_HASHTYPE_ISHASH(m_head))
3128 			/*
3129 			 * The flowid field contains the hash value host
3130 			 * set in the rx queue if it is a ip forwarding pkt.
3131 			 * Set the same hash value so host can send on the
3132 			 * cpu it was received.
3133 			 */
3134 			*pi_data = m_head->m_pkthdr.flowid;
3135 		else
3136 			/*
3137 			 * Otherwise just put the tx queue index.
3138 			 */
3139 			*pi_data = txr->hn_tx_idx;
3140 	}
3141 
3142 	if (m_head->m_flags & M_VLANTAG) {
3143 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3144 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3145 		*pi_data = NDIS_VLAN_INFO_MAKE(
3146 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3147 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3148 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3149 	}
3150 
3151 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3152 #if defined(INET6) || defined(INET)
3153 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3154 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3155 #ifdef INET
3156 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3157 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3158 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3159 			    m_head->m_pkthdr.tso_segsz);
3160 		}
3161 #endif
3162 #if defined(INET6) && defined(INET)
3163 		else
3164 #endif
3165 #ifdef INET6
3166 		{
3167 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3168 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3169 			    m_head->m_pkthdr.tso_segsz);
3170 		}
3171 #endif
3172 #endif	/* INET6 || INET */
3173 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3174 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3175 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3176 		if (m_head->m_pkthdr.csum_flags &
3177 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3178 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3179 		} else {
3180 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3181 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3182 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3183 		}
3184 
3185 		if (m_head->m_pkthdr.csum_flags &
3186 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3187 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3188 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3189 		} else if (m_head->m_pkthdr.csum_flags &
3190 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3191 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3192 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3193 		}
3194 	}
3195 
3196 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3197 	/* Fixup RNDIS packet message total length */
3198 	pkt->rm_len += pkt_hlen;
3199 	/* Convert RNDIS packet message offsets */
3200 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3201 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3202 
3203 	/*
3204 	 * Fast path: Chimney sending.
3205 	 */
3206 	if (chim != NULL) {
3207 		struct hn_txdesc *tgt_txd = txd;
3208 
3209 		if (txr->hn_agg_txd != NULL) {
3210 			tgt_txd = txr->hn_agg_txd;
3211 #ifdef INVARIANTS
3212 			*m_head0 = NULL;
3213 #endif
3214 		}
3215 
3216 		KASSERT(pkt == chim,
3217 		    ("RNDIS pkt not in chimney sending buffer"));
3218 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3219 		    ("chimney sending buffer is not used"));
3220 		tgt_txd->chim_size += pkt->rm_len;
3221 
3222 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3223 		    ((uint8_t *)chim) + pkt_hlen);
3224 
3225 		txr->hn_gpa_cnt = 0;
3226 		txr->hn_sendpkt = hn_txpkt_chim;
3227 		goto done;
3228 	}
3229 
3230 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3231 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3232 	    ("chimney buffer is used"));
3233 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3234 
3235 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3236 	if (__predict_false(error)) {
3237 		int freed __diagused;
3238 
3239 		/*
3240 		 * This mbuf is not linked w/ the txd yet, so free it now.
3241 		 */
3242 		m_freem(m_head);
3243 		*m_head0 = NULL;
3244 
3245 		freed = hn_txdesc_put(txr, txd);
3246 		KASSERT(freed != 0,
3247 		    ("fail to free txd upon txdma error"));
3248 
3249 		txr->hn_txdma_failed++;
3250 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3251 		return error;
3252 	}
3253 	*m_head0 = m_head;
3254 
3255 	/* +1 RNDIS packet message */
3256 	txr->hn_gpa_cnt = nsegs + 1;
3257 
3258 	/* send packet with page buffer */
3259 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3260 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3261 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3262 
3263 	/*
3264 	 * Fill the page buffers with mbuf info after the page
3265 	 * buffer for RNDIS packet message.
3266 	 */
3267 	for (i = 0; i < nsegs; ++i) {
3268 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3269 
3270 		gpa->gpa_page = atop(segs[i].ds_addr);
3271 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3272 		gpa->gpa_len = segs[i].ds_len;
3273 	}
3274 
3275 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3276 	txd->chim_size = 0;
3277 	txr->hn_sendpkt = hn_txpkt_sglist;
3278 done:
3279 	txd->m = m_head;
3280 
3281 	/* Set the completion routine */
3282 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3283 
3284 	/* Update temporary stats for later use. */
3285 	txr->hn_stat_pkts++;
3286 	txr->hn_stat_size += m_head->m_pkthdr.len;
3287 	if (m_head->m_flags & M_MCAST)
3288 		txr->hn_stat_mcasts++;
3289 
3290 	return 0;
3291 }
3292 
3293 /*
3294  * NOTE:
3295  * If this function fails, then txd will be freed, but the mbuf
3296  * associated w/ the txd will _not_ be freed.
3297  */
3298 static int
3299 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3300 {
3301 	int error, send_failed = 0, has_bpf;
3302 
3303 again:
3304 	has_bpf = bpf_peers_present(if_getbpf(ifp));
3305 	if (has_bpf) {
3306 		/*
3307 		 * Make sure that this txd and any aggregated txds are not
3308 		 * freed before ETHER_BPF_MTAP.
3309 		 */
3310 		hn_txdesc_hold(txd);
3311 	}
3312 	error = txr->hn_sendpkt(txr, txd);
3313 	if (!error) {
3314 		if (has_bpf) {
3315 			const struct hn_txdesc *tmp_txd;
3316 
3317 			ETHER_BPF_MTAP(ifp, txd->m);
3318 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3319 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3320 		}
3321 
3322 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3323 #ifdef HN_IFSTART_SUPPORT
3324 		if (!hn_use_if_start)
3325 #endif
3326 		{
3327 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3328 			    txr->hn_stat_size);
3329 			if (txr->hn_stat_mcasts != 0) {
3330 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3331 				    txr->hn_stat_mcasts);
3332 			}
3333 		}
3334 		txr->hn_pkts += txr->hn_stat_pkts;
3335 		txr->hn_sends++;
3336 	}
3337 	if (has_bpf)
3338 		hn_txdesc_put(txr, txd);
3339 
3340 	if (__predict_false(error)) {
3341 		int freed __diagused;
3342 
3343 		/*
3344 		 * This should "really rarely" happen.
3345 		 *
3346 		 * XXX Too many RX to be acked or too many sideband
3347 		 * commands to run?  Ask netvsc_channel_rollup()
3348 		 * to kick start later.
3349 		 */
3350 		txr->hn_has_txeof = 1;
3351 		if (!send_failed) {
3352 			txr->hn_send_failed++;
3353 			send_failed = 1;
3354 			/*
3355 			 * Try sending again after set hn_has_txeof;
3356 			 * in case that we missed the last
3357 			 * netvsc_channel_rollup().
3358 			 */
3359 			goto again;
3360 		}
3361 		if_printf(ifp, "send failed\n");
3362 
3363 		/*
3364 		 * Caller will perform further processing on the
3365 		 * associated mbuf, so don't free it in hn_txdesc_put();
3366 		 * only unload it from the DMA map in hn_txdesc_put(),
3367 		 * if it was loaded.
3368 		 */
3369 		txd->m = NULL;
3370 		freed = hn_txdesc_put(txr, txd);
3371 		KASSERT(freed != 0,
3372 		    ("fail to free txd upon send error"));
3373 
3374 		txr->hn_send_failed++;
3375 	}
3376 
3377 	/* Reset temporary stats, after this sending is done. */
3378 	txr->hn_stat_size = 0;
3379 	txr->hn_stat_pkts = 0;
3380 	txr->hn_stat_mcasts = 0;
3381 
3382 	return (error);
3383 }
3384 
3385 /*
3386  * Append the specified data to the indicated mbuf chain,
3387  * Extend the mbuf chain if the new data does not fit in
3388  * existing space.
3389  *
3390  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3391  * There should be an equivalent in the kernel mbuf code,
3392  * but there does not appear to be one yet.
3393  *
3394  * Differs from m_append() in that additional mbufs are
3395  * allocated with cluster size MJUMPAGESIZE, and filled
3396  * accordingly.
3397  *
3398  * Return the last mbuf in the chain or NULL if failed to
3399  * allocate new mbuf.
3400  */
3401 static struct mbuf *
3402 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3403 {
3404 	struct mbuf *m, *n;
3405 	int remainder, space;
3406 
3407 	for (m = m0; m->m_next != NULL; m = m->m_next)
3408 		;
3409 	remainder = len;
3410 	space = M_TRAILINGSPACE(m);
3411 	if (space > 0) {
3412 		/*
3413 		 * Copy into available space.
3414 		 */
3415 		if (space > remainder)
3416 			space = remainder;
3417 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3418 		m->m_len += space;
3419 		cp += space;
3420 		remainder -= space;
3421 	}
3422 	while (remainder > 0) {
3423 		/*
3424 		 * Allocate a new mbuf; could check space
3425 		 * and allocate a cluster instead.
3426 		 */
3427 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3428 		if (n == NULL)
3429 			return NULL;
3430 		n->m_len = min(MJUMPAGESIZE, remainder);
3431 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3432 		cp += n->m_len;
3433 		remainder -= n->m_len;
3434 		m->m_next = n;
3435 		m = n;
3436 	}
3437 
3438 	return m;
3439 }
3440 
3441 #if defined(INET) || defined(INET6)
3442 static __inline int
3443 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3444 {
3445 	if (hn_lro_mbufq_depth) {
3446 		tcp_lro_queue_mbuf(lc, m);
3447 		return 0;
3448 	}
3449 	return tcp_lro_rx(lc, m, 0);
3450 }
3451 #endif
3452 
3453 static int
3454 hn_rxpkt(struct hn_rx_ring *rxr)
3455 {
3456 	if_t ifp, hn_ifp = rxr->hn_ifp;
3457 	struct mbuf *m_new, *n;
3458 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3459 	int hash_type = M_HASHTYPE_NONE;
3460 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3461 	int i;
3462 
3463 	ifp = hn_ifp;
3464 	if (rxr->hn_rxvf_ifp != NULL) {
3465 		/*
3466 		 * Non-transparent mode VF; pretend this packet is from
3467 		 * the VF.
3468 		 */
3469 		ifp = rxr->hn_rxvf_ifp;
3470 		is_vf = 1;
3471 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3472 		/* Transparent mode VF. */
3473 		is_vf = 1;
3474 	}
3475 
3476 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3477 		/*
3478 		 * NOTE:
3479 		 * See the NOTE of hn_rndis_init_fixat().  This
3480 		 * function can be reached, immediately after the
3481 		 * RNDIS is initialized but before the ifnet is
3482 		 * setup on the hn_attach() path; drop the unexpected
3483 		 * packets.
3484 		 */
3485 		return (0);
3486 	}
3487 
3488 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3489 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3490 		return (0);
3491 	}
3492 
3493 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3494 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3495 		if (m_new == NULL) {
3496 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3497 			return (0);
3498 		}
3499 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3500 		    rxr->rsc.frag_len[0]);
3501 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3502 	} else {
3503 		/*
3504 		 * Get an mbuf with a cluster.  For packets 2K or less,
3505 		 * get a standard 2K cluster.  For anything larger, get a
3506 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3507 		 * if looped around to the Hyper-V TX channel, so avoid them.
3508 		 */
3509 		size = MCLBYTES;
3510 		if (rxr->rsc.pktlen > MCLBYTES) {
3511 			/* 4096 */
3512 			size = MJUMPAGESIZE;
3513 		}
3514 
3515 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3516 		if (m_new == NULL) {
3517 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3518 			return (0);
3519 		}
3520 
3521 		n = m_new;
3522 		for (i = 0; i < rxr->rsc.cnt; i++) {
3523 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3524 			    rxr->rsc.frag_data[i]);
3525 			if (n == NULL) {
3526 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3527 				return (0);
3528 			} else {
3529 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3530 			}
3531 		}
3532 	}
3533 	if (rxr->rsc.pktlen <= MHLEN)
3534 		rxr->hn_small_pkts++;
3535 
3536 	m_new->m_pkthdr.rcvif = ifp;
3537 
3538 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3539 		do_csum = 0;
3540 
3541 	/* receive side checksum offload */
3542 	if (rxr->rsc.csum_info != NULL) {
3543 		/* IP csum offload */
3544 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3545 			m_new->m_pkthdr.csum_flags |=
3546 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3547 			rxr->hn_csum_ip++;
3548 		}
3549 
3550 		/* TCP/UDP csum offload */
3551 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3552 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3553 			m_new->m_pkthdr.csum_flags |=
3554 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3555 			m_new->m_pkthdr.csum_data = 0xffff;
3556 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3557 				rxr->hn_csum_tcp++;
3558 			else
3559 				rxr->hn_csum_udp++;
3560 		}
3561 
3562 		/*
3563 		 * XXX
3564 		 * As of this write (Oct 28th, 2016), host side will turn
3565 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3566 		 * the do_lro setting here is actually _not_ accurate.  We
3567 		 * depend on the RSS hash type check to reset do_lro.
3568 		 */
3569 		if ((*(rxr->rsc.csum_info) &
3570 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3571 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3572 			do_lro = 1;
3573 	} else {
3574 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3575 		if (l3proto == ETHERTYPE_IP) {
3576 			if (l4proto == IPPROTO_TCP) {
3577 				if (do_csum &&
3578 				    (rxr->hn_trust_hcsum &
3579 				     HN_TRUST_HCSUM_TCP)) {
3580 					rxr->hn_csum_trusted++;
3581 					m_new->m_pkthdr.csum_flags |=
3582 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3583 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3584 					m_new->m_pkthdr.csum_data = 0xffff;
3585 				}
3586 				do_lro = 1;
3587 			} else if (l4proto == IPPROTO_UDP) {
3588 				if (do_csum &&
3589 				    (rxr->hn_trust_hcsum &
3590 				     HN_TRUST_HCSUM_UDP)) {
3591 					rxr->hn_csum_trusted++;
3592 					m_new->m_pkthdr.csum_flags |=
3593 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3594 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3595 					m_new->m_pkthdr.csum_data = 0xffff;
3596 				}
3597 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3598 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3599 				rxr->hn_csum_trusted++;
3600 				m_new->m_pkthdr.csum_flags |=
3601 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3602 			}
3603 		}
3604 	}
3605 
3606 	if (rxr->rsc.vlan_info != NULL) {
3607 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3608 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3609 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3610 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3611 		m_new->m_flags |= M_VLANTAG;
3612 	}
3613 
3614 	/*
3615 	 * If VF is activated (tranparent/non-transparent mode does not
3616 	 * matter here).
3617 	 *
3618 	 * - Disable LRO
3619 	 *
3620 	 *   hn(4) will only receive broadcast packets, multicast packets,
3621 	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3622 	 *   packet types.
3623 	 *
3624 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3625 	 *   all, since the LRO flush will use hn(4) as the receiving
3626 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3627 	 */
3628 	if (is_vf)
3629 		do_lro = 0;
3630 
3631 	/*
3632 	 * If VF is activated (tranparent/non-transparent mode does not
3633 	 * matter here), do _not_ mess with unsupported hash types or
3634 	 * functions.
3635 	 */
3636 	if (rxr->rsc.hash_info != NULL) {
3637 		rxr->hn_rss_pkts++;
3638 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3639 		if (!is_vf)
3640 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3641 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3642 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3643 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3644 			    rxr->hn_mbuf_hash);
3645 
3646 			/*
3647 			 * NOTE:
3648 			 * do_lro is resetted, if the hash types are not TCP
3649 			 * related.  See the comment in the above csum_flags
3650 			 * setup section.
3651 			 */
3652 			switch (type) {
3653 			case NDIS_HASH_IPV4:
3654 				hash_type = M_HASHTYPE_RSS_IPV4;
3655 				do_lro = 0;
3656 				break;
3657 
3658 			case NDIS_HASH_TCP_IPV4:
3659 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3660 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3661 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3662 
3663 					if (is_vf)
3664 						def_htype = M_HASHTYPE_NONE;
3665 
3666 					/*
3667 					 * UDP 4-tuple hash is delivered as
3668 					 * TCP 4-tuple hash.
3669 					 */
3670 					if (l3proto == ETHERTYPE_MAX) {
3671 						hn_rxpkt_proto(m_new,
3672 						    &l3proto, &l4proto);
3673 					}
3674 					if (l3proto == ETHERTYPE_IP) {
3675 						if (l4proto == IPPROTO_UDP &&
3676 						    (rxr->hn_mbuf_hash &
3677 						     NDIS_HASH_UDP_IPV4_X)) {
3678 							hash_type =
3679 							M_HASHTYPE_RSS_UDP_IPV4;
3680 							do_lro = 0;
3681 						} else if (l4proto !=
3682 						    IPPROTO_TCP) {
3683 							hash_type = def_htype;
3684 							do_lro = 0;
3685 						}
3686 					} else {
3687 						hash_type = def_htype;
3688 						do_lro = 0;
3689 					}
3690 				}
3691 				break;
3692 
3693 			case NDIS_HASH_IPV6:
3694 				hash_type = M_HASHTYPE_RSS_IPV6;
3695 				do_lro = 0;
3696 				break;
3697 
3698 			case NDIS_HASH_IPV6_EX:
3699 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3700 				do_lro = 0;
3701 				break;
3702 
3703 			case NDIS_HASH_TCP_IPV6:
3704 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3705 				break;
3706 
3707 			case NDIS_HASH_TCP_IPV6_EX:
3708 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3709 				break;
3710 			}
3711 		}
3712 	} else if (!is_vf) {
3713 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3714 		hash_type = M_HASHTYPE_OPAQUE;
3715 	}
3716 	M_HASHTYPE_SET(m_new, hash_type);
3717 
3718 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3719 	if (hn_ifp != ifp) {
3720 		const struct ether_header *eh;
3721 
3722 		/*
3723 		 * Non-transparent mode VF is activated.
3724 		 */
3725 
3726 		/*
3727 		 * Allow tapping on hn(4).
3728 		 */
3729 		ETHER_BPF_MTAP(hn_ifp, m_new);
3730 
3731 		/*
3732 		 * Update hn(4)'s stats.
3733 		 */
3734 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3735 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3736 		/* Checked at the beginning of this function. */
3737 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3738 		eh = mtod(m_new, struct ether_header *);
3739 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3740 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3741 	}
3742 	rxr->hn_pkts++;
3743 
3744 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3745 #if defined(INET) || defined(INET6)
3746 		struct lro_ctrl *lro = &rxr->hn_lro;
3747 
3748 		if (lro->lro_cnt) {
3749 			rxr->hn_lro_tried++;
3750 			if (hn_lro_rx(lro, m_new) == 0) {
3751 				/* DONE! */
3752 				return 0;
3753 			}
3754 		}
3755 #endif
3756 	}
3757 	if_input(ifp, m_new);
3758 
3759 	return (0);
3760 }
3761 
3762 static int
3763 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3764 {
3765 	struct hn_softc *sc = if_getsoftc(ifp);
3766 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3767 	if_t vf_ifp;
3768 	int mask, error = 0;
3769 	struct ifrsskey *ifrk;
3770 	struct ifrsshash *ifrh;
3771 	uint32_t mtu;
3772 
3773 	switch (cmd) {
3774 	case SIOCSIFMTU:
3775 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3776 			error = EINVAL;
3777 			break;
3778 		}
3779 
3780 		HN_LOCK(sc);
3781 
3782 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3783 			HN_UNLOCK(sc);
3784 			break;
3785 		}
3786 
3787 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3788 			/* Can't change MTU */
3789 			HN_UNLOCK(sc);
3790 			error = EOPNOTSUPP;
3791 			break;
3792 		}
3793 
3794 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3795 			HN_UNLOCK(sc);
3796 			break;
3797 		}
3798 
3799 		if (hn_xpnt_vf_isready(sc)) {
3800 			vf_ifp = sc->hn_vf_ifp;
3801 			ifr_vf = *ifr;
3802 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3803 			    sizeof(ifr_vf.ifr_name));
3804 			error = ifhwioctl(SIOCSIFMTU,vf_ifp,
3805 			    (caddr_t)&ifr_vf, curthread);
3806 			if (error) {
3807 				HN_UNLOCK(sc);
3808 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3809 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3810 				break;
3811 			}
3812 		}
3813 
3814 		/*
3815 		 * Suspend this interface before the synthetic parts
3816 		 * are ripped.
3817 		 */
3818 		hn_suspend(sc);
3819 
3820 		/*
3821 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3822 		 */
3823 		hn_synth_detach(sc);
3824 
3825 		/*
3826 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3827 		 * with the new MTU setting.
3828 		 */
3829 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3830 		if (error) {
3831 			HN_UNLOCK(sc);
3832 			break;
3833 		}
3834 
3835 		error = hn_rndis_get_mtu(sc, &mtu);
3836 		if (error)
3837 			mtu = ifr->ifr_mtu;
3838 		else if (bootverbose)
3839 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3840 
3841 		/*
3842 		 * Commit the requested MTU, after the synthetic parts
3843 		 * have been successfully attached.
3844 		 */
3845 		if (mtu >= ifr->ifr_mtu) {
3846 			mtu = ifr->ifr_mtu;
3847 		} else {
3848 			if_printf(ifp, "fixup mtu %d -> %u\n",
3849 			    ifr->ifr_mtu, mtu);
3850 		}
3851 		if_setmtu(ifp, mtu);
3852 
3853 		/*
3854 		 * Synthetic parts' reattach may change the chimney
3855 		 * sending size; update it.
3856 		 */
3857 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3858 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3859 
3860 		/*
3861 		 * Make sure that various parameters based on MTU are
3862 		 * still valid, after the MTU change.
3863 		 */
3864 		hn_mtu_change_fixup(sc);
3865 
3866 		/*
3867 		 * All done!  Resume the interface now.
3868 		 */
3869 		hn_resume(sc);
3870 
3871 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3872 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3873 			/*
3874 			 * Since we have reattached the NVS part,
3875 			 * change the datapath to VF again; in case
3876 			 * that it is lost, after the NVS was detached.
3877 			 */
3878 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3879 		}
3880 
3881 		HN_UNLOCK(sc);
3882 		break;
3883 
3884 	case SIOCSIFFLAGS:
3885 		HN_LOCK(sc);
3886 
3887 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3888 			HN_UNLOCK(sc);
3889 			break;
3890 		}
3891 
3892 		if (hn_xpnt_vf_isready(sc))
3893 			hn_xpnt_vf_saveifflags(sc);
3894 
3895 		if (if_getflags(ifp) & IFF_UP) {
3896 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3897 				/*
3898 				 * Caller meight hold mutex, e.g.
3899 				 * bpf; use busy-wait for the RNDIS
3900 				 * reply.
3901 				 */
3902 				HN_NO_SLEEPING(sc);
3903 				hn_rxfilter_config(sc);
3904 				HN_SLEEPING_OK(sc);
3905 
3906 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3907 					error = hn_xpnt_vf_iocsetflags(sc);
3908 			} else {
3909 				hn_init_locked(sc);
3910 			}
3911 		} else {
3912 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3913 				hn_stop(sc, false);
3914 		}
3915 		sc->hn_if_flags = if_getflags(ifp);
3916 
3917 		HN_UNLOCK(sc);
3918 		break;
3919 
3920 	case SIOCSIFCAP:
3921 		HN_LOCK(sc);
3922 
3923 		if (hn_xpnt_vf_isready(sc)) {
3924 			ifr_vf = *ifr;
3925 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3926 			    sizeof(ifr_vf.ifr_name));
3927 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3928 			HN_UNLOCK(sc);
3929 			break;
3930 		}
3931 
3932 		/*
3933 		 * Fix up requested capabilities w/ supported capabilities,
3934 		 * since the supported capabilities could have been changed.
3935 		 */
3936 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3937 		    if_getcapenable(ifp);
3938 
3939 		if (mask & IFCAP_TXCSUM) {
3940 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3941 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3942 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3943 			else
3944 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3945 		}
3946 		if (mask & IFCAP_TXCSUM_IPV6) {
3947 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3948 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3949 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3950 			else
3951 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3952 		}
3953 
3954 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3955 		if (mask & IFCAP_RXCSUM)
3956 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3957 #ifdef foo
3958 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3959 		if (mask & IFCAP_RXCSUM_IPV6)
3960 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3961 #endif
3962 
3963 		if (mask & IFCAP_LRO)
3964 			if_togglecapenable(ifp, IFCAP_LRO);
3965 
3966 		if (mask & IFCAP_TSO4) {
3967 			if_togglecapenable(ifp, IFCAP_TSO4);
3968 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3969 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3970 			else
3971 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3972 		}
3973 		if (mask & IFCAP_TSO6) {
3974 			if_togglecapenable(ifp, IFCAP_TSO6);
3975 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3976 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3977 			else
3978 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3979 		}
3980 
3981 		HN_UNLOCK(sc);
3982 		break;
3983 
3984 	case SIOCADDMULTI:
3985 	case SIOCDELMULTI:
3986 		HN_LOCK(sc);
3987 
3988 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3989 			HN_UNLOCK(sc);
3990 			break;
3991 		}
3992 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3993 			/*
3994 			 * Multicast uses mutex; use busy-wait for
3995 			 * the RNDIS reply.
3996 			 */
3997 			HN_NO_SLEEPING(sc);
3998 			hn_rxfilter_config(sc);
3999 			HN_SLEEPING_OK(sc);
4000 		}
4001 
4002 		/* XXX vlan(4) style mcast addr maintenance */
4003 		if (hn_xpnt_vf_isready(sc)) {
4004 			int old_if_flags;
4005 
4006 			old_if_flags = if_getflags(sc->hn_vf_ifp);
4007 			hn_xpnt_vf_saveifflags(sc);
4008 
4009 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
4010 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
4011 			     IFF_ALLMULTI))
4012 				error = hn_xpnt_vf_iocsetflags(sc);
4013 		}
4014 
4015 		HN_UNLOCK(sc);
4016 		break;
4017 
4018 	case SIOCSIFMEDIA:
4019 	case SIOCGIFMEDIA:
4020 		HN_LOCK(sc);
4021 		if (hn_xpnt_vf_isready(sc)) {
4022 			/*
4023 			 * SIOCGIFMEDIA expects ifmediareq, so don't
4024 			 * create and pass ifr_vf to the VF here; just
4025 			 * replace the ifr_name.
4026 			 */
4027 			vf_ifp = sc->hn_vf_ifp;
4028 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
4029 			    sizeof(ifr->ifr_name));
4030 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
4031 			/* Restore the ifr_name. */
4032 			strlcpy(ifr->ifr_name, if_name(ifp),
4033 			    sizeof(ifr->ifr_name));
4034 			HN_UNLOCK(sc);
4035 			break;
4036 		}
4037 		HN_UNLOCK(sc);
4038 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4039 		break;
4040 
4041 	case SIOCGIFRSSHASH:
4042 		ifrh = (struct ifrsshash *)data;
4043 		HN_LOCK(sc);
4044 		if (sc->hn_rx_ring_inuse == 1) {
4045 			HN_UNLOCK(sc);
4046 			ifrh->ifrh_func = RSS_FUNC_NONE;
4047 			ifrh->ifrh_types = 0;
4048 			break;
4049 		}
4050 
4051 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4052 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4053 		else
4054 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4055 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4056 		HN_UNLOCK(sc);
4057 		break;
4058 
4059 	case SIOCGIFRSSKEY:
4060 		ifrk = (struct ifrsskey *)data;
4061 		HN_LOCK(sc);
4062 		if (sc->hn_rx_ring_inuse == 1) {
4063 			HN_UNLOCK(sc);
4064 			ifrk->ifrk_func = RSS_FUNC_NONE;
4065 			ifrk->ifrk_keylen = 0;
4066 			break;
4067 		}
4068 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4069 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4070 		else
4071 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4072 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4073 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4074 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4075 		HN_UNLOCK(sc);
4076 		break;
4077 
4078 	default:
4079 		error = ether_ioctl(ifp, cmd, data);
4080 		break;
4081 	}
4082 	return (error);
4083 }
4084 
4085 static void
4086 hn_stop(struct hn_softc *sc, bool detaching)
4087 {
4088 	if_t ifp = sc->hn_ifp;
4089 	int i;
4090 
4091 	HN_LOCK_ASSERT(sc);
4092 
4093 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4094 	    ("synthetic parts were not attached"));
4095 
4096 	/* Clear RUNNING bit ASAP. */
4097 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4098 
4099 	/* Disable polling. */
4100 	hn_polling(sc, 0);
4101 
4102 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4103 		KASSERT(sc->hn_vf_ifp != NULL,
4104 		    ("%s: VF is not attached", if_name(ifp)));
4105 
4106 		/* Mark transparent mode VF as disabled. */
4107 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4108 
4109 		/*
4110 		 * NOTE:
4111 		 * Datapath setting must happen _before_ bringing
4112 		 * the VF down.
4113 		 */
4114 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4115 
4116 		/*
4117 		 * Bring the VF down.
4118 		 */
4119 		hn_xpnt_vf_saveifflags(sc);
4120 		if_setflagbits(ifp, 0, IFF_UP);
4121 		hn_xpnt_vf_iocsetflags(sc);
4122 	}
4123 
4124 	/* Suspend data transfers. */
4125 	hn_suspend_data(sc);
4126 
4127 	/* Clear OACTIVE bit. */
4128 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4129 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4130 		sc->hn_tx_ring[i].hn_oactive = 0;
4131 
4132 	/*
4133 	 * If the non-transparent mode VF is active, make sure
4134 	 * that the RX filter still allows packet reception.
4135 	 */
4136 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4137 		hn_rxfilter_config(sc);
4138 }
4139 
4140 static void
4141 hn_init_locked(struct hn_softc *sc)
4142 {
4143 	if_t ifp = sc->hn_ifp;
4144 	int i;
4145 
4146 	HN_LOCK_ASSERT(sc);
4147 
4148 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4149 		return;
4150 
4151 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4152 		return;
4153 
4154 	/* Configure RX filter */
4155 	hn_rxfilter_config(sc);
4156 
4157 	/* Clear OACTIVE bit. */
4158 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4159 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4160 		sc->hn_tx_ring[i].hn_oactive = 0;
4161 
4162 	/* Clear TX 'suspended' bit. */
4163 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4164 
4165 	if (hn_xpnt_vf_isready(sc)) {
4166 		/* Initialize transparent VF. */
4167 		hn_xpnt_vf_init(sc);
4168 	}
4169 
4170 	/* Everything is ready; unleash! */
4171 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4172 
4173 	/* Re-enable polling if requested. */
4174 	if (sc->hn_pollhz > 0)
4175 		hn_polling(sc, sc->hn_pollhz);
4176 }
4177 
4178 static void
4179 hn_init(void *xsc)
4180 {
4181 	struct hn_softc *sc = xsc;
4182 
4183 	HN_LOCK(sc);
4184 	hn_init_locked(sc);
4185 	HN_UNLOCK(sc);
4186 }
4187 
4188 static int
4189 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4190 {
4191 	struct hn_softc *sc = arg1;
4192 	unsigned int lenlim;
4193 	int error;
4194 
4195 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4196 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4197 	if (error || req->newptr == NULL)
4198 		return error;
4199 
4200 	HN_LOCK(sc);
4201 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4202 	    lenlim > TCP_LRO_LENGTH_MAX) {
4203 		HN_UNLOCK(sc);
4204 		return EINVAL;
4205 	}
4206 	hn_set_lro_lenlim(sc, lenlim);
4207 	HN_UNLOCK(sc);
4208 
4209 	return 0;
4210 }
4211 
4212 static int
4213 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4214 {
4215 	struct hn_softc *sc = arg1;
4216 	int ackcnt, error, i;
4217 
4218 	/*
4219 	 * lro_ackcnt_lim is append count limit,
4220 	 * +1 to turn it into aggregation limit.
4221 	 */
4222 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4223 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4224 	if (error || req->newptr == NULL)
4225 		return error;
4226 
4227 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4228 		return EINVAL;
4229 
4230 	/*
4231 	 * Convert aggregation limit back to append
4232 	 * count limit.
4233 	 */
4234 	--ackcnt;
4235 	HN_LOCK(sc);
4236 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4237 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4238 	HN_UNLOCK(sc);
4239 	return 0;
4240 }
4241 
4242 static int
4243 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4244 {
4245 	struct hn_softc *sc = arg1;
4246 	int hcsum = arg2;
4247 	int on, error, i;
4248 
4249 	on = 0;
4250 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4251 		on = 1;
4252 
4253 	error = sysctl_handle_int(oidp, &on, 0, req);
4254 	if (error || req->newptr == NULL)
4255 		return error;
4256 
4257 	HN_LOCK(sc);
4258 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4259 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4260 
4261 		if (on)
4262 			rxr->hn_trust_hcsum |= hcsum;
4263 		else
4264 			rxr->hn_trust_hcsum &= ~hcsum;
4265 	}
4266 	HN_UNLOCK(sc);
4267 	return 0;
4268 }
4269 
4270 static int
4271 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4272 {
4273 	struct hn_softc *sc = arg1;
4274 	int chim_size, error;
4275 
4276 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4277 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4278 	if (error || req->newptr == NULL)
4279 		return error;
4280 
4281 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4282 		return EINVAL;
4283 
4284 	HN_LOCK(sc);
4285 	hn_set_chim_size(sc, chim_size);
4286 	HN_UNLOCK(sc);
4287 	return 0;
4288 }
4289 
4290 static int
4291 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4292 {
4293 	struct hn_softc *sc = arg1;
4294 	int ofs = arg2, i, error;
4295 	struct hn_rx_ring *rxr;
4296 	uint64_t stat;
4297 
4298 	stat = 0;
4299 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4300 		rxr = &sc->hn_rx_ring[i];
4301 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4302 	}
4303 
4304 	error = sysctl_handle_64(oidp, &stat, 0, req);
4305 	if (error || req->newptr == NULL)
4306 		return error;
4307 
4308 	/* Zero out this stat. */
4309 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4310 		rxr = &sc->hn_rx_ring[i];
4311 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4312 	}
4313 	return 0;
4314 }
4315 
4316 static int
4317 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4318 {
4319 	struct hn_softc *sc = arg1;
4320 	int ofs = arg2, i, error;
4321 	struct hn_rx_ring *rxr;
4322 	u_long stat;
4323 
4324 	stat = 0;
4325 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4326 		rxr = &sc->hn_rx_ring[i];
4327 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4328 	}
4329 
4330 	error = sysctl_handle_long(oidp, &stat, 0, req);
4331 	if (error || req->newptr == NULL)
4332 		return error;
4333 
4334 	/* Zero out this stat. */
4335 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4336 		rxr = &sc->hn_rx_ring[i];
4337 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4338 	}
4339 	return 0;
4340 }
4341 
4342 static int
4343 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4344 {
4345 	struct hn_softc *sc = arg1;
4346 	int ofs = arg2, i, error;
4347 	struct hn_tx_ring *txr;
4348 	u_long stat;
4349 
4350 	stat = 0;
4351 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4352 		txr = &sc->hn_tx_ring[i];
4353 		stat += *((u_long *)((uint8_t *)txr + ofs));
4354 	}
4355 
4356 	error = sysctl_handle_long(oidp, &stat, 0, req);
4357 	if (error || req->newptr == NULL)
4358 		return error;
4359 
4360 	/* Zero out this stat. */
4361 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4362 		txr = &sc->hn_tx_ring[i];
4363 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4364 	}
4365 	return 0;
4366 }
4367 
4368 static int
4369 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4370 {
4371 	struct hn_softc *sc = arg1;
4372 	int ofs = arg2, i, error, conf;
4373 	struct hn_tx_ring *txr;
4374 
4375 	txr = &sc->hn_tx_ring[0];
4376 	conf = *((int *)((uint8_t *)txr + ofs));
4377 
4378 	error = sysctl_handle_int(oidp, &conf, 0, req);
4379 	if (error || req->newptr == NULL)
4380 		return error;
4381 
4382 	HN_LOCK(sc);
4383 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4384 		txr = &sc->hn_tx_ring[i];
4385 		*((int *)((uint8_t *)txr + ofs)) = conf;
4386 	}
4387 	HN_UNLOCK(sc);
4388 
4389 	return 0;
4390 }
4391 
4392 static int
4393 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4394 {
4395 	struct hn_softc *sc = arg1;
4396 	int error, size;
4397 
4398 	size = sc->hn_agg_size;
4399 	error = sysctl_handle_int(oidp, &size, 0, req);
4400 	if (error || req->newptr == NULL)
4401 		return (error);
4402 
4403 	HN_LOCK(sc);
4404 	sc->hn_agg_size = size;
4405 	hn_set_txagg(sc);
4406 	HN_UNLOCK(sc);
4407 
4408 	return (0);
4409 }
4410 
4411 static int
4412 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4413 {
4414 	struct hn_softc *sc = arg1;
4415 	int error, pkts;
4416 
4417 	pkts = sc->hn_agg_pkts;
4418 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4419 	if (error || req->newptr == NULL)
4420 		return (error);
4421 
4422 	HN_LOCK(sc);
4423 	sc->hn_agg_pkts = pkts;
4424 	hn_set_txagg(sc);
4425 	HN_UNLOCK(sc);
4426 
4427 	return (0);
4428 }
4429 
4430 static int
4431 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4432 {
4433 	struct hn_softc *sc = arg1;
4434 	int pkts;
4435 
4436 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4437 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4438 }
4439 
4440 static int
4441 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 	struct hn_softc *sc = arg1;
4444 	int align;
4445 
4446 	align = sc->hn_tx_ring[0].hn_agg_align;
4447 	return (sysctl_handle_int(oidp, &align, 0, req));
4448 }
4449 
4450 static void
4451 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4452 {
4453 	if (pollhz == 0)
4454 		vmbus_chan_poll_disable(chan);
4455 	else
4456 		vmbus_chan_poll_enable(chan, pollhz);
4457 }
4458 
4459 static void
4460 hn_polling(struct hn_softc *sc, u_int pollhz)
4461 {
4462 	int nsubch = sc->hn_rx_ring_inuse - 1;
4463 
4464 	HN_LOCK_ASSERT(sc);
4465 
4466 	if (nsubch > 0) {
4467 		struct vmbus_channel **subch;
4468 		int i;
4469 
4470 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4471 		for (i = 0; i < nsubch; ++i)
4472 			hn_chan_polling(subch[i], pollhz);
4473 		vmbus_subchan_rel(subch, nsubch);
4474 	}
4475 	hn_chan_polling(sc->hn_prichan, pollhz);
4476 }
4477 
4478 static int
4479 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4480 {
4481 	struct hn_softc *sc = arg1;
4482 	int pollhz, error;
4483 
4484 	pollhz = sc->hn_pollhz;
4485 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4486 	if (error || req->newptr == NULL)
4487 		return (error);
4488 
4489 	if (pollhz != 0 &&
4490 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4491 		return (EINVAL);
4492 
4493 	HN_LOCK(sc);
4494 	if (sc->hn_pollhz != pollhz) {
4495 		sc->hn_pollhz = pollhz;
4496 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4497 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4498 			hn_polling(sc, sc->hn_pollhz);
4499 	}
4500 	HN_UNLOCK(sc);
4501 
4502 	return (0);
4503 }
4504 
4505 static int
4506 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4507 {
4508 	struct hn_softc *sc = arg1;
4509 	char verstr[16];
4510 
4511 	snprintf(verstr, sizeof(verstr), "%u.%u",
4512 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4513 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4514 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4515 }
4516 
4517 static int
4518 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4519 {
4520 	struct hn_softc *sc = arg1;
4521 	char caps_str[128];
4522 	uint32_t caps;
4523 
4524 	HN_LOCK(sc);
4525 	caps = sc->hn_caps;
4526 	HN_UNLOCK(sc);
4527 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4528 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4529 }
4530 
4531 static int
4532 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4533 {
4534 	struct hn_softc *sc = arg1;
4535 	char assist_str[128];
4536 	uint32_t hwassist;
4537 
4538 	HN_LOCK(sc);
4539 	hwassist = if_gethwassist(sc->hn_ifp);
4540 	HN_UNLOCK(sc);
4541 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4542 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4543 }
4544 
4545 static int
4546 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4547 {
4548 	struct hn_softc *sc = arg1;
4549 	char filter_str[128];
4550 	uint32_t filter;
4551 
4552 	HN_LOCK(sc);
4553 	filter = sc->hn_rx_filter;
4554 	HN_UNLOCK(sc);
4555 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4556 	    NDIS_PACKET_TYPES);
4557 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4558 }
4559 
4560 static int
4561 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4562 {
4563 	struct hn_softc *sc = arg1;
4564 	uint32_t mtu;
4565 	int error;
4566 	HN_LOCK(sc);
4567 	error = hn_rndis_get_mtu(sc, &mtu);
4568 	if (error) {
4569 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4570 		goto back;
4571 	}
4572 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4573 	if (error || req->newptr == NULL)
4574 		goto back;
4575 
4576 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4577 	if (error)
4578 		goto back;
4579 	error = hn_rndis_reconf_offload(sc, mtu);
4580 back:
4581 	HN_UNLOCK(sc);
4582 	return (error);
4583 }
4584 #ifndef RSS
4585 
4586 static int
4587 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4588 {
4589 	struct hn_softc *sc = arg1;
4590 	int error;
4591 
4592 	HN_LOCK(sc);
4593 
4594 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4595 	if (error || req->newptr == NULL)
4596 		goto back;
4597 
4598 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4599 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4600 		/*
4601 		 * RSS key is synchronized w/ VF's, don't allow users
4602 		 * to change it.
4603 		 */
4604 		error = EBUSY;
4605 		goto back;
4606 	}
4607 
4608 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4609 	if (error)
4610 		goto back;
4611 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4612 
4613 	if (sc->hn_rx_ring_inuse > 1) {
4614 		error = hn_rss_reconfig(sc);
4615 	} else {
4616 		/* Not RSS capable, at least for now; just save the RSS key. */
4617 		error = 0;
4618 	}
4619 back:
4620 	HN_UNLOCK(sc);
4621 	return (error);
4622 }
4623 
4624 static int
4625 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4626 {
4627 	struct hn_softc *sc = arg1;
4628 	int error;
4629 
4630 	HN_LOCK(sc);
4631 
4632 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4633 	if (error || req->newptr == NULL)
4634 		goto back;
4635 
4636 	/*
4637 	 * Don't allow RSS indirect table change, if this interface is not
4638 	 * RSS capable currently.
4639 	 */
4640 	if (sc->hn_rx_ring_inuse == 1) {
4641 		error = EOPNOTSUPP;
4642 		goto back;
4643 	}
4644 
4645 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4646 	if (error)
4647 		goto back;
4648 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4649 
4650 	hn_rss_ind_fixup(sc);
4651 	error = hn_rss_reconfig(sc);
4652 back:
4653 	HN_UNLOCK(sc);
4654 	return (error);
4655 }
4656 
4657 #endif	/* !RSS */
4658 
4659 static int
4660 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4661 {
4662 	struct hn_softc *sc = arg1;
4663 	char hash_str[128];
4664 	uint32_t hash;
4665 
4666 	HN_LOCK(sc);
4667 	hash = sc->hn_rss_hash;
4668 	HN_UNLOCK(sc);
4669 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4670 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4671 }
4672 
4673 static int
4674 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4675 {
4676 	struct hn_softc *sc = arg1;
4677 	char hash_str[128];
4678 	uint32_t hash;
4679 
4680 	HN_LOCK(sc);
4681 	hash = sc->hn_rss_hcap;
4682 	HN_UNLOCK(sc);
4683 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4684 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4685 }
4686 
4687 static int
4688 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4689 {
4690 	struct hn_softc *sc = arg1;
4691 	char hash_str[128];
4692 	uint32_t hash;
4693 
4694 	HN_LOCK(sc);
4695 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4696 	HN_UNLOCK(sc);
4697 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4698 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4699 }
4700 
4701 static int
4702 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4703 {
4704 	struct hn_softc *sc = arg1;
4705 	char vf_name[IFNAMSIZ + 1];
4706 	if_t vf_ifp;
4707 
4708 	HN_LOCK(sc);
4709 	vf_name[0] = '\0';
4710 	vf_ifp = sc->hn_vf_ifp;
4711 	if (vf_ifp != NULL)
4712 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4713 	HN_UNLOCK(sc);
4714 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4715 }
4716 
4717 static int
4718 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4719 {
4720 	struct hn_softc *sc = arg1;
4721 	char vf_name[IFNAMSIZ + 1];
4722 	if_t vf_ifp;
4723 
4724 	HN_LOCK(sc);
4725 	vf_name[0] = '\0';
4726 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4727 	if (vf_ifp != NULL)
4728 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4729 	HN_UNLOCK(sc);
4730 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4731 }
4732 
4733 static int
4734 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4735 {
4736 	struct rm_priotracker pt;
4737 	struct sbuf *sb;
4738 	int error, i;
4739 	bool first;
4740 
4741 	error = sysctl_wire_old_buffer(req, 0);
4742 	if (error != 0)
4743 		return (error);
4744 
4745 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4746 	if (sb == NULL)
4747 		return (ENOMEM);
4748 
4749 	rm_rlock(&hn_vfmap_lock, &pt);
4750 
4751 	first = true;
4752 	for (i = 0; i < hn_vfmap_size; ++i) {
4753 		struct epoch_tracker et;
4754 		if_t ifp;
4755 
4756 		if (hn_vfmap[i] == NULL)
4757 			continue;
4758 
4759 		NET_EPOCH_ENTER(et);
4760 		ifp = ifnet_byindex(i);
4761 		if (ifp != NULL) {
4762 			if (first)
4763 				sbuf_printf(sb, "%s", if_name(ifp));
4764 			else
4765 				sbuf_printf(sb, " %s", if_name(ifp));
4766 			first = false;
4767 		}
4768 		NET_EPOCH_EXIT(et);
4769 	}
4770 
4771 	rm_runlock(&hn_vfmap_lock, &pt);
4772 
4773 	error = sbuf_finish(sb);
4774 	sbuf_delete(sb);
4775 	return (error);
4776 }
4777 
4778 static int
4779 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4780 {
4781 	struct rm_priotracker pt;
4782 	struct sbuf *sb;
4783 	int error, i;
4784 	bool first;
4785 
4786 	error = sysctl_wire_old_buffer(req, 0);
4787 	if (error != 0)
4788 		return (error);
4789 
4790 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4791 	if (sb == NULL)
4792 		return (ENOMEM);
4793 
4794 	rm_rlock(&hn_vfmap_lock, &pt);
4795 
4796 	first = true;
4797 	for (i = 0; i < hn_vfmap_size; ++i) {
4798 		struct epoch_tracker et;
4799 		if_t ifp, hn_ifp;
4800 
4801 		hn_ifp = hn_vfmap[i];
4802 		if (hn_ifp == NULL)
4803 			continue;
4804 
4805 		NET_EPOCH_ENTER(et);
4806 		ifp = ifnet_byindex(i);
4807 		if (ifp != NULL) {
4808 			if (first) {
4809 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4810 				    if_name(hn_ifp));
4811 			} else {
4812 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4813 				    if_name(hn_ifp));
4814 			}
4815 			first = false;
4816 		}
4817 		NET_EPOCH_EXIT(et);
4818 	}
4819 
4820 	rm_runlock(&hn_vfmap_lock, &pt);
4821 
4822 	error = sbuf_finish(sb);
4823 	sbuf_delete(sb);
4824 	return (error);
4825 }
4826 
4827 static int
4828 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4829 {
4830 	struct hn_softc *sc = arg1;
4831 	int error, onoff = 0;
4832 
4833 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4834 		onoff = 1;
4835 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4836 	if (error || req->newptr == NULL)
4837 		return (error);
4838 
4839 	HN_LOCK(sc);
4840 	/* NOTE: hn_vf_lock for hn_transmit() */
4841 	rm_wlock(&sc->hn_vf_lock);
4842 	if (onoff)
4843 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4844 	else
4845 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4846 	rm_wunlock(&sc->hn_vf_lock);
4847 	HN_UNLOCK(sc);
4848 
4849 	return (0);
4850 }
4851 
4852 static int
4853 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4854 {
4855 	struct hn_softc *sc = arg1;
4856 	int enabled = 0;
4857 
4858 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4859 		enabled = 1;
4860 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4861 }
4862 
4863 static int
4864 hn_check_iplen(const struct mbuf *m, int hoff)
4865 {
4866 	const struct ip *ip;
4867 	int len, iphlen, iplen;
4868 	const struct tcphdr *th;
4869 	int thoff;				/* TCP data offset */
4870 
4871 	len = hoff + sizeof(struct ip);
4872 
4873 	/* The packet must be at least the size of an IP header. */
4874 	if (m->m_pkthdr.len < len)
4875 		return IPPROTO_DONE;
4876 
4877 	/* The fixed IP header must reside completely in the first mbuf. */
4878 	if (m->m_len < len)
4879 		return IPPROTO_DONE;
4880 
4881 	ip = mtodo(m, hoff);
4882 
4883 	/* Bound check the packet's stated IP header length. */
4884 	iphlen = ip->ip_hl << 2;
4885 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4886 		return IPPROTO_DONE;
4887 
4888 	/* The full IP header must reside completely in the one mbuf. */
4889 	if (m->m_len < hoff + iphlen)
4890 		return IPPROTO_DONE;
4891 
4892 	iplen = ntohs(ip->ip_len);
4893 
4894 	/*
4895 	 * Check that the amount of data in the buffers is as
4896 	 * at least much as the IP header would have us expect.
4897 	 */
4898 	if (m->m_pkthdr.len < hoff + iplen)
4899 		return IPPROTO_DONE;
4900 
4901 	/*
4902 	 * Ignore IP fragments.
4903 	 */
4904 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4905 		return IPPROTO_DONE;
4906 
4907 	/*
4908 	 * The TCP/IP or UDP/IP header must be entirely contained within
4909 	 * the first fragment of a packet.
4910 	 */
4911 	switch (ip->ip_p) {
4912 	case IPPROTO_TCP:
4913 		if (iplen < iphlen + sizeof(struct tcphdr))
4914 			return IPPROTO_DONE;
4915 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4916 			return IPPROTO_DONE;
4917 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4918 		thoff = th->th_off << 2;
4919 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4920 			return IPPROTO_DONE;
4921 		if (m->m_len < hoff + iphlen + thoff)
4922 			return IPPROTO_DONE;
4923 		break;
4924 	case IPPROTO_UDP:
4925 		if (iplen < iphlen + sizeof(struct udphdr))
4926 			return IPPROTO_DONE;
4927 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4928 			return IPPROTO_DONE;
4929 		break;
4930 	default:
4931 		if (iplen < iphlen)
4932 			return IPPROTO_DONE;
4933 		break;
4934 	}
4935 	return ip->ip_p;
4936 }
4937 
4938 static void
4939 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4940 {
4941 	const struct ether_header *eh;
4942 	uint16_t etype;
4943 	int hoff;
4944 
4945 	hoff = sizeof(*eh);
4946 	/* Checked at the beginning of this function. */
4947 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4948 
4949 	eh = mtod(m_new, const struct ether_header *);
4950 	etype = ntohs(eh->ether_type);
4951 	if (etype == ETHERTYPE_VLAN) {
4952 		const struct ether_vlan_header *evl;
4953 
4954 		hoff = sizeof(*evl);
4955 		if (m_new->m_len < hoff)
4956 			return;
4957 		evl = mtod(m_new, const struct ether_vlan_header *);
4958 		etype = ntohs(evl->evl_proto);
4959 	}
4960 	*l3proto = etype;
4961 
4962 	if (etype == ETHERTYPE_IP)
4963 		*l4proto = hn_check_iplen(m_new, hoff);
4964 	else
4965 		*l4proto = IPPROTO_DONE;
4966 }
4967 
4968 static int
4969 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4970 {
4971 	struct sysctl_oid_list *child;
4972 	struct sysctl_ctx_list *ctx;
4973 	device_t dev = sc->hn_dev;
4974 #if defined(INET) || defined(INET6)
4975 	int lroent_cnt;
4976 #endif
4977 	int i;
4978 
4979 	/*
4980 	 * Create RXBUF for reception.
4981 	 *
4982 	 * NOTE:
4983 	 * - It is shared by all channels.
4984 	 * - A large enough buffer is allocated, certain version of NVSes
4985 	 *   may further limit the usable space.
4986 	 */
4987 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4988 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4989 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4990 	if (sc->hn_rxbuf == NULL) {
4991 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4992 		return (ENOMEM);
4993 	}
4994 
4995 	sc->hn_rx_ring_cnt = ring_cnt;
4996 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4997 
4998 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4999 	    M_DEVBUF, M_WAITOK | M_ZERO);
5000 
5001 #if defined(INET) || defined(INET6)
5002 	lroent_cnt = hn_lro_entry_count;
5003 	if (lroent_cnt < TCP_LRO_ENTRIES)
5004 		lroent_cnt = TCP_LRO_ENTRIES;
5005 	if (bootverbose)
5006 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
5007 #endif	/* INET || INET6 */
5008 
5009 	ctx = device_get_sysctl_ctx(dev);
5010 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
5011 
5012 	/* Create dev.hn.UNIT.rx sysctl tree */
5013 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
5014 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5015 
5016 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5017 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5018 
5019 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
5020 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
5021 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
5022 		if (rxr->hn_br == NULL) {
5023 			device_printf(dev, "allocate bufring failed\n");
5024 			return (ENOMEM);
5025 		}
5026 
5027 		if (hn_trust_hosttcp)
5028 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5029 		if (hn_trust_hostudp)
5030 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5031 		if (hn_trust_hostip)
5032 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5033 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5034 		rxr->hn_ifp = sc->hn_ifp;
5035 		if (i < sc->hn_tx_ring_cnt)
5036 			rxr->hn_txr = &sc->hn_tx_ring[i];
5037 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5038 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5039 		rxr->hn_rx_idx = i;
5040 		rxr->hn_rxbuf = sc->hn_rxbuf;
5041 
5042 		/*
5043 		 * Initialize LRO.
5044 		 */
5045 #if defined(INET) || defined(INET6)
5046 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5047 		    hn_lro_mbufq_depth);
5048 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5049 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5050 #endif	/* INET || INET6 */
5051 
5052 		if (sc->hn_rx_sysctl_tree != NULL) {
5053 			char name[16];
5054 
5055 			/*
5056 			 * Create per RX ring sysctl tree:
5057 			 * dev.hn.UNIT.rx.RINGID
5058 			 */
5059 			snprintf(name, sizeof(name), "%d", i);
5060 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5061 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5062 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5063 
5064 			if (rxr->hn_rx_sysctl_tree != NULL) {
5065 				SYSCTL_ADD_ULONG(ctx,
5066 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5067 				    OID_AUTO, "packets",
5068 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5069 				    "# of packets received");
5070 				SYSCTL_ADD_ULONG(ctx,
5071 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5072 				    OID_AUTO, "rss_pkts",
5073 				    CTLFLAG_RW | CTLFLAG_STATS,
5074 				    &rxr->hn_rss_pkts,
5075 				    "# of packets w/ RSS info received");
5076 				SYSCTL_ADD_ULONG(ctx,
5077 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5078 				    OID_AUTO, "rsc_pkts",
5079 				    CTLFLAG_RW | CTLFLAG_STATS,
5080 				    &rxr->hn_rsc_pkts,
5081 				    "# of RSC packets received");
5082 				SYSCTL_ADD_ULONG(ctx,
5083 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5084 				    OID_AUTO, "rsc_drop",
5085 				    CTLFLAG_RW | CTLFLAG_STATS,
5086 				    &rxr->hn_rsc_drop,
5087 				    "# of RSC fragments dropped");
5088 				SYSCTL_ADD_INT(ctx,
5089 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5090 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5091 				    &rxr->hn_pktbuf_len, 0,
5092 				    "Temporary channel packet buffer length");
5093 			}
5094 		}
5095 	}
5096 
5097 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5098 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5099 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5100 	    hn_rx_stat_u64_sysctl,
5101 	    "LU", "LRO queued");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5103 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5104 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5105 	    hn_rx_stat_u64_sysctl,
5106 	    "LU", "LRO flushed");
5107 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5108 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5109 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5110 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5112 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5113 	    hn_lro_lenlim_sysctl, "IU",
5114 	    "Max # of data bytes to be aggregated by LRO");
5115 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5116 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5117 	    hn_lro_ackcnt_sysctl, "I",
5118 	    "Max # of ACKs to be aggregated by LRO");
5119 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5120 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5121 	    hn_trust_hcsum_sysctl, "I",
5122 	    "Trust tcp segment verification on host side, "
5123 	    "when csum info is missing");
5124 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5125 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5126 	    hn_trust_hcsum_sysctl, "I",
5127 	    "Trust udp datagram verification on host side, "
5128 	    "when csum info is missing");
5129 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5130 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5131 	    hn_trust_hcsum_sysctl, "I",
5132 	    "Trust ip packet verification on host side, "
5133 	    "when csum info is missing");
5134 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5135 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5136 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5137 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5138 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5139 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5140 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5141 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5142 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5143 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5144 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5145 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5146 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5147 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5148 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5149 	    hn_rx_stat_ulong_sysctl, "LU",
5150 	    "# of packets that we trust host's csum verification");
5151 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5152 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5153 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5154 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5155 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5156 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc,
5157 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5158 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5159 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5160 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5161 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5162 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5163 
5164 	return (0);
5165 }
5166 
5167 static void
5168 hn_destroy_rx_data(struct hn_softc *sc)
5169 {
5170 	int i;
5171 
5172 	if (sc->hn_rxbuf != NULL) {
5173 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5174 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5175 		else
5176 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5177 		sc->hn_rxbuf = NULL;
5178 	}
5179 
5180 	if (sc->hn_rx_ring_cnt == 0)
5181 		return;
5182 
5183 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5184 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5185 
5186 		if (rxr->hn_br == NULL)
5187 			continue;
5188 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5189 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5190 		} else {
5191 			device_printf(sc->hn_dev,
5192 			    "%dth channel bufring is referenced", i);
5193 		}
5194 		rxr->hn_br = NULL;
5195 
5196 #if defined(INET) || defined(INET6)
5197 		tcp_lro_free(&rxr->hn_lro);
5198 #endif
5199 		free(rxr->hn_pktbuf, M_DEVBUF);
5200 	}
5201 	free(sc->hn_rx_ring, M_DEVBUF);
5202 	sc->hn_rx_ring = NULL;
5203 
5204 	sc->hn_rx_ring_cnt = 0;
5205 	sc->hn_rx_ring_inuse = 0;
5206 }
5207 
5208 static int
5209 hn_tx_ring_create(struct hn_softc *sc, int id)
5210 {
5211 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5212 	device_t dev = sc->hn_dev;
5213 	bus_dma_tag_t parent_dtag;
5214 	int error, i;
5215 
5216 	txr->hn_sc = sc;
5217 	txr->hn_tx_idx = id;
5218 
5219 #ifndef HN_USE_TXDESC_BUFRING
5220 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5221 #endif
5222 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5223 
5224 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5225 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5226 	    M_DEVBUF, M_WAITOK | M_ZERO);
5227 #ifndef HN_USE_TXDESC_BUFRING
5228 	SLIST_INIT(&txr->hn_txlist);
5229 #else
5230 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5231 	    M_WAITOK, &txr->hn_tx_lock);
5232 #endif
5233 
5234 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5235 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5236 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5237 	} else {
5238 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5239 	}
5240 
5241 #ifdef HN_IFSTART_SUPPORT
5242 	if (hn_use_if_start) {
5243 		txr->hn_txeof = hn_start_txeof;
5244 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5245 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5246 	} else
5247 #endif
5248 	{
5249 		int br_depth;
5250 
5251 		txr->hn_txeof = hn_xmit_txeof;
5252 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5253 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5254 
5255 		br_depth = hn_get_txswq_depth(txr);
5256 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5257 		    M_WAITOK, &txr->hn_tx_lock);
5258 	}
5259 
5260 	txr->hn_direct_tx_size = hn_direct_tx_size;
5261 
5262 	/*
5263 	 * Always schedule transmission instead of trying to do direct
5264 	 * transmission.  This one gives the best performance so far.
5265 	 */
5266 	txr->hn_sched_tx = 1;
5267 
5268 	parent_dtag = bus_get_dma_tag(dev);
5269 
5270 	/* DMA tag for RNDIS packet messages. */
5271 	error = bus_dma_tag_create(parent_dtag, /* parent */
5272 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5273 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5274 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5275 	    BUS_SPACE_MAXADDR,		/* highaddr */
5276 	    NULL, NULL,			/* filter, filterarg */
5277 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5278 	    1,				/* nsegments */
5279 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5280 	    0,				/* flags */
5281 	    NULL,			/* lockfunc */
5282 	    NULL,			/* lockfuncarg */
5283 	    &txr->hn_tx_rndis_dtag);
5284 	if (error) {
5285 		device_printf(dev, "failed to create rndis dmatag\n");
5286 		return error;
5287 	}
5288 
5289 	/* DMA tag for data. */
5290 	error = bus_dma_tag_create(parent_dtag, /* parent */
5291 	    1,				/* alignment */
5292 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5293 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5294 	    BUS_SPACE_MAXADDR,		/* highaddr */
5295 	    NULL, NULL,			/* filter, filterarg */
5296 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5297 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5298 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5299 	    0,				/* flags */
5300 	    NULL,			/* lockfunc */
5301 	    NULL,			/* lockfuncarg */
5302 	    &txr->hn_tx_data_dtag);
5303 	if (error) {
5304 		device_printf(dev, "failed to create data dmatag\n");
5305 		return error;
5306 	}
5307 
5308 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5309 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5310 
5311 		txd->txr = txr;
5312 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5313 		STAILQ_INIT(&txd->agg_list);
5314 
5315 		/*
5316 		 * Allocate and load RNDIS packet message.
5317 		 */
5318         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5319 		    (void **)&txd->rndis_pkt,
5320 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5321 		    &txd->rndis_pkt_dmap);
5322 		if (error) {
5323 			device_printf(dev,
5324 			    "failed to allocate rndis_packet_msg, %d\n", i);
5325 			return error;
5326 		}
5327 
5328 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5329 		    txd->rndis_pkt_dmap,
5330 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5331 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5332 		    BUS_DMA_NOWAIT);
5333 		if (error) {
5334 			device_printf(dev,
5335 			    "failed to load rndis_packet_msg, %d\n", i);
5336 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5337 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5338 			return error;
5339 		}
5340 
5341 		/* DMA map for TX data. */
5342 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5343 		    &txd->data_dmap);
5344 		if (error) {
5345 			device_printf(dev,
5346 			    "failed to allocate tx data dmamap\n");
5347 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5348 			    txd->rndis_pkt_dmap);
5349 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5350 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5351 			return error;
5352 		}
5353 
5354 		/* All set, put it to list */
5355 		txd->flags |= HN_TXD_FLAG_ONLIST;
5356 #ifndef HN_USE_TXDESC_BUFRING
5357 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5358 #else
5359 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5360 #endif
5361 	}
5362 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5363 
5364 	if (sc->hn_tx_sysctl_tree != NULL) {
5365 		struct sysctl_oid_list *child;
5366 		struct sysctl_ctx_list *ctx;
5367 		char name[16];
5368 
5369 		/*
5370 		 * Create per TX ring sysctl tree:
5371 		 * dev.hn.UNIT.tx.RINGID
5372 		 */
5373 		ctx = device_get_sysctl_ctx(dev);
5374 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5375 
5376 		snprintf(name, sizeof(name), "%d", id);
5377 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5378 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5379 
5380 		if (txr->hn_tx_sysctl_tree != NULL) {
5381 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5382 
5383 #ifdef HN_DEBUG
5384 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5385 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5386 			    "# of available TX descs");
5387 #endif
5388 #ifdef HN_IFSTART_SUPPORT
5389 			if (!hn_use_if_start)
5390 #endif
5391 			{
5392 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5393 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5394 				    "over active");
5395 			}
5396 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5397 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5398 			    "# of packets transmitted");
5399 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5400 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5401 			    "# of sends");
5402 		}
5403 	}
5404 
5405 	return 0;
5406 }
5407 
5408 static void
5409 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5410 {
5411 	struct hn_tx_ring *txr = txd->txr;
5412 
5413 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5414 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5415 
5416 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5417 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5418 	    txd->rndis_pkt_dmap);
5419 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5420 }
5421 
5422 static void
5423 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5424 {
5425 
5426 	KASSERT(txd->refs == 0 || txd->refs == 1,
5427 	    ("invalid txd refs %d", txd->refs));
5428 
5429 	/* Aggregated txds will be freed by their aggregating txd. */
5430 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5431 		int freed __diagused;
5432 
5433 		freed = hn_txdesc_put(txr, txd);
5434 		KASSERT(freed, ("can't free txdesc"));
5435 	}
5436 }
5437 
5438 static void
5439 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5440 {
5441 	int i;
5442 
5443 	if (txr->hn_txdesc == NULL)
5444 		return;
5445 
5446 	/*
5447 	 * NOTE:
5448 	 * Because the freeing of aggregated txds will be deferred
5449 	 * to the aggregating txd, two passes are used here:
5450 	 * - The first pass GCes any pending txds.  This GC is necessary,
5451 	 *   since if the channels are revoked, hypervisor will not
5452 	 *   deliver send-done for all pending txds.
5453 	 * - The second pass frees the busdma stuffs, i.e. after all txds
5454 	 *   were freed.
5455 	 */
5456 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5457 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5458 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5459 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5460 
5461 	if (txr->hn_tx_data_dtag != NULL)
5462 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5463 	if (txr->hn_tx_rndis_dtag != NULL)
5464 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5465 
5466 #ifdef HN_USE_TXDESC_BUFRING
5467 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5468 #endif
5469 
5470 	free(txr->hn_txdesc, M_DEVBUF);
5471 	txr->hn_txdesc = NULL;
5472 
5473 	if (txr->hn_mbuf_br != NULL)
5474 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5475 
5476 #ifndef HN_USE_TXDESC_BUFRING
5477 	mtx_destroy(&txr->hn_txlist_spin);
5478 #endif
5479 	mtx_destroy(&txr->hn_tx_lock);
5480 }
5481 
5482 static int
5483 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5484 {
5485 	struct sysctl_oid_list *child;
5486 	struct sysctl_ctx_list *ctx;
5487 	int i;
5488 
5489 	/*
5490 	 * Create TXBUF for chimney sending.
5491 	 *
5492 	 * NOTE: It is shared by all channels.
5493 	 */
5494 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5495 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5496 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5497 	if (sc->hn_chim == NULL) {
5498 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5499 		return (ENOMEM);
5500 	}
5501 
5502 	sc->hn_tx_ring_cnt = ring_cnt;
5503 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5504 
5505 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5506 	    M_DEVBUF, M_WAITOK | M_ZERO);
5507 
5508 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5509 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5510 
5511 	/* Create dev.hn.UNIT.tx sysctl tree */
5512 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5513 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5514 
5515 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5516 		int error;
5517 
5518 		error = hn_tx_ring_create(sc, i);
5519 		if (error)
5520 			return error;
5521 	}
5522 
5523 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5524 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5525 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5526 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5527 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5528 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5529 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5530 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5531 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5532 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5533 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5534 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5535 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5536 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5537 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5538 	    hn_tx_stat_ulong_sysctl, "LU",
5539 	    "# of packet transmission aggregation flush failure");
5540 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5541 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5542 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5543 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5544 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5545 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5546 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5547 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5548 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5549 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5550 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5551 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5552 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5553 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5554 	    "# of total TX descs");
5555 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5556 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5557 	    "Chimney send packet size upper boundary");
5558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5559 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5560 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5561 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5562 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5563 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5564 	    hn_tx_conf_int_sysctl, "I",
5565 	    "Size of the packet for direct transmission");
5566 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5567 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5568 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5569 	    hn_tx_conf_int_sysctl, "I",
5570 	    "Always schedule transmission "
5571 	    "instead of doing direct transmission");
5572 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5573 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5574 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5575 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5576 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5577 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5578 	    "Applied packet transmission aggregation size");
5579 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5580 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5581 	    hn_txagg_pktmax_sysctl, "I",
5582 	    "Applied packet transmission aggregation packets");
5583 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5584 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5585 	    hn_txagg_align_sysctl, "I",
5586 	    "Applied packet transmission aggregation alignment");
5587 
5588 	return 0;
5589 }
5590 
5591 static void
5592 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5593 {
5594 	int i;
5595 
5596 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5597 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5598 }
5599 
5600 static void
5601 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5602 {
5603 	if_t ifp = sc->hn_ifp;
5604 	u_int hw_tsomax;
5605 	int tso_minlen;
5606 
5607 	HN_LOCK_ASSERT(sc);
5608 
5609 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5610 		return;
5611 
5612 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5613 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5614 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5615 
5616 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5617 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5618 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5619 
5620 	if (tso_maxlen < tso_minlen)
5621 		tso_maxlen = tso_minlen;
5622 	else if (tso_maxlen > IP_MAXPACKET)
5623 		tso_maxlen = IP_MAXPACKET;
5624 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5625 		tso_maxlen = sc->hn_ndis_tso_szmax;
5626 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5627 
5628 	if (hn_xpnt_vf_isready(sc)) {
5629 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5630 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5631 	}
5632 	if_sethwtsomax(ifp, hw_tsomax);
5633 	if (bootverbose)
5634 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5635 }
5636 
5637 static void
5638 hn_fixup_tx_data(struct hn_softc *sc)
5639 {
5640 	uint64_t csum_assist;
5641 	int i;
5642 
5643 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5644 	if (hn_tx_chimney_size > 0 &&
5645 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5646 		hn_set_chim_size(sc, hn_tx_chimney_size);
5647 
5648 	csum_assist = 0;
5649 	if (sc->hn_caps & HN_CAP_IPCS)
5650 		csum_assist |= CSUM_IP;
5651 	if (sc->hn_caps & HN_CAP_TCP4CS)
5652 		csum_assist |= CSUM_IP_TCP;
5653 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5654 		csum_assist |= CSUM_IP_UDP;
5655 	if (sc->hn_caps & HN_CAP_TCP6CS)
5656 		csum_assist |= CSUM_IP6_TCP;
5657 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5658 		csum_assist |= CSUM_IP6_UDP;
5659 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5660 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5661 
5662 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5663 		/*
5664 		 * Support HASHVAL pktinfo on TX path.
5665 		 */
5666 		if (bootverbose)
5667 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5668 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5669 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5670 	}
5671 }
5672 
5673 static void
5674 hn_fixup_rx_data(struct hn_softc *sc)
5675 {
5676 
5677 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5678 		int i;
5679 
5680 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5681 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5682 	}
5683 }
5684 
5685 static void
5686 hn_destroy_tx_data(struct hn_softc *sc)
5687 {
5688 	int i;
5689 
5690 	if (sc->hn_chim != NULL) {
5691 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5692 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5693 		} else {
5694 			device_printf(sc->hn_dev,
5695 			    "chimney sending buffer is referenced");
5696 		}
5697 		sc->hn_chim = NULL;
5698 	}
5699 
5700 	if (sc->hn_tx_ring_cnt == 0)
5701 		return;
5702 
5703 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5704 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5705 
5706 	free(sc->hn_tx_ring, M_DEVBUF);
5707 	sc->hn_tx_ring = NULL;
5708 
5709 	sc->hn_tx_ring_cnt = 0;
5710 	sc->hn_tx_ring_inuse = 0;
5711 }
5712 
5713 #ifdef HN_IFSTART_SUPPORT
5714 
5715 static void
5716 hn_start_taskfunc(void *xtxr, int pending __unused)
5717 {
5718 	struct hn_tx_ring *txr = xtxr;
5719 
5720 	mtx_lock(&txr->hn_tx_lock);
5721 	hn_start_locked(txr, 0);
5722 	mtx_unlock(&txr->hn_tx_lock);
5723 }
5724 
5725 static int
5726 hn_start_locked(struct hn_tx_ring *txr, int len)
5727 {
5728 	struct hn_softc *sc = txr->hn_sc;
5729 	if_t ifp = sc->hn_ifp;
5730 	int sched = 0;
5731 
5732 	KASSERT(hn_use_if_start,
5733 	    ("hn_start_locked is called, when if_start is disabled"));
5734 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5735 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5736 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5737 
5738 	if (__predict_false(txr->hn_suspended))
5739 		return (0);
5740 
5741 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5742 	    IFF_DRV_RUNNING)
5743 		return (0);
5744 
5745 	while (!if_sendq_empty(ifp)) {
5746 		struct hn_txdesc *txd;
5747 		struct mbuf *m_head;
5748 		int error;
5749 
5750 		m_head = if_dequeue(ifp);
5751 		if (m_head == NULL)
5752 			break;
5753 
5754 		if (len > 0 && m_head->m_pkthdr.len > len) {
5755 			/*
5756 			 * This sending could be time consuming; let callers
5757 			 * dispatch this packet sending (and sending of any
5758 			 * following up packets) to tx taskqueue.
5759 			 */
5760 			if_sendq_prepend(ifp, m_head);
5761 			sched = 1;
5762 			break;
5763 		}
5764 
5765 #if defined(INET6) || defined(INET)
5766 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5767 			m_head = hn_tso_fixup(m_head);
5768 			if (__predict_false(m_head == NULL)) {
5769 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5770 				continue;
5771 			}
5772 		} else if (m_head->m_pkthdr.csum_flags &
5773 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5774 			m_head = hn_set_hlen(m_head);
5775 			if (__predict_false(m_head == NULL)) {
5776 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5777 				continue;
5778 			}
5779 		}
5780 #endif
5781 
5782 		txd = hn_txdesc_get(txr);
5783 		if (txd == NULL) {
5784 			txr->hn_no_txdescs++;
5785 			if_sendq_prepend(ifp, m_head);
5786 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5787 			break;
5788 		}
5789 
5790 		error = hn_encap(ifp, txr, txd, &m_head);
5791 		if (error) {
5792 			/* Both txd and m_head are freed */
5793 			KASSERT(txr->hn_agg_txd == NULL,
5794 			    ("encap failed w/ pending aggregating txdesc"));
5795 			continue;
5796 		}
5797 
5798 		if (txr->hn_agg_pktleft == 0) {
5799 			if (txr->hn_agg_txd != NULL) {
5800 				KASSERT(m_head == NULL,
5801 				    ("pending mbuf for aggregating txdesc"));
5802 				error = hn_flush_txagg(ifp, txr);
5803 				if (__predict_false(error)) {
5804 					if_setdrvflagbits(ifp,
5805 					    IFF_DRV_OACTIVE, 0);
5806 					break;
5807 				}
5808 			} else {
5809 				KASSERT(m_head != NULL, ("mbuf was freed"));
5810 				error = hn_txpkt(ifp, txr, txd);
5811 				if (__predict_false(error)) {
5812 					/* txd is freed, but m_head is not */
5813 					if_sendq_prepend(ifp, m_head);
5814 					if_setdrvflagbits(ifp,
5815 					    IFF_DRV_OACTIVE, 0);
5816 					break;
5817 				}
5818 			}
5819 		}
5820 #ifdef INVARIANTS
5821 		else {
5822 			KASSERT(txr->hn_agg_txd != NULL,
5823 			    ("no aggregating txdesc"));
5824 			KASSERT(m_head == NULL,
5825 			    ("pending mbuf for aggregating txdesc"));
5826 		}
5827 #endif
5828 	}
5829 
5830 	/* Flush pending aggerated transmission. */
5831 	if (txr->hn_agg_txd != NULL)
5832 		hn_flush_txagg(ifp, txr);
5833 	return (sched);
5834 }
5835 
5836 static void
5837 hn_start(if_t ifp)
5838 {
5839 	struct hn_softc *sc = if_getsoftc(ifp);
5840 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5841 
5842 	if (txr->hn_sched_tx)
5843 		goto do_sched;
5844 
5845 	if (mtx_trylock(&txr->hn_tx_lock)) {
5846 		int sched;
5847 
5848 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5849 		mtx_unlock(&txr->hn_tx_lock);
5850 		if (!sched)
5851 			return;
5852 	}
5853 do_sched:
5854 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5855 }
5856 
5857 static void
5858 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5859 {
5860 	struct hn_tx_ring *txr = xtxr;
5861 
5862 	mtx_lock(&txr->hn_tx_lock);
5863 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5864 	hn_start_locked(txr, 0);
5865 	mtx_unlock(&txr->hn_tx_lock);
5866 }
5867 
5868 static void
5869 hn_start_txeof(struct hn_tx_ring *txr)
5870 {
5871 	struct hn_softc *sc = txr->hn_sc;
5872 	if_t ifp = sc->hn_ifp;
5873 
5874 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5875 
5876 	if (txr->hn_sched_tx)
5877 		goto do_sched;
5878 
5879 	if (mtx_trylock(&txr->hn_tx_lock)) {
5880 		int sched;
5881 
5882 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5883 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5884 		mtx_unlock(&txr->hn_tx_lock);
5885 		if (sched) {
5886 			taskqueue_enqueue(txr->hn_tx_taskq,
5887 			    &txr->hn_tx_task);
5888 		}
5889 	} else {
5890 do_sched:
5891 		/*
5892 		 * Release the OACTIVE earlier, with the hope, that
5893 		 * others could catch up.  The task will clear the
5894 		 * flag again with the hn_tx_lock to avoid possible
5895 		 * races.
5896 		 */
5897 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5898 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5899 	}
5900 }
5901 
5902 #endif	/* HN_IFSTART_SUPPORT */
5903 
5904 static int
5905 hn_xmit(struct hn_tx_ring *txr, int len)
5906 {
5907 	struct hn_softc *sc = txr->hn_sc;
5908 	if_t ifp = sc->hn_ifp;
5909 	struct mbuf *m_head;
5910 	int sched = 0;
5911 
5912 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5913 #ifdef HN_IFSTART_SUPPORT
5914 	KASSERT(hn_use_if_start == 0,
5915 	    ("hn_xmit is called, when if_start is enabled"));
5916 #endif
5917 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5918 
5919 	if (__predict_false(txr->hn_suspended))
5920 		return (0);
5921 
5922 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5923 		return (0);
5924 
5925 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5926 		struct hn_txdesc *txd;
5927 		int error;
5928 
5929 		if (len > 0 && m_head->m_pkthdr.len > len) {
5930 			/*
5931 			 * This sending could be time consuming; let callers
5932 			 * dispatch this packet sending (and sending of any
5933 			 * following up packets) to tx taskqueue.
5934 			 */
5935 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5936 			sched = 1;
5937 			break;
5938 		}
5939 
5940 		txd = hn_txdesc_get(txr);
5941 		if (txd == NULL) {
5942 			txr->hn_no_txdescs++;
5943 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5944 			txr->hn_oactive = 1;
5945 			break;
5946 		}
5947 
5948 		error = hn_encap(ifp, txr, txd, &m_head);
5949 		if (error) {
5950 			/* Both txd and m_head are freed; discard */
5951 			KASSERT(txr->hn_agg_txd == NULL,
5952 			    ("encap failed w/ pending aggregating txdesc"));
5953 			drbr_advance(ifp, txr->hn_mbuf_br);
5954 			continue;
5955 		}
5956 
5957 		if (txr->hn_agg_pktleft == 0) {
5958 			if (txr->hn_agg_txd != NULL) {
5959 				KASSERT(m_head == NULL,
5960 				    ("pending mbuf for aggregating txdesc"));
5961 				error = hn_flush_txagg(ifp, txr);
5962 				if (__predict_false(error)) {
5963 					txr->hn_oactive = 1;
5964 					break;
5965 				}
5966 			} else {
5967 				KASSERT(m_head != NULL, ("mbuf was freed"));
5968 				error = hn_txpkt(ifp, txr, txd);
5969 				if (__predict_false(error)) {
5970 					/* txd is freed, but m_head is not */
5971 					drbr_putback(ifp, txr->hn_mbuf_br,
5972 					    m_head);
5973 					txr->hn_oactive = 1;
5974 					break;
5975 				}
5976 			}
5977 		}
5978 #ifdef INVARIANTS
5979 		else {
5980 			KASSERT(txr->hn_agg_txd != NULL,
5981 			    ("no aggregating txdesc"));
5982 			KASSERT(m_head == NULL,
5983 			    ("pending mbuf for aggregating txdesc"));
5984 		}
5985 #endif
5986 
5987 		/* Sent */
5988 		drbr_advance(ifp, txr->hn_mbuf_br);
5989 	}
5990 
5991 	/* Flush pending aggerated transmission. */
5992 	if (txr->hn_agg_txd != NULL)
5993 		hn_flush_txagg(ifp, txr);
5994 	return (sched);
5995 }
5996 
5997 static int
5998 hn_transmit(if_t ifp, struct mbuf *m)
5999 {
6000 	struct hn_softc *sc = if_getsoftc(ifp);
6001 	struct hn_tx_ring *txr;
6002 	int error, idx = 0;
6003 
6004 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
6005 		struct rm_priotracker pt;
6006 
6007 		rm_rlock(&sc->hn_vf_lock, &pt);
6008 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6009 			struct mbuf *m_bpf = NULL;
6010 			int obytes, omcast;
6011 
6012 			obytes = m->m_pkthdr.len;
6013 			omcast = (m->m_flags & M_MCAST) != 0;
6014 
6015 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6016 				if (bpf_peers_present(if_getbpf(ifp))) {
6017 					m_bpf = m_copypacket(m, M_NOWAIT);
6018 					if (m_bpf == NULL) {
6019 						/*
6020 						 * Failed to grab a shallow
6021 						 * copy; tap now.
6022 						 */
6023 						ETHER_BPF_MTAP(ifp, m);
6024 					}
6025 				}
6026 			} else {
6027 				ETHER_BPF_MTAP(ifp, m);
6028 			}
6029 
6030 			error = if_transmit(sc->hn_vf_ifp, m);
6031 			rm_runlock(&sc->hn_vf_lock, &pt);
6032 
6033 			if (m_bpf != NULL) {
6034 				if (!error)
6035 					ETHER_BPF_MTAP(ifp, m_bpf);
6036 				m_freem(m_bpf);
6037 			}
6038 
6039 			if (error == ENOBUFS) {
6040 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6041 			} else if (error) {
6042 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6043 			} else {
6044 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6045 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6046 				if (omcast) {
6047 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6048 					    omcast);
6049 				}
6050 			}
6051 			return (error);
6052 		}
6053 		rm_runlock(&sc->hn_vf_lock, &pt);
6054 	}
6055 
6056 #if defined(INET6) || defined(INET)
6057 	/*
6058 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6059 	 * since packet headers should be cache-hot.
6060 	 */
6061 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6062 		m = hn_tso_fixup(m);
6063 		if (__predict_false(m == NULL)) {
6064 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6065 			return EIO;
6066 		}
6067 	} else if (m->m_pkthdr.csum_flags &
6068 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6069 		m = hn_set_hlen(m);
6070 		if (__predict_false(m == NULL)) {
6071 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6072 			return EIO;
6073 		}
6074 	}
6075 #endif
6076 
6077 	/*
6078 	 * Select the TX ring based on flowid
6079 	 */
6080 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6081 #ifdef RSS
6082 		uint32_t bid;
6083 
6084 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6085 		    &bid) == 0)
6086 			idx = bid % sc->hn_tx_ring_inuse;
6087 		else
6088 #endif
6089 		{
6090 #if defined(INET6) || defined(INET)
6091 			int tcpsyn = 0;
6092 
6093 			if (m->m_pkthdr.len < 128 &&
6094 			    (m->m_pkthdr.csum_flags &
6095 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6096 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6097 				m = hn_check_tcpsyn(m, &tcpsyn);
6098 				if (__predict_false(m == NULL)) {
6099 					if_inc_counter(ifp,
6100 					    IFCOUNTER_OERRORS, 1);
6101 					return (EIO);
6102 				}
6103 			}
6104 #else
6105 			const int tcpsyn = 0;
6106 #endif
6107 			if (tcpsyn)
6108 				idx = 0;
6109 			else
6110 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6111 		}
6112 	}
6113 	txr = &sc->hn_tx_ring[idx];
6114 
6115 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6116 	if (error) {
6117 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6118 		return error;
6119 	}
6120 
6121 	if (txr->hn_oactive)
6122 		return 0;
6123 
6124 	if (txr->hn_sched_tx)
6125 		goto do_sched;
6126 
6127 	if (mtx_trylock(&txr->hn_tx_lock)) {
6128 		int sched;
6129 
6130 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6131 		mtx_unlock(&txr->hn_tx_lock);
6132 		if (!sched)
6133 			return 0;
6134 	}
6135 do_sched:
6136 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6137 	return 0;
6138 }
6139 
6140 static void
6141 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6142 {
6143 	struct mbuf *m;
6144 
6145 	mtx_lock(&txr->hn_tx_lock);
6146 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6147 		m_freem(m);
6148 	mtx_unlock(&txr->hn_tx_lock);
6149 }
6150 
6151 static void
6152 hn_xmit_qflush(if_t ifp)
6153 {
6154 	struct hn_softc *sc = if_getsoftc(ifp);
6155 	struct rm_priotracker pt;
6156 	int i;
6157 
6158 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6159 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6160 	if_qflush(ifp);
6161 
6162 	rm_rlock(&sc->hn_vf_lock, &pt);
6163 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6164 		if_qflush(sc->hn_vf_ifp);
6165 	rm_runlock(&sc->hn_vf_lock, &pt);
6166 }
6167 
6168 static void
6169 hn_xmit_txeof(struct hn_tx_ring *txr)
6170 {
6171 
6172 	if (txr->hn_sched_tx)
6173 		goto do_sched;
6174 
6175 	if (mtx_trylock(&txr->hn_tx_lock)) {
6176 		int sched;
6177 
6178 		txr->hn_oactive = 0;
6179 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6180 		mtx_unlock(&txr->hn_tx_lock);
6181 		if (sched) {
6182 			taskqueue_enqueue(txr->hn_tx_taskq,
6183 			    &txr->hn_tx_task);
6184 		}
6185 	} else {
6186 do_sched:
6187 		/*
6188 		 * Release the oactive earlier, with the hope, that
6189 		 * others could catch up.  The task will clear the
6190 		 * oactive again with the hn_tx_lock to avoid possible
6191 		 * races.
6192 		 */
6193 		txr->hn_oactive = 0;
6194 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6195 	}
6196 }
6197 
6198 static void
6199 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6200 {
6201 	struct hn_tx_ring *txr = xtxr;
6202 
6203 	mtx_lock(&txr->hn_tx_lock);
6204 	hn_xmit(txr, 0);
6205 	mtx_unlock(&txr->hn_tx_lock);
6206 }
6207 
6208 static void
6209 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6210 {
6211 	struct hn_tx_ring *txr = xtxr;
6212 
6213 	mtx_lock(&txr->hn_tx_lock);
6214 	txr->hn_oactive = 0;
6215 	hn_xmit(txr, 0);
6216 	mtx_unlock(&txr->hn_tx_lock);
6217 }
6218 
6219 static int
6220 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6221 {
6222 	struct vmbus_chan_br cbr;
6223 	struct hn_rx_ring *rxr;
6224 	struct hn_tx_ring *txr = NULL;
6225 	int idx, error;
6226 
6227 	idx = vmbus_chan_subidx(chan);
6228 
6229 	/*
6230 	 * Link this channel to RX/TX ring.
6231 	 */
6232 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6233 	    ("invalid channel index %d, should > 0 && < %d",
6234 	     idx, sc->hn_rx_ring_inuse));
6235 	rxr = &sc->hn_rx_ring[idx];
6236 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6237 	    ("RX ring %d already attached", idx));
6238 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6239 	rxr->hn_chan = chan;
6240 
6241 	if (bootverbose) {
6242 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6243 		    idx, vmbus_chan_id(chan));
6244 	}
6245 
6246 	if (idx < sc->hn_tx_ring_inuse) {
6247 		txr = &sc->hn_tx_ring[idx];
6248 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6249 		    ("TX ring %d already attached", idx));
6250 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6251 
6252 		txr->hn_chan = chan;
6253 		if (bootverbose) {
6254 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6255 			    idx, vmbus_chan_id(chan));
6256 		}
6257 	}
6258 
6259 	/* Bind this channel to a proper CPU. */
6260 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6261 
6262 	/*
6263 	 * Open this channel
6264 	 */
6265 	cbr.cbr = rxr->hn_br;
6266 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6267 	cbr.cbr_txsz = HN_TXBR_SIZE;
6268 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6269 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6270 	if (error) {
6271 		if (error == EISCONN) {
6272 			if_printf(sc->hn_ifp, "bufring is connected after "
6273 			    "chan%u open failure\n", vmbus_chan_id(chan));
6274 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6275 		} else {
6276 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6277 			    vmbus_chan_id(chan), error);
6278 		}
6279 	}
6280 	return (error);
6281 }
6282 
6283 static void
6284 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6285 {
6286 	struct hn_rx_ring *rxr;
6287 	int idx, error;
6288 
6289 	idx = vmbus_chan_subidx(chan);
6290 
6291 	/*
6292 	 * Link this channel to RX/TX ring.
6293 	 */
6294 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6295 	    ("invalid channel index %d, should > 0 && < %d",
6296 	     idx, sc->hn_rx_ring_inuse));
6297 	rxr = &sc->hn_rx_ring[idx];
6298 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6299 	    ("RX ring %d is not attached", idx));
6300 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6301 
6302 	if (idx < sc->hn_tx_ring_inuse) {
6303 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6304 
6305 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6306 		    ("TX ring %d is not attached attached", idx));
6307 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6308 	}
6309 
6310 	/*
6311 	 * Close this channel.
6312 	 *
6313 	 * NOTE:
6314 	 * Channel closing does _not_ destroy the target channel.
6315 	 */
6316 	error = vmbus_chan_close_direct(chan);
6317 	if (error == EISCONN) {
6318 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6319 		    "after being closed\n", vmbus_chan_id(chan));
6320 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6321 	} else if (error) {
6322 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6323 		    vmbus_chan_id(chan), error);
6324 	}
6325 }
6326 
6327 static int
6328 hn_attach_subchans(struct hn_softc *sc)
6329 {
6330 	struct vmbus_channel **subchans;
6331 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6332 	int i, error = 0;
6333 
6334 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6335 
6336 	/* Attach the sub-channels. */
6337 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6338 	for (i = 0; i < subchan_cnt; ++i) {
6339 		int error1;
6340 
6341 		error1 = hn_chan_attach(sc, subchans[i]);
6342 		if (error1) {
6343 			error = error1;
6344 			/* Move on; all channels will be detached later. */
6345 		}
6346 	}
6347 	vmbus_subchan_rel(subchans, subchan_cnt);
6348 
6349 	if (error) {
6350 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6351 	} else {
6352 		if (bootverbose) {
6353 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6354 			    subchan_cnt);
6355 		}
6356 	}
6357 	return (error);
6358 }
6359 
6360 static void
6361 hn_detach_allchans(struct hn_softc *sc)
6362 {
6363 	struct vmbus_channel **subchans;
6364 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6365 	int i;
6366 
6367 	if (subchan_cnt == 0)
6368 		goto back;
6369 
6370 	/* Detach the sub-channels. */
6371 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6372 	for (i = 0; i < subchan_cnt; ++i)
6373 		hn_chan_detach(sc, subchans[i]);
6374 	vmbus_subchan_rel(subchans, subchan_cnt);
6375 
6376 back:
6377 	/*
6378 	 * Detach the primary channel, _after_ all sub-channels
6379 	 * are detached.
6380 	 */
6381 	hn_chan_detach(sc, sc->hn_prichan);
6382 
6383 	/* Wait for sub-channels to be destroyed, if any. */
6384 	vmbus_subchan_drain(sc->hn_prichan);
6385 
6386 #ifdef INVARIANTS
6387 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6388 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6389 		    HN_RX_FLAG_ATTACHED) == 0,
6390 		    ("%dth RX ring is still attached", i));
6391 	}
6392 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6393 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6394 		    HN_TX_FLAG_ATTACHED) == 0,
6395 		    ("%dth TX ring is still attached", i));
6396 	}
6397 #endif
6398 }
6399 
6400 static int
6401 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6402 {
6403 	struct vmbus_channel **subchans;
6404 	int nchan, rxr_cnt, error;
6405 
6406 	nchan = *nsubch + 1;
6407 	if (nchan == 1) {
6408 		/*
6409 		 * Multiple RX/TX rings are not requested.
6410 		 */
6411 		*nsubch = 0;
6412 		return (0);
6413 	}
6414 
6415 	/*
6416 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6417 	 * table entries.
6418 	 */
6419 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6420 	if (error) {
6421 		/* No RSS; this is benign. */
6422 		*nsubch = 0;
6423 		return (0);
6424 	}
6425 	if (bootverbose) {
6426 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6427 		    rxr_cnt, nchan);
6428 	}
6429 
6430 	if (nchan > rxr_cnt)
6431 		nchan = rxr_cnt;
6432 	if (nchan == 1) {
6433 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6434 		*nsubch = 0;
6435 		return (0);
6436 	}
6437 
6438 	/*
6439 	 * Allocate sub-channels from NVS.
6440 	 */
6441 	*nsubch = nchan - 1;
6442 	error = hn_nvs_alloc_subchans(sc, nsubch);
6443 	if (error || *nsubch == 0) {
6444 		/* Failed to allocate sub-channels. */
6445 		*nsubch = 0;
6446 		return (0);
6447 	}
6448 
6449 	/*
6450 	 * Wait for all sub-channels to become ready before moving on.
6451 	 */
6452 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6453 	vmbus_subchan_rel(subchans, *nsubch);
6454 	return (0);
6455 }
6456 
6457 static bool
6458 hn_synth_attachable(const struct hn_softc *sc)
6459 {
6460 	int i;
6461 
6462 	if (sc->hn_flags & HN_FLAG_ERRORS)
6463 		return (false);
6464 
6465 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6466 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6467 
6468 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6469 			return (false);
6470 	}
6471 	return (true);
6472 }
6473 
6474 /*
6475  * Make sure that the RX filter is zero after the successful
6476  * RNDIS initialization.
6477  *
6478  * NOTE:
6479  * Under certain conditions on certain versions of Hyper-V,
6480  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6481  * after the successful RNDIS initialization, which breaks
6482  * the assumption of any following code (well, it breaks the
6483  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6484  * explicitly, drain packets sneaking through, and drain the
6485  * interrupt taskqueues scheduled due to the stealth packets.
6486  */
6487 static void
6488 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6489 {
6490 
6491 	hn_disable_rx(sc);
6492 	hn_drain_rxtx(sc, nchan);
6493 }
6494 
6495 static int
6496 hn_synth_attach(struct hn_softc *sc, int mtu)
6497 {
6498 #define ATTACHED_NVS		0x0002
6499 #define ATTACHED_RNDIS		0x0004
6500 
6501 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6502 	int error, nsubch, nchan = 1, i, rndis_inited;
6503 	uint32_t old_caps, attached = 0;
6504 
6505 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6506 	    ("synthetic parts were attached"));
6507 
6508 	if (!hn_synth_attachable(sc))
6509 		return (ENXIO);
6510 
6511 	/* Save capabilities for later verification. */
6512 	old_caps = sc->hn_caps;
6513 	sc->hn_caps = 0;
6514 
6515 	/* Clear RSS stuffs. */
6516 	sc->hn_rss_ind_size = 0;
6517 	sc->hn_rss_hash = 0;
6518 	sc->hn_rss_hcap = 0;
6519 
6520 	/*
6521 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6522 	 */
6523 	error = hn_chan_attach(sc, sc->hn_prichan);
6524 	if (error)
6525 		goto failed;
6526 
6527 	/*
6528 	 * Attach NVS.
6529 	 */
6530 	error = hn_nvs_attach(sc, mtu);
6531 	if (error)
6532 		goto failed;
6533 	attached |= ATTACHED_NVS;
6534 
6535 	/*
6536 	 * Attach RNDIS _after_ NVS is attached.
6537 	 */
6538 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6539 	if (rndis_inited)
6540 		attached |= ATTACHED_RNDIS;
6541 	if (error)
6542 		goto failed;
6543 
6544 	/*
6545 	 * Make sure capabilities are not changed.
6546 	 */
6547 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6548 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6549 		    old_caps, sc->hn_caps);
6550 		error = ENXIO;
6551 		goto failed;
6552 	}
6553 
6554 	/*
6555 	 * Allocate sub-channels for multi-TX/RX rings.
6556 	 *
6557 	 * NOTE:
6558 	 * The # of RX rings that can be used is equivalent to the # of
6559 	 * channels to be requested.
6560 	 */
6561 	nsubch = sc->hn_rx_ring_cnt - 1;
6562 	error = hn_synth_alloc_subchans(sc, &nsubch);
6563 	if (error)
6564 		goto failed;
6565 	/* NOTE: _Full_ synthetic parts detach is required now. */
6566 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6567 
6568 	/*
6569 	 * Set the # of TX/RX rings that could be used according to
6570 	 * the # of channels that NVS offered.
6571 	 */
6572 	nchan = nsubch + 1;
6573 	hn_set_ring_inuse(sc, nchan);
6574 	if (nchan == 1) {
6575 		/* Only the primary channel can be used; done */
6576 		goto back;
6577 	}
6578 
6579 	/*
6580 	 * Attach the sub-channels.
6581 	 *
6582 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6583 	 */
6584 	error = hn_attach_subchans(sc);
6585 	if (error)
6586 		goto failed;
6587 
6588 	/*
6589 	 * Configure RSS key and indirect table _after_ all sub-channels
6590 	 * are attached.
6591 	 */
6592 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6593 		/*
6594 		 * RSS key is not set yet; set it to the default RSS key.
6595 		 */
6596 		if (bootverbose)
6597 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6598 #ifdef RSS
6599 		rss_getkey(rss->rss_key);
6600 #else
6601 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6602 #endif
6603 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6604 	}
6605 
6606 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6607 		/*
6608 		 * RSS indirect table is not set yet; set it up in round-
6609 		 * robin fashion.
6610 		 */
6611 		if (bootverbose) {
6612 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6613 			    "table\n");
6614 		}
6615 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6616 			uint32_t subidx;
6617 
6618 #ifdef RSS
6619 			subidx = rss_get_indirection_to_bucket(i);
6620 #else
6621 			subidx = i;
6622 #endif
6623 			rss->rss_ind[i] = subidx % nchan;
6624 		}
6625 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6626 	} else {
6627 		/*
6628 		 * # of usable channels may be changed, so we have to
6629 		 * make sure that all entries in RSS indirect table
6630 		 * are valid.
6631 		 *
6632 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6633 		 */
6634 		hn_rss_ind_fixup(sc);
6635 	}
6636 
6637 	sc->hn_rss_hash = sc->hn_rss_hcap;
6638 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6639 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6640 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6641 		hn_vf_rss_fixup(sc, false);
6642 	}
6643 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6644 	if (error)
6645 		goto failed;
6646 back:
6647 	/*
6648 	 * Fixup transmission aggregation setup.
6649 	 */
6650 	hn_set_txagg(sc);
6651 	hn_rndis_init_fixat(sc, nchan);
6652 	return (0);
6653 
6654 failed:
6655 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6656 		hn_rndis_init_fixat(sc, nchan);
6657 		hn_synth_detach(sc);
6658 	} else {
6659 		if (attached & ATTACHED_RNDIS) {
6660 			hn_rndis_init_fixat(sc, nchan);
6661 			hn_rndis_detach(sc);
6662 		}
6663 		if (attached & ATTACHED_NVS)
6664 			hn_nvs_detach(sc);
6665 		hn_chan_detach(sc, sc->hn_prichan);
6666 		/* Restore old capabilities. */
6667 		sc->hn_caps = old_caps;
6668 	}
6669 	return (error);
6670 
6671 #undef ATTACHED_RNDIS
6672 #undef ATTACHED_NVS
6673 }
6674 
6675 /*
6676  * NOTE:
6677  * The interface must have been suspended though hn_suspend(), before
6678  * this function get called.
6679  */
6680 static void
6681 hn_synth_detach(struct hn_softc *sc)
6682 {
6683 
6684 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6685 	    ("synthetic parts were not attached"));
6686 
6687 	/* Detach the RNDIS first. */
6688 	hn_rndis_detach(sc);
6689 
6690 	/* Detach NVS. */
6691 	hn_nvs_detach(sc);
6692 
6693 	/* Detach all of the channels. */
6694 	hn_detach_allchans(sc);
6695 
6696 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6697 		/*
6698 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6699 		 */
6700 		int error;
6701 
6702 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6703 		    sc->hn_rxbuf_gpadl);
6704 		if (error) {
6705 			if_printf(sc->hn_ifp,
6706 			    "rxbuf gpadl disconn failed: %d\n", error);
6707 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6708 		}
6709 		sc->hn_rxbuf_gpadl = 0;
6710 	}
6711 
6712 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6713 		/*
6714 		 * Host is post-Win2016, disconnect chimney sending buffer from
6715 		 * primary channel here.
6716 		 */
6717 		int error;
6718 
6719 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6720 		    sc->hn_chim_gpadl);
6721 		if (error) {
6722 			if_printf(sc->hn_ifp,
6723 			    "chim gpadl disconn failed: %d\n", error);
6724 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6725 		}
6726 		sc->hn_chim_gpadl = 0;
6727 	}
6728 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6729 }
6730 
6731 static void
6732 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6733 {
6734 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6735 	    ("invalid ring count %d", ring_cnt));
6736 
6737 	if (sc->hn_tx_ring_cnt > ring_cnt)
6738 		sc->hn_tx_ring_inuse = ring_cnt;
6739 	else
6740 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6741 	sc->hn_rx_ring_inuse = ring_cnt;
6742 
6743 #ifdef RSS
6744 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6745 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6746 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6747 		    rss_getnumbuckets());
6748 	}
6749 #endif
6750 
6751 	if (bootverbose) {
6752 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6753 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6754 	}
6755 }
6756 
6757 static void
6758 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6759 {
6760 
6761 	/*
6762 	 * NOTE:
6763 	 * The TX bufring will not be drained by the hypervisor,
6764 	 * if the primary channel is revoked.
6765 	 */
6766 	while (!vmbus_chan_rx_empty(chan) ||
6767 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6768 	     !vmbus_chan_tx_empty(chan)))
6769 		pause("waitch", 1);
6770 	vmbus_chan_intr_drain(chan);
6771 }
6772 
6773 static void
6774 hn_disable_rx(struct hn_softc *sc)
6775 {
6776 
6777 	/*
6778 	 * Disable RX by clearing RX filter forcefully.
6779 	 */
6780 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6781 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6782 
6783 	/*
6784 	 * Give RNDIS enough time to flush all pending data packets.
6785 	 */
6786 	pause("waitrx", (200 * hz) / 1000);
6787 }
6788 
6789 /*
6790  * NOTE:
6791  * RX/TX _must_ have been suspended/disabled, before this function
6792  * is called.
6793  */
6794 static void
6795 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6796 {
6797 	struct vmbus_channel **subch = NULL;
6798 	int nsubch;
6799 
6800 	/*
6801 	 * Drain RX/TX bufrings and interrupts.
6802 	 */
6803 	nsubch = nchan - 1;
6804 	if (nsubch > 0)
6805 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6806 
6807 	if (subch != NULL) {
6808 		int i;
6809 
6810 		for (i = 0; i < nsubch; ++i)
6811 			hn_chan_drain(sc, subch[i]);
6812 	}
6813 	hn_chan_drain(sc, sc->hn_prichan);
6814 
6815 	if (subch != NULL)
6816 		vmbus_subchan_rel(subch, nsubch);
6817 }
6818 
6819 static void
6820 hn_suspend_data(struct hn_softc *sc)
6821 {
6822 	struct hn_tx_ring *txr;
6823 	int i;
6824 
6825 	HN_LOCK_ASSERT(sc);
6826 
6827 	/*
6828 	 * Suspend TX.
6829 	 */
6830 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6831 		txr = &sc->hn_tx_ring[i];
6832 
6833 		mtx_lock(&txr->hn_tx_lock);
6834 		txr->hn_suspended = 1;
6835 		mtx_unlock(&txr->hn_tx_lock);
6836 		/* No one is able send more packets now. */
6837 
6838 		/*
6839 		 * Wait for all pending sends to finish.
6840 		 *
6841 		 * NOTE:
6842 		 * We will _not_ receive all pending send-done, if the
6843 		 * primary channel is revoked.
6844 		 */
6845 		while (hn_tx_ring_pending(txr) &&
6846 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6847 			pause("hnwtx", 1 /* 1 tick */);
6848 	}
6849 
6850 	/*
6851 	 * Disable RX.
6852 	 */
6853 	hn_disable_rx(sc);
6854 
6855 	/*
6856 	 * Drain RX/TX.
6857 	 */
6858 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6859 
6860 	/*
6861 	 * Drain any pending TX tasks.
6862 	 *
6863 	 * NOTE:
6864 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6865 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6866 	 */
6867 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6868 		txr = &sc->hn_tx_ring[i];
6869 
6870 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6871 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6872 	}
6873 }
6874 
6875 static void
6876 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6877 {
6878 
6879 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6880 }
6881 
6882 static void
6883 hn_suspend_mgmt(struct hn_softc *sc)
6884 {
6885 	struct task task;
6886 
6887 	HN_LOCK_ASSERT(sc);
6888 
6889 	/*
6890 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6891 	 * through hn_mgmt_taskq.
6892 	 */
6893 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6894 	vmbus_chan_run_task(sc->hn_prichan, &task);
6895 
6896 	/*
6897 	 * Make sure that all pending management tasks are completed.
6898 	 */
6899 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6900 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6901 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6902 }
6903 
6904 static void
6905 hn_suspend(struct hn_softc *sc)
6906 {
6907 
6908 	/* Disable polling. */
6909 	hn_polling(sc, 0);
6910 
6911 	/*
6912 	 * If the non-transparent mode VF is activated, the synthetic
6913 	 * device is receiving packets, so the data path of the
6914 	 * synthetic device must be suspended.
6915 	 */
6916 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6917 	    (sc->hn_flags & HN_FLAG_RXVF))
6918 		hn_suspend_data(sc);
6919 	hn_suspend_mgmt(sc);
6920 }
6921 
6922 static void
6923 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6924 {
6925 	int i;
6926 
6927 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6928 	    ("invalid TX ring count %d", tx_ring_cnt));
6929 
6930 	for (i = 0; i < tx_ring_cnt; ++i) {
6931 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6932 
6933 		mtx_lock(&txr->hn_tx_lock);
6934 		txr->hn_suspended = 0;
6935 		mtx_unlock(&txr->hn_tx_lock);
6936 	}
6937 }
6938 
6939 static void
6940 hn_resume_data(struct hn_softc *sc)
6941 {
6942 	int i;
6943 
6944 	HN_LOCK_ASSERT(sc);
6945 
6946 	/*
6947 	 * Re-enable RX.
6948 	 */
6949 	hn_rxfilter_config(sc);
6950 
6951 	/*
6952 	 * Make sure to clear suspend status on "all" TX rings,
6953 	 * since hn_tx_ring_inuse can be changed after
6954 	 * hn_suspend_data().
6955 	 */
6956 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6957 
6958 #ifdef HN_IFSTART_SUPPORT
6959 	if (!hn_use_if_start)
6960 #endif
6961 	{
6962 		/*
6963 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6964 		 * reduced.
6965 		 */
6966 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6967 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6968 	}
6969 
6970 	/*
6971 	 * Kick start TX.
6972 	 */
6973 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6974 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6975 
6976 		/*
6977 		 * Use txeof task, so that any pending oactive can be
6978 		 * cleared properly.
6979 		 */
6980 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6981 	}
6982 }
6983 
6984 static void
6985 hn_resume_mgmt(struct hn_softc *sc)
6986 {
6987 
6988 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6989 
6990 	/*
6991 	 * Kick off network change detection, if it was pending.
6992 	 * If no network change was pending, start link status
6993 	 * checks, which is more lightweight than network change
6994 	 * detection.
6995 	 */
6996 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6997 		hn_change_network(sc);
6998 	else
6999 		hn_update_link_status(sc);
7000 }
7001 
7002 static void
7003 hn_resume(struct hn_softc *sc)
7004 {
7005 
7006 	/*
7007 	 * If the non-transparent mode VF is activated, the synthetic
7008 	 * device have to receive packets, so the data path of the
7009 	 * synthetic device must be resumed.
7010 	 */
7011 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
7012 	    (sc->hn_flags & HN_FLAG_RXVF))
7013 		hn_resume_data(sc);
7014 
7015 	/*
7016 	 * Don't resume link status change if VF is attached/activated.
7017 	 * - In the non-transparent VF mode, the synthetic device marks
7018 	 *   link down until the VF is deactivated; i.e. VF is down.
7019 	 * - In transparent VF mode, VF's media status is used until
7020 	 *   the VF is detached.
7021 	 */
7022 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7023 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7024 		hn_resume_mgmt(sc);
7025 
7026 	/*
7027 	 * Re-enable polling if this interface is running and
7028 	 * the polling is requested.
7029 	 */
7030 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7031 		hn_polling(sc, sc->hn_pollhz);
7032 }
7033 
7034 static void
7035 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7036 {
7037 	const struct rndis_status_msg *msg;
7038 	int ofs;
7039 
7040 	if (dlen < sizeof(*msg)) {
7041 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7042 		return;
7043 	}
7044 	msg = data;
7045 
7046 	switch (msg->rm_status) {
7047 	case RNDIS_STATUS_MEDIA_CONNECT:
7048 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7049 		hn_update_link_status(sc);
7050 		break;
7051 
7052 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7053 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7054 		/* Not really useful; ignore. */
7055 		break;
7056 
7057 	case RNDIS_STATUS_NETWORK_CHANGE:
7058 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7059 		if (dlen < ofs + msg->rm_stbuflen ||
7060 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7061 			if_printf(sc->hn_ifp, "network changed\n");
7062 		} else {
7063 			uint32_t change;
7064 
7065 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7066 			    sizeof(change));
7067 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7068 			    change);
7069 		}
7070 		hn_change_network(sc);
7071 		break;
7072 
7073 	default:
7074 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7075 		    msg->rm_status);
7076 		break;
7077 	}
7078 }
7079 
7080 static int
7081 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7082 {
7083 	const struct rndis_pktinfo *pi = info_data;
7084 	uint32_t mask = 0;
7085 
7086 	while (info_dlen != 0) {
7087 		const void *data;
7088 		uint32_t dlen;
7089 
7090 		if (__predict_false(info_dlen < sizeof(*pi)))
7091 			return (EINVAL);
7092 		if (__predict_false(info_dlen < pi->rm_size))
7093 			return (EINVAL);
7094 		info_dlen -= pi->rm_size;
7095 
7096 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7097 			return (EINVAL);
7098 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7099 			return (EINVAL);
7100 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7101 		data = pi->rm_data;
7102 
7103 		if (pi->rm_internal == 1) {
7104 			switch (pi->rm_type) {
7105 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7106 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7107 					return (EINVAL);
7108 				info->pktinfo_id =
7109 				    (const struct packet_info_id *)data;
7110 				mask |= HN_RXINFO_PKTINFO_ID;
7111 				break;
7112 
7113 			default:
7114 				goto next;
7115 			}
7116 		} else {
7117 			switch (pi->rm_type) {
7118 			case NDIS_PKTINFO_TYPE_VLAN:
7119 				if (__predict_false(dlen
7120 				    < NDIS_VLAN_INFO_SIZE))
7121 					return (EINVAL);
7122 				info->vlan_info = (const uint32_t *)data;
7123 				mask |= HN_RXINFO_VLAN;
7124 				break;
7125 
7126 			case NDIS_PKTINFO_TYPE_CSUM:
7127 				if (__predict_false(dlen
7128 				    < NDIS_RXCSUM_INFO_SIZE))
7129 					return (EINVAL);
7130 				info->csum_info = (const uint32_t *)data;
7131 				mask |= HN_RXINFO_CSUM;
7132 				break;
7133 
7134 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7135 				if (__predict_false(dlen
7136 				    < HN_NDIS_HASH_VALUE_SIZE))
7137 					return (EINVAL);
7138 				info->hash_value = (const uint32_t *)data;
7139 				mask |= HN_RXINFO_HASHVAL;
7140 				break;
7141 
7142 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7143 				if (__predict_false(dlen
7144 				    < HN_NDIS_HASH_INFO_SIZE))
7145 					return (EINVAL);
7146 				info->hash_info = (const uint32_t *)data;
7147 				mask |= HN_RXINFO_HASHINF;
7148 				break;
7149 
7150 			default:
7151 				goto next;
7152 			}
7153 		}
7154 
7155 		if (mask == HN_RXINFO_ALL) {
7156 			/* All found; done */
7157 			break;
7158 		}
7159 next:
7160 		pi = (const struct rndis_pktinfo *)
7161 		    ((const uint8_t *)pi + pi->rm_size);
7162 	}
7163 
7164 	/*
7165 	 * Final fixup.
7166 	 * - If there is no hash value, invalidate the hash info.
7167 	 */
7168 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7169 		info->hash_info = NULL;
7170 	return (0);
7171 }
7172 
7173 static __inline bool
7174 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7175 {
7176 
7177 	if (off < check_off) {
7178 		if (__predict_true(off + len <= check_off))
7179 			return (false);
7180 	} else if (off > check_off) {
7181 		if (__predict_true(check_off + check_len <= off))
7182 			return (false);
7183 	}
7184 	return (true);
7185 }
7186 
7187 static __inline void
7188 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7189 		uint32_t len, struct hn_rxinfo *info)
7190 {
7191 	uint32_t cnt = rxr->rsc.cnt;
7192 
7193 	if (cnt) {
7194 		rxr->rsc.pktlen += len;
7195 	} else {
7196 		rxr->rsc.vlan_info = info->vlan_info;
7197 		rxr->rsc.csum_info = info->csum_info;
7198 		rxr->rsc.hash_info = info->hash_info;
7199 		rxr->rsc.hash_value = info->hash_value;
7200 		rxr->rsc.pktlen = len;
7201 	}
7202 
7203 	rxr->rsc.frag_data[cnt] = data;
7204 	rxr->rsc.frag_len[cnt] = len;
7205 	rxr->rsc.cnt++;
7206 }
7207 
7208 static void
7209 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7210 {
7211 	const struct rndis_packet_msg *pkt;
7212 	struct hn_rxinfo info;
7213 	int data_off, pktinfo_off, data_len, pktinfo_len;
7214 	bool rsc_more= false;
7215 
7216 	/*
7217 	 * Check length.
7218 	 */
7219 	if (__predict_false(dlen < sizeof(*pkt))) {
7220 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7221 		return;
7222 	}
7223 	pkt = data;
7224 
7225 	if (__predict_false(dlen < pkt->rm_len)) {
7226 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7227 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7228 		return;
7229 	}
7230 	if (__predict_false(pkt->rm_len <
7231 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7232 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7233 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7234 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7235 		    pkt->rm_pktinfolen);
7236 		return;
7237 	}
7238 	if (__predict_false(pkt->rm_datalen == 0)) {
7239 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7240 		return;
7241 	}
7242 
7243 	/*
7244 	 * Check offests.
7245 	 */
7246 #define IS_OFFSET_INVALID(ofs)			\
7247 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7248 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7249 
7250 	/* XXX Hyper-V does not meet data offset alignment requirement */
7251 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7252 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7253 		    "data offset %u\n", pkt->rm_dataoffset);
7254 		return;
7255 	}
7256 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7257 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7258 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7259 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7260 		return;
7261 	}
7262 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7263 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7264 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7265 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7266 		return;
7267 	}
7268 
7269 #undef IS_OFFSET_INVALID
7270 
7271 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7272 	data_len = pkt->rm_datalen;
7273 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7274 	pktinfo_len = pkt->rm_pktinfolen;
7275 
7276 	/*
7277 	 * Check OOB coverage.
7278 	 */
7279 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7280 		int oob_off, oob_len;
7281 
7282 		if_printf(rxr->hn_ifp, "got oobdata\n");
7283 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7284 		oob_len = pkt->rm_oobdatalen;
7285 
7286 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7287 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7288 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7289 			    pkt->rm_len, oob_off, oob_len);
7290 			return;
7291 		}
7292 
7293 		/*
7294 		 * Check against data.
7295 		 */
7296 		if (hn_rndis_check_overlap(oob_off, oob_len,
7297 		    data_off, data_len)) {
7298 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7299 			    "oob overlaps data, oob abs %d len %d, "
7300 			    "data abs %d len %d\n",
7301 			    oob_off, oob_len, data_off, data_len);
7302 			return;
7303 		}
7304 
7305 		/*
7306 		 * Check against pktinfo.
7307 		 */
7308 		if (pktinfo_len != 0 &&
7309 		    hn_rndis_check_overlap(oob_off, oob_len,
7310 		    pktinfo_off, pktinfo_len)) {
7311 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7312 			    "oob overlaps pktinfo, oob abs %d len %d, "
7313 			    "pktinfo abs %d len %d\n",
7314 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7315 			return;
7316 		}
7317 	}
7318 
7319 	/*
7320 	 * Check per-packet-info coverage and find useful per-packet-info.
7321 	 */
7322 	info.vlan_info = NULL;
7323 	info.csum_info = NULL;
7324 	info.hash_info = NULL;
7325 	info.pktinfo_id = NULL;
7326 
7327 	if (__predict_true(pktinfo_len != 0)) {
7328 		bool overlap;
7329 		int error;
7330 
7331 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7332 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7333 			    "pktinfo overflow, msglen %u, "
7334 			    "pktinfo abs %d len %d\n",
7335 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7336 			return;
7337 		}
7338 
7339 		/*
7340 		 * Check packet info coverage.
7341 		 */
7342 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7343 		    data_off, data_len);
7344 		if (__predict_false(overlap)) {
7345 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7346 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7347 			    "data abs %d len %d\n",
7348 			    pktinfo_off, pktinfo_len, data_off, data_len);
7349 			return;
7350 		}
7351 
7352 		/*
7353 		 * Find useful per-packet-info.
7354 		 */
7355 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7356 		    pktinfo_len, &info);
7357 		if (__predict_false(error)) {
7358 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7359 			    "pktinfo\n");
7360 			return;
7361 		}
7362 	}
7363 
7364 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7365 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7366 		    "data overflow, msglen %u, data abs %d len %d\n",
7367 		    pkt->rm_len, data_off, data_len);
7368 		return;
7369 	}
7370 
7371 	/* Identify RSC fragments, drop invalid packets */
7372 	if ((info.pktinfo_id != NULL) &&
7373 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7374 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7375 			rxr->rsc.cnt = 0;
7376 			rxr->hn_rsc_pkts++;
7377 		} else if (rxr->rsc.cnt == 0)
7378 			goto drop;
7379 
7380 		rsc_more = true;
7381 
7382 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7383 			rsc_more = false;
7384 
7385 		if (rsc_more && rxr->rsc.is_last)
7386 			goto drop;
7387 	} else {
7388 		rxr->rsc.cnt = 0;
7389 	}
7390 
7391 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7392 		goto drop;
7393 
7394 	/* Store data in per rx ring structure */
7395 	hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off,
7396 	    data_len, &info);
7397 
7398 	if (rsc_more)
7399 		return;
7400 
7401 	hn_rxpkt(rxr);
7402 	rxr->rsc.cnt = 0;
7403 	return;
7404 drop:
7405 	rxr->hn_rsc_drop++;
7406 	return;
7407 }
7408 
7409 static __inline void
7410 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7411 {
7412 	const struct rndis_msghdr *hdr;
7413 
7414 	if (__predict_false(dlen < sizeof(*hdr))) {
7415 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7416 		return;
7417 	}
7418 	hdr = data;
7419 
7420 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7421 		/* Hot data path. */
7422 		hn_rndis_rx_data(rxr, data, dlen);
7423 		/* Done! */
7424 		return;
7425 	}
7426 
7427 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7428 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7429 	else
7430 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7431 }
7432 
7433 static void
7434 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7435 {
7436 	const struct hn_nvs_hdr *hdr;
7437 
7438 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7439 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7440 		return;
7441 	}
7442 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7443 
7444 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7445 		/* Useless; ignore */
7446 		return;
7447 	}
7448 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7449 }
7450 
7451 static void
7452 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7453     const struct vmbus_chanpkt_hdr *pkt)
7454 {
7455 	struct hn_nvs_sendctx *sndc;
7456 
7457 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7458 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7459 	    VMBUS_CHANPKT_DATALEN(pkt));
7460 	/*
7461 	 * NOTE:
7462 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7463 	 * its callback.
7464 	 */
7465 }
7466 
7467 static void
7468 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7469     const struct vmbus_chanpkt_hdr *pkthdr)
7470 {
7471 	struct epoch_tracker et;
7472 	const struct vmbus_chanpkt_rxbuf *pkt;
7473 	const struct hn_nvs_hdr *nvs_hdr;
7474 	int count, i, hlen;
7475 
7476 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7477 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7478 		return;
7479 	}
7480 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7481 
7482 	/* Make sure that this is a RNDIS message. */
7483 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7484 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7485 		    nvs_hdr->nvs_type);
7486 		return;
7487 	}
7488 
7489 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7490 	if (__predict_false(hlen < sizeof(*pkt))) {
7491 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7492 		return;
7493 	}
7494 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7495 
7496 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7497 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7498 		    pkt->cp_rxbuf_id);
7499 		return;
7500 	}
7501 
7502 	count = pkt->cp_rxbuf_cnt;
7503 	if (__predict_false(hlen <
7504 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7505 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7506 		return;
7507 	}
7508 
7509 	NET_EPOCH_ENTER(et);
7510 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7511 	for (i = 0; i < count; ++i) {
7512 		int ofs, len;
7513 
7514 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7515 		len = pkt->cp_rxbuf[i].rb_len;
7516 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7517 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7518 			    "ofs %d, len %d\n", i, ofs, len);
7519 			continue;
7520 		}
7521 
7522 		rxr->rsc.is_last = (i == (count - 1));
7523 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7524 	}
7525 	NET_EPOCH_EXIT(et);
7526 
7527 	/*
7528 	 * Ack the consumed RXBUF associated w/ this channel packet,
7529 	 * so that this RXBUF can be recycled by the hypervisor.
7530 	 */
7531 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7532 }
7533 
7534 static void
7535 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7536     uint64_t tid)
7537 {
7538 	struct hn_nvs_rndis_ack ack;
7539 	int retries, error;
7540 
7541 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7542 	ack.nvs_status = HN_NVS_STATUS_OK;
7543 
7544 	retries = 0;
7545 again:
7546 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7547 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7548 	if (__predict_false(error == EAGAIN)) {
7549 		/*
7550 		 * NOTE:
7551 		 * This should _not_ happen in real world, since the
7552 		 * consumption of the TX bufring from the TX path is
7553 		 * controlled.
7554 		 */
7555 		if (rxr->hn_ack_failed == 0)
7556 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7557 		rxr->hn_ack_failed++;
7558 		retries++;
7559 		if (retries < 10) {
7560 			DELAY(100);
7561 			goto again;
7562 		}
7563 		/* RXBUF leaks! */
7564 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7565 	}
7566 }
7567 
7568 static void
7569 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7570 {
7571 	struct hn_rx_ring *rxr = xrxr;
7572 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7573 
7574 	for (;;) {
7575 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7576 		int error, pktlen;
7577 
7578 		pktlen = rxr->hn_pktbuf_len;
7579 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7580 		if (__predict_false(error == ENOBUFS)) {
7581 			void *nbuf;
7582 			int nlen;
7583 
7584 			/*
7585 			 * Expand channel packet buffer.
7586 			 *
7587 			 * XXX
7588 			 * Use M_WAITOK here, since allocation failure
7589 			 * is fatal.
7590 			 */
7591 			nlen = rxr->hn_pktbuf_len * 2;
7592 			while (nlen < pktlen)
7593 				nlen *= 2;
7594 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7595 
7596 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7597 			    rxr->hn_pktbuf_len, nlen);
7598 
7599 			free(rxr->hn_pktbuf, M_DEVBUF);
7600 			rxr->hn_pktbuf = nbuf;
7601 			rxr->hn_pktbuf_len = nlen;
7602 			/* Retry! */
7603 			continue;
7604 		} else if (__predict_false(error == EAGAIN)) {
7605 			/* No more channel packets; done! */
7606 			break;
7607 		}
7608 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7609 
7610 		switch (pkt->cph_type) {
7611 		case VMBUS_CHANPKT_TYPE_COMP:
7612 			hn_nvs_handle_comp(sc, chan, pkt);
7613 			break;
7614 
7615 		case VMBUS_CHANPKT_TYPE_RXBUF:
7616 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7617 			break;
7618 
7619 		case VMBUS_CHANPKT_TYPE_INBAND:
7620 			hn_nvs_handle_notify(sc, pkt);
7621 			break;
7622 
7623 		default:
7624 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7625 			    pkt->cph_type);
7626 			break;
7627 		}
7628 	}
7629 	hn_chan_rollup(rxr, rxr->hn_txr);
7630 }
7631 
7632 static void
7633 hn_sysinit(void *arg __unused)
7634 {
7635 	int i;
7636 
7637 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7638 
7639 #ifdef HN_IFSTART_SUPPORT
7640 	/*
7641 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7642 	 * mainly due to the IFF_DRV_OACTIVE flag.
7643 	 */
7644 	if (hn_xpnt_vf && hn_use_if_start) {
7645 		hn_use_if_start = 0;
7646 		printf("hn: tranparent VF mode, if_transmit will be used, "
7647 		    "instead of if_start\n");
7648 	}
7649 #endif
7650 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7651 		printf("hn: invalid transparent VF attach routing "
7652 		    "wait timeout %d, reset to %d\n",
7653 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7654 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7655 	}
7656 
7657 	/*
7658 	 * Initialize VF map.
7659 	 */
7660 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7661 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7662 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7663 	    M_WAITOK | M_ZERO);
7664 
7665 	/*
7666 	 * Fix the # of TX taskqueues.
7667 	 */
7668 	if (hn_tx_taskq_cnt <= 0)
7669 		hn_tx_taskq_cnt = 1;
7670 	else if (hn_tx_taskq_cnt > mp_ncpus)
7671 		hn_tx_taskq_cnt = mp_ncpus;
7672 
7673 	/*
7674 	 * Fix the TX taskqueue mode.
7675 	 */
7676 	switch (hn_tx_taskq_mode) {
7677 	case HN_TX_TASKQ_M_INDEP:
7678 	case HN_TX_TASKQ_M_GLOBAL:
7679 	case HN_TX_TASKQ_M_EVTTQ:
7680 		break;
7681 	default:
7682 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7683 		break;
7684 	}
7685 
7686 	if (vm_guest != VM_GUEST_HV)
7687 		return;
7688 
7689 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7690 		return;
7691 
7692 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7693 	    M_DEVBUF, M_WAITOK);
7694 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7695 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7696 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7697 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7698 		    "hn tx%d", i);
7699 	}
7700 }
7701 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7702 
7703 static void
7704 hn_sysuninit(void *arg __unused)
7705 {
7706 
7707 	if (hn_tx_taskque != NULL) {
7708 		int i;
7709 
7710 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7711 			taskqueue_free(hn_tx_taskque[i]);
7712 		free(hn_tx_taskque, M_DEVBUF);
7713 	}
7714 
7715 	if (hn_vfmap != NULL)
7716 		free(hn_vfmap, M_DEVBUF);
7717 	rm_destroy(&hn_vfmap_lock);
7718 
7719 	counter_u64_free(hn_udpcs_fixup);
7720 }
7721 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7722