xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 9dc417c32b934bc50e9a225cda9614ac877103a3)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/rmlock.h>
73 #include <sys/sbuf.h>
74 #include <sys/smp.h>
75 #include <sys/socket.h>
76 #include <sys/sockio.h>
77 #include <sys/sx.h>
78 #include <sys/sysctl.h>
79 #include <sys/systm.h>
80 #include <sys/taskqueue.h>
81 #include <sys/buf_ring.h>
82 #include <sys/eventhandler.h>
83 
84 #include <machine/atomic.h>
85 #include <machine/in_cksum.h>
86 
87 #include <net/bpf.h>
88 #include <net/ethernet.h>
89 #include <net/if.h>
90 #include <net/if_dl.h>
91 #include <net/if_media.h>
92 #include <net/if_types.h>
93 #include <net/if_var.h>
94 #include <net/rndis.h>
95 #ifdef RSS
96 #include <net/rss_config.h>
97 #endif
98 
99 #include <netinet/in_systm.h>
100 #include <netinet/in.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_lro.h>
105 #include <netinet/udp.h>
106 
107 #include <dev/hyperv/include/hyperv.h>
108 #include <dev/hyperv/include/hyperv_busdma.h>
109 #include <dev/hyperv/include/vmbus.h>
110 #include <dev/hyperv/include/vmbus_xact.h>
111 
112 #include <dev/hyperv/netvsc/ndis.h>
113 #include <dev/hyperv/netvsc/if_hnreg.h>
114 #include <dev/hyperv/netvsc/if_hnvar.h>
115 #include <dev/hyperv/netvsc/hn_nvs.h>
116 #include <dev/hyperv/netvsc/hn_rndis.h>
117 
118 #include "vmbus_if.h"
119 
120 #define HN_IFSTART_SUPPORT
121 
122 #define HN_RING_CNT_DEF_MAX		8
123 
124 #define HN_VFMAP_SIZE_DEF		8
125 
126 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
127 
128 /* YYY should get it from the underlying channel */
129 #define HN_TX_DESC_CNT			512
130 
131 #define HN_RNDIS_PKT_LEN					\
132 	(sizeof(struct rndis_packet_msg) +			\
133 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
134 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
135 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
136 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
137 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
138 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
139 
140 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
141 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
142 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
143 /* -1 for RNDIS packet message */
144 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
145 
146 #define HN_DIRECT_TX_SIZE_DEF		128
147 
148 #define HN_EARLY_TXEOF_THRESH		8
149 
150 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
151 
152 #define HN_LROENT_CNT_DEF		128
153 
154 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
155 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
156 /* YYY 2*MTU is a bit rough, but should be good enough. */
157 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
158 
159 #define HN_LRO_ACKCNT_DEF		1
160 
161 #define HN_LOCK_INIT(sc)		\
162 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
163 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
164 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
165 #define HN_LOCK(sc)					\
166 do {							\
167 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
168 		DELAY(1000);				\
169 } while (0)
170 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
171 
172 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
173 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
174 #define HN_CSUM_IP_HWASSIST(sc)		\
175 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
176 #define HN_CSUM_IP6_HWASSIST(sc)	\
177 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
178 
179 #define HN_PKTSIZE_MIN(align)		\
180 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
181 	    HN_RNDIS_PKT_LEN, (align))
182 #define HN_PKTSIZE(m, align)		\
183 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
184 
185 #ifdef RSS
186 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
187 #else
188 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
189 #endif
190 
191 struct hn_txdesc {
192 #ifndef HN_USE_TXDESC_BUFRING
193 	SLIST_ENTRY(hn_txdesc)		link;
194 #endif
195 	STAILQ_ENTRY(hn_txdesc)		agg_link;
196 
197 	/* Aggregated txdescs, in sending order. */
198 	STAILQ_HEAD(, hn_txdesc)	agg_list;
199 
200 	/* The oldest packet, if transmission aggregation happens. */
201 	struct mbuf			*m;
202 	struct hn_tx_ring		*txr;
203 	int				refs;
204 	uint32_t			flags;	/* HN_TXD_FLAG_ */
205 	struct hn_nvs_sendctx		send_ctx;
206 	uint32_t			chim_index;
207 	int				chim_size;
208 
209 	bus_dmamap_t			data_dmap;
210 
211 	bus_addr_t			rndis_pkt_paddr;
212 	struct rndis_packet_msg		*rndis_pkt;
213 	bus_dmamap_t			rndis_pkt_dmap;
214 };
215 
216 #define HN_TXD_FLAG_ONLIST		0x0001
217 #define HN_TXD_FLAG_DMAMAP		0x0002
218 #define HN_TXD_FLAG_ONAGG		0x0004
219 
220 struct hn_rxinfo {
221 	uint32_t			vlan_info;
222 	uint32_t			csum_info;
223 	uint32_t			hash_info;
224 	uint32_t			hash_value;
225 };
226 
227 struct hn_rxvf_setarg {
228 	struct hn_rx_ring	*rxr;
229 	struct ifnet		*vf_ifp;
230 };
231 
232 #define HN_RXINFO_VLAN			0x0001
233 #define HN_RXINFO_CSUM			0x0002
234 #define HN_RXINFO_HASHINF		0x0004
235 #define HN_RXINFO_HASHVAL		0x0008
236 #define HN_RXINFO_ALL			\
237 	(HN_RXINFO_VLAN |		\
238 	 HN_RXINFO_CSUM |		\
239 	 HN_RXINFO_HASHINF |		\
240 	 HN_RXINFO_HASHVAL)
241 
242 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
243 #define HN_NDIS_RXCSUM_INFO_INVALID	0
244 #define HN_NDIS_HASH_INFO_INVALID	0
245 
246 static int			hn_probe(device_t);
247 static int			hn_attach(device_t);
248 static int			hn_detach(device_t);
249 static int			hn_shutdown(device_t);
250 static void			hn_chan_callback(struct vmbus_channel *,
251 				    void *);
252 
253 static void			hn_init(void *);
254 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
255 #ifdef HN_IFSTART_SUPPORT
256 static void			hn_start(struct ifnet *);
257 #endif
258 static int			hn_transmit(struct ifnet *, struct mbuf *);
259 static void			hn_xmit_qflush(struct ifnet *);
260 static int			hn_ifmedia_upd(struct ifnet *);
261 static void			hn_ifmedia_sts(struct ifnet *,
262 				    struct ifmediareq *);
263 
264 static void			hn_ifnet_event(void *, struct ifnet *, int);
265 static void			hn_ifaddr_event(void *, struct ifnet *);
266 static void			hn_ifnet_attevent(void *, struct ifnet *);
267 static void			hn_ifnet_detevent(void *, struct ifnet *);
268 static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
269 
270 static bool			hn_ismyvf(const struct hn_softc *,
271 				    const struct ifnet *);
272 static void			hn_rxvf_change(struct hn_softc *,
273 				    struct ifnet *, bool);
274 static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
275 static void			hn_rxvf_set_task(void *, int);
276 static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
277 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
278 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
279 				    struct ifreq *);
280 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
281 static bool			hn_xpnt_vf_isready(struct hn_softc *);
282 static void			hn_xpnt_vf_setready(struct hn_softc *);
283 static void			hn_xpnt_vf_init_taskfunc(void *, int);
284 static void			hn_xpnt_vf_init(struct hn_softc *);
285 
286 static int			hn_rndis_rxinfo(const void *, int,
287 				    struct hn_rxinfo *);
288 static void			hn_rndis_rx_data(struct hn_rx_ring *,
289 				    const void *, int);
290 static void			hn_rndis_rx_status(struct hn_softc *,
291 				    const void *, int);
292 static void			hn_rndis_init_fixat(struct hn_softc *, int);
293 
294 static void			hn_nvs_handle_notify(struct hn_softc *,
295 				    const struct vmbus_chanpkt_hdr *);
296 static void			hn_nvs_handle_comp(struct hn_softc *,
297 				    struct vmbus_channel *,
298 				    const struct vmbus_chanpkt_hdr *);
299 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
300 				    struct vmbus_channel *,
301 				    const struct vmbus_chanpkt_hdr *);
302 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
303 				    struct vmbus_channel *, uint64_t);
304 
305 #if __FreeBSD_version >= 1100099
306 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
307 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
308 #endif
309 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
310 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
311 #if __FreeBSD_version < 1100095
312 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
313 #else
314 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
315 #endif
316 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
317 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
318 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
319 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
320 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
321 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
322 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
323 #ifndef RSS
324 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
325 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
326 #endif
327 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
328 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
329 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
330 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
339 
340 static void			hn_stop(struct hn_softc *, bool);
341 static void			hn_init_locked(struct hn_softc *);
342 static int			hn_chan_attach(struct hn_softc *,
343 				    struct vmbus_channel *);
344 static void			hn_chan_detach(struct hn_softc *,
345 				    struct vmbus_channel *);
346 static int			hn_attach_subchans(struct hn_softc *);
347 static void			hn_detach_allchans(struct hn_softc *);
348 static void			hn_chan_rollup(struct hn_rx_ring *,
349 				    struct hn_tx_ring *);
350 static void			hn_set_ring_inuse(struct hn_softc *, int);
351 static int			hn_synth_attach(struct hn_softc *, int);
352 static void			hn_synth_detach(struct hn_softc *);
353 static int			hn_synth_alloc_subchans(struct hn_softc *,
354 				    int *);
355 static bool			hn_synth_attachable(const struct hn_softc *);
356 static void			hn_suspend(struct hn_softc *);
357 static void			hn_suspend_data(struct hn_softc *);
358 static void			hn_suspend_mgmt(struct hn_softc *);
359 static void			hn_resume(struct hn_softc *);
360 static void			hn_resume_data(struct hn_softc *);
361 static void			hn_resume_mgmt(struct hn_softc *);
362 static void			hn_suspend_mgmt_taskfunc(void *, int);
363 static void			hn_chan_drain(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static void			hn_disable_rx(struct hn_softc *);
366 static void			hn_drain_rxtx(struct hn_softc *, int);
367 static void			hn_polling(struct hn_softc *, u_int);
368 static void			hn_chan_polling(struct vmbus_channel *, u_int);
369 static void			hn_mtu_change_fixup(struct hn_softc *);
370 
371 static void			hn_update_link_status(struct hn_softc *);
372 static void			hn_change_network(struct hn_softc *);
373 static void			hn_link_taskfunc(void *, int);
374 static void			hn_netchg_init_taskfunc(void *, int);
375 static void			hn_netchg_status_taskfunc(void *, int);
376 static void			hn_link_status(struct hn_softc *);
377 
378 static int			hn_create_rx_data(struct hn_softc *, int);
379 static void			hn_destroy_rx_data(struct hn_softc *);
380 static int			hn_check_iplen(const struct mbuf *, int);
381 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
382 static int			hn_rxfilter_config(struct hn_softc *);
383 #ifndef RSS
384 static int			hn_rss_reconfig(struct hn_softc *);
385 #endif
386 static void			hn_rss_ind_fixup(struct hn_softc *);
387 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
388 				    int, const struct hn_rxinfo *);
389 
390 static int			hn_tx_ring_create(struct hn_softc *, int);
391 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
392 static int			hn_create_tx_data(struct hn_softc *, int);
393 static void			hn_fixup_tx_data(struct hn_softc *);
394 static void			hn_destroy_tx_data(struct hn_softc *);
395 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
396 static void			hn_txdesc_gc(struct hn_tx_ring *,
397 				    struct hn_txdesc *);
398 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
399 				    struct hn_txdesc *, struct mbuf **);
400 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
401 				    struct hn_txdesc *);
402 static void			hn_set_chim_size(struct hn_softc *, int);
403 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
404 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
405 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
406 static void			hn_resume_tx(struct hn_softc *, int);
407 static void			hn_set_txagg(struct hn_softc *);
408 static void			*hn_try_txagg(struct ifnet *,
409 				    struct hn_tx_ring *, struct hn_txdesc *,
410 				    int);
411 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
412 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
413 				    struct hn_softc *, struct vmbus_channel *,
414 				    const void *, int);
415 static int			hn_txpkt_sglist(struct hn_tx_ring *,
416 				    struct hn_txdesc *);
417 static int			hn_txpkt_chim(struct hn_tx_ring *,
418 				    struct hn_txdesc *);
419 static int			hn_xmit(struct hn_tx_ring *, int);
420 static void			hn_xmit_taskfunc(void *, int);
421 static void			hn_xmit_txeof(struct hn_tx_ring *);
422 static void			hn_xmit_txeof_taskfunc(void *, int);
423 #ifdef HN_IFSTART_SUPPORT
424 static int			hn_start_locked(struct hn_tx_ring *, int);
425 static void			hn_start_taskfunc(void *, int);
426 static void			hn_start_txeof(struct hn_tx_ring *);
427 static void			hn_start_txeof_taskfunc(void *, int);
428 #endif
429 
430 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
431     "Hyper-V network interface");
432 
433 /* Trust tcp segements verification on host side. */
434 static int			hn_trust_hosttcp = 1;
435 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
436     &hn_trust_hosttcp, 0,
437     "Trust tcp segement verification on host side, "
438     "when csum info is missing (global setting)");
439 
440 /* Trust udp datagrams verification on host side. */
441 static int			hn_trust_hostudp = 1;
442 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
443     &hn_trust_hostudp, 0,
444     "Trust udp datagram verification on host side, "
445     "when csum info is missing (global setting)");
446 
447 /* Trust ip packets verification on host side. */
448 static int			hn_trust_hostip = 1;
449 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
450     &hn_trust_hostip, 0,
451     "Trust ip packet verification on host side, "
452     "when csum info is missing (global setting)");
453 
454 /* Limit TSO burst size */
455 static int			hn_tso_maxlen = IP_MAXPACKET;
456 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
457     &hn_tso_maxlen, 0, "TSO burst limit");
458 
459 /* Limit chimney send size */
460 static int			hn_tx_chimney_size = 0;
461 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
462     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
463 
464 /* Limit the size of packet for direct transmission */
465 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
466 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
467     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
468 
469 /* # of LRO entries per RX ring */
470 #if defined(INET) || defined(INET6)
471 #if __FreeBSD_version >= 1100095
472 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
473 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
474     &hn_lro_entry_count, 0, "LRO entry count");
475 #endif
476 #endif
477 
478 static int			hn_tx_taskq_cnt = 1;
479 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
480     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
481 
482 #define HN_TX_TASKQ_M_INDEP	0
483 #define HN_TX_TASKQ_M_GLOBAL	1
484 #define HN_TX_TASKQ_M_EVTTQ	2
485 
486 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
487 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
488     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
489     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
490 
491 #ifndef HN_USE_TXDESC_BUFRING
492 static int			hn_use_txdesc_bufring = 0;
493 #else
494 static int			hn_use_txdesc_bufring = 1;
495 #endif
496 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
497     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
498 
499 #ifdef HN_IFSTART_SUPPORT
500 /* Use ifnet.if_start instead of ifnet.if_transmit */
501 static int			hn_use_if_start = 0;
502 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
503     &hn_use_if_start, 0, "Use if_start TX method");
504 #endif
505 
506 /* # of channels to use */
507 static int			hn_chan_cnt = 0;
508 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
509     &hn_chan_cnt, 0,
510     "# of channels to use; each channel has one RX ring and one TX ring");
511 
512 /* # of transmit rings to use */
513 static int			hn_tx_ring_cnt = 0;
514 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
515     &hn_tx_ring_cnt, 0, "# of TX rings to use");
516 
517 /* Software TX ring deptch */
518 static int			hn_tx_swq_depth = 0;
519 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
520     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
521 
522 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
523 #if __FreeBSD_version >= 1100095
524 static u_int			hn_lro_mbufq_depth = 0;
525 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
526     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
527 #endif
528 
529 /* Packet transmission aggregation size limit */
530 static int			hn_tx_agg_size = -1;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
532     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
533 
534 /* Packet transmission aggregation count limit */
535 static int			hn_tx_agg_pkts = -1;
536 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
537     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
538 
539 /* VF list */
540 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
541     0, 0, hn_vflist_sysctl, "A", "VF list");
542 
543 /* VF mapping */
544 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
545     0, 0, hn_vfmap_sysctl, "A", "VF mapping");
546 
547 /* Transparent VF */
548 static int			hn_xpnt_vf = 0;
549 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
550     &hn_xpnt_vf, 0, "Transparent VF mod");
551 
552 /* Accurate BPF support for Transparent VF */
553 static int			hn_xpnt_vf_accbpf = 0;
554 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
555     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
556 
557 /* Extra wait for transparent VF attach routing; unit seconds. */
558 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
559 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
560     &hn_xpnt_vf_attwait, 0,
561     "Extra wait for transparent VF attach routing; unit: seconds");
562 
563 static u_int			hn_cpu_index;	/* next CPU for channel */
564 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
565 
566 static struct rmlock		hn_vfmap_lock;
567 static int			hn_vfmap_size;
568 static struct ifnet		**hn_vfmap;
569 
570 #ifndef RSS
571 static const uint8_t
572 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
573 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
574 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
575 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
576 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
577 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
578 };
579 #endif	/* !RSS */
580 
581 static device_method_t hn_methods[] = {
582 	/* Device interface */
583 	DEVMETHOD(device_probe,		hn_probe),
584 	DEVMETHOD(device_attach,	hn_attach),
585 	DEVMETHOD(device_detach,	hn_detach),
586 	DEVMETHOD(device_shutdown,	hn_shutdown),
587 	DEVMETHOD_END
588 };
589 
590 static driver_t hn_driver = {
591 	"hn",
592 	hn_methods,
593 	sizeof(struct hn_softc)
594 };
595 
596 static devclass_t hn_devclass;
597 
598 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
599 MODULE_VERSION(hn, 1);
600 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
601 
602 #if __FreeBSD_version >= 1100099
603 static void
604 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
605 {
606 	int i;
607 
608 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
609 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
610 }
611 #endif
612 
613 static int
614 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
615 {
616 
617 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
618 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
619 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
620 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
621 }
622 
623 static int
624 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
625 {
626 	struct hn_nvs_rndis rndis;
627 
628 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
629 	    txd->chim_size > 0, ("invalid rndis chim txd"));
630 
631 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
632 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
633 	rndis.nvs_chim_idx = txd->chim_index;
634 	rndis.nvs_chim_sz = txd->chim_size;
635 
636 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
637 	    &rndis, sizeof(rndis), &txd->send_ctx));
638 }
639 
640 static __inline uint32_t
641 hn_chim_alloc(struct hn_softc *sc)
642 {
643 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
644 	u_long *bmap = sc->hn_chim_bmap;
645 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
646 
647 	for (i = 0; i < bmap_cnt; ++i) {
648 		int idx;
649 
650 		idx = ffsl(~bmap[i]);
651 		if (idx == 0)
652 			continue;
653 
654 		--idx; /* ffsl is 1-based */
655 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
656 		    ("invalid i %d and idx %d", i, idx));
657 
658 		if (atomic_testandset_long(&bmap[i], idx))
659 			continue;
660 
661 		ret = i * LONG_BIT + idx;
662 		break;
663 	}
664 	return (ret);
665 }
666 
667 static __inline void
668 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
669 {
670 	u_long mask;
671 	uint32_t idx;
672 
673 	idx = chim_idx / LONG_BIT;
674 	KASSERT(idx < sc->hn_chim_bmap_cnt,
675 	    ("invalid chimney index 0x%x", chim_idx));
676 
677 	mask = 1UL << (chim_idx % LONG_BIT);
678 	KASSERT(sc->hn_chim_bmap[idx] & mask,
679 	    ("index bitmap 0x%lx, chimney index %u, "
680 	     "bitmap idx %d, bitmask 0x%lx",
681 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
682 
683 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
684 }
685 
686 #if defined(INET6) || defined(INET)
687 
688 #define PULLUP_HDR(m, len)				\
689 do {							\
690 	if (__predict_false((m)->m_len < (len))) {	\
691 		(m) = m_pullup((m), (len));		\
692 		if ((m) == NULL)			\
693 			return (NULL);			\
694 	}						\
695 } while (0)
696 
697 /*
698  * NOTE: If this function failed, the m_head would be freed.
699  */
700 static __inline struct mbuf *
701 hn_tso_fixup(struct mbuf *m_head)
702 {
703 	struct ether_vlan_header *evl;
704 	struct tcphdr *th;
705 	int ehlen;
706 
707 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
708 
709 	PULLUP_HDR(m_head, sizeof(*evl));
710 	evl = mtod(m_head, struct ether_vlan_header *);
711 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
712 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
713 	else
714 		ehlen = ETHER_HDR_LEN;
715 
716 #ifdef INET
717 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
718 		struct ip *ip;
719 		int iphlen;
720 
721 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
722 		ip = mtodo(m_head, ehlen);
723 		iphlen = ip->ip_hl << 2;
724 
725 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
726 		th = mtodo(m_head, ehlen + iphlen);
727 
728 		ip->ip_len = 0;
729 		ip->ip_sum = 0;
730 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
731 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
732 	}
733 #endif
734 #if defined(INET6) && defined(INET)
735 	else
736 #endif
737 #ifdef INET6
738 	{
739 		struct ip6_hdr *ip6;
740 
741 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
742 		ip6 = mtodo(m_head, ehlen);
743 		if (ip6->ip6_nxt != IPPROTO_TCP) {
744 			m_freem(m_head);
745 			return (NULL);
746 		}
747 
748 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
749 		th = mtodo(m_head, ehlen + sizeof(*ip6));
750 
751 		ip6->ip6_plen = 0;
752 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
753 	}
754 #endif
755 	return (m_head);
756 
757 }
758 
759 /*
760  * NOTE: If this function failed, the m_head would be freed.
761  */
762 static __inline struct mbuf *
763 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
764 {
765 	const struct ether_vlan_header *evl;
766 	const struct tcphdr *th;
767 	int ehlen;
768 
769 	*tcpsyn = 0;
770 
771 	PULLUP_HDR(m_head, sizeof(*evl));
772 	evl = mtod(m_head, const struct ether_vlan_header *);
773 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
774 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
775 	else
776 		ehlen = ETHER_HDR_LEN;
777 
778 #ifdef INET
779 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
780 		const struct ip *ip;
781 		int iphlen;
782 
783 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
784 		ip = mtodo(m_head, ehlen);
785 		iphlen = ip->ip_hl << 2;
786 
787 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
788 		th = mtodo(m_head, ehlen + iphlen);
789 		if (th->th_flags & TH_SYN)
790 			*tcpsyn = 1;
791 	}
792 #endif
793 #if defined(INET6) && defined(INET)
794 	else
795 #endif
796 #ifdef INET6
797 	{
798 		const struct ip6_hdr *ip6;
799 
800 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
801 		ip6 = mtodo(m_head, ehlen);
802 		if (ip6->ip6_nxt != IPPROTO_TCP)
803 			return (m_head);
804 
805 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
806 		th = mtodo(m_head, ehlen + sizeof(*ip6));
807 		if (th->th_flags & TH_SYN)
808 			*tcpsyn = 1;
809 	}
810 #endif
811 	return (m_head);
812 }
813 
814 #undef PULLUP_HDR
815 
816 #endif	/* INET6 || INET */
817 
818 static int
819 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
820 {
821 	int error = 0;
822 
823 	HN_LOCK_ASSERT(sc);
824 
825 	if (sc->hn_rx_filter != filter) {
826 		error = hn_rndis_set_rxfilter(sc, filter);
827 		if (!error)
828 			sc->hn_rx_filter = filter;
829 	}
830 	return (error);
831 }
832 
833 static int
834 hn_rxfilter_config(struct hn_softc *sc)
835 {
836 	struct ifnet *ifp = sc->hn_ifp;
837 	uint32_t filter;
838 
839 	HN_LOCK_ASSERT(sc);
840 
841 	/*
842 	 * If the non-transparent mode VF is activated, we don't know how
843 	 * its RX filter is configured, so stick the synthetic device in
844 	 * the promiscous mode.
845 	 */
846 	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
847 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
848 	} else {
849 		filter = NDIS_PACKET_TYPE_DIRECTED;
850 		if (ifp->if_flags & IFF_BROADCAST)
851 			filter |= NDIS_PACKET_TYPE_BROADCAST;
852 		/* TODO: support multicast list */
853 		if ((ifp->if_flags & IFF_ALLMULTI) ||
854 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
855 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
856 	}
857 	return (hn_set_rxfilter(sc, filter));
858 }
859 
860 static void
861 hn_set_txagg(struct hn_softc *sc)
862 {
863 	uint32_t size, pkts;
864 	int i;
865 
866 	/*
867 	 * Setup aggregation size.
868 	 */
869 	if (sc->hn_agg_size < 0)
870 		size = UINT32_MAX;
871 	else
872 		size = sc->hn_agg_size;
873 
874 	if (sc->hn_rndis_agg_size < size)
875 		size = sc->hn_rndis_agg_size;
876 
877 	/* NOTE: We only aggregate packets using chimney sending buffers. */
878 	if (size > (uint32_t)sc->hn_chim_szmax)
879 		size = sc->hn_chim_szmax;
880 
881 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
882 		/* Disable */
883 		size = 0;
884 		pkts = 0;
885 		goto done;
886 	}
887 
888 	/* NOTE: Type of the per TX ring setting is 'int'. */
889 	if (size > INT_MAX)
890 		size = INT_MAX;
891 
892 	/*
893 	 * Setup aggregation packet count.
894 	 */
895 	if (sc->hn_agg_pkts < 0)
896 		pkts = UINT32_MAX;
897 	else
898 		pkts = sc->hn_agg_pkts;
899 
900 	if (sc->hn_rndis_agg_pkts < pkts)
901 		pkts = sc->hn_rndis_agg_pkts;
902 
903 	if (pkts <= 1) {
904 		/* Disable */
905 		size = 0;
906 		pkts = 0;
907 		goto done;
908 	}
909 
910 	/* NOTE: Type of the per TX ring setting is 'short'. */
911 	if (pkts > SHRT_MAX)
912 		pkts = SHRT_MAX;
913 
914 done:
915 	/* NOTE: Type of the per TX ring setting is 'short'. */
916 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
917 		/* Disable */
918 		size = 0;
919 		pkts = 0;
920 	}
921 
922 	if (bootverbose) {
923 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
924 		    size, pkts, sc->hn_rndis_agg_align);
925 	}
926 
927 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
928 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
929 
930 		mtx_lock(&txr->hn_tx_lock);
931 		txr->hn_agg_szmax = size;
932 		txr->hn_agg_pktmax = pkts;
933 		txr->hn_agg_align = sc->hn_rndis_agg_align;
934 		mtx_unlock(&txr->hn_tx_lock);
935 	}
936 }
937 
938 static int
939 hn_get_txswq_depth(const struct hn_tx_ring *txr)
940 {
941 
942 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
943 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
944 		return txr->hn_txdesc_cnt;
945 	return hn_tx_swq_depth;
946 }
947 
948 #ifndef RSS
949 static int
950 hn_rss_reconfig(struct hn_softc *sc)
951 {
952 	int error;
953 
954 	HN_LOCK_ASSERT(sc);
955 
956 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
957 		return (ENXIO);
958 
959 	/*
960 	 * Disable RSS first.
961 	 *
962 	 * NOTE:
963 	 * Direct reconfiguration by setting the UNCHG flags does
964 	 * _not_ work properly.
965 	 */
966 	if (bootverbose)
967 		if_printf(sc->hn_ifp, "disable RSS\n");
968 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
969 	if (error) {
970 		if_printf(sc->hn_ifp, "RSS disable failed\n");
971 		return (error);
972 	}
973 
974 	/*
975 	 * Reenable the RSS w/ the updated RSS key or indirect
976 	 * table.
977 	 */
978 	if (bootverbose)
979 		if_printf(sc->hn_ifp, "reconfig RSS\n");
980 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
981 	if (error) {
982 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
983 		return (error);
984 	}
985 	return (0);
986 }
987 #endif	/* !RSS */
988 
989 static void
990 hn_rss_ind_fixup(struct hn_softc *sc)
991 {
992 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
993 	int i, nchan;
994 
995 	nchan = sc->hn_rx_ring_inuse;
996 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
997 
998 	/*
999 	 * Check indirect table to make sure that all channels in it
1000 	 * can be used.
1001 	 */
1002 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1003 		if (rss->rss_ind[i] >= nchan) {
1004 			if_printf(sc->hn_ifp,
1005 			    "RSS indirect table %d fixup: %u -> %d\n",
1006 			    i, rss->rss_ind[i], nchan - 1);
1007 			rss->rss_ind[i] = nchan - 1;
1008 		}
1009 	}
1010 }
1011 
1012 static int
1013 hn_ifmedia_upd(struct ifnet *ifp __unused)
1014 {
1015 
1016 	return EOPNOTSUPP;
1017 }
1018 
1019 static void
1020 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1021 {
1022 	struct hn_softc *sc = ifp->if_softc;
1023 
1024 	ifmr->ifm_status = IFM_AVALID;
1025 	ifmr->ifm_active = IFM_ETHER;
1026 
1027 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1028 		ifmr->ifm_active |= IFM_NONE;
1029 		return;
1030 	}
1031 	ifmr->ifm_status |= IFM_ACTIVE;
1032 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1033 }
1034 
1035 static void
1036 hn_rxvf_set_task(void *xarg, int pending __unused)
1037 {
1038 	struct hn_rxvf_setarg *arg = xarg;
1039 
1040 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1041 }
1042 
1043 static void
1044 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1045 {
1046 	struct hn_rx_ring *rxr;
1047 	struct hn_rxvf_setarg arg;
1048 	struct task task;
1049 	int i;
1050 
1051 	HN_LOCK_ASSERT(sc);
1052 
1053 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1054 
1055 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1056 		rxr = &sc->hn_rx_ring[i];
1057 
1058 		if (i < sc->hn_rx_ring_inuse) {
1059 			arg.rxr = rxr;
1060 			arg.vf_ifp = vf_ifp;
1061 			vmbus_chan_run_task(rxr->hn_chan, &task);
1062 		} else {
1063 			rxr->hn_rxvf_ifp = vf_ifp;
1064 		}
1065 	}
1066 }
1067 
1068 static bool
1069 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1070 {
1071 	const struct ifnet *hn_ifp;
1072 
1073 	hn_ifp = sc->hn_ifp;
1074 
1075 	if (ifp == hn_ifp)
1076 		return (false);
1077 
1078 	if (ifp->if_alloctype != IFT_ETHER)
1079 		return (false);
1080 
1081 	/* Ignore lagg/vlan interfaces */
1082 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1083 	    strcmp(ifp->if_dname, "vlan") == 0)
1084 		return (false);
1085 
1086 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1087 		return (false);
1088 
1089 	return (true);
1090 }
1091 
1092 static void
1093 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1094 {
1095 	struct ifnet *hn_ifp;
1096 
1097 	HN_LOCK(sc);
1098 
1099 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1100 		goto out;
1101 
1102 	if (!hn_ismyvf(sc, ifp))
1103 		goto out;
1104 	hn_ifp = sc->hn_ifp;
1105 
1106 	if (rxvf) {
1107 		if (sc->hn_flags & HN_FLAG_RXVF)
1108 			goto out;
1109 
1110 		sc->hn_flags |= HN_FLAG_RXVF;
1111 		hn_rxfilter_config(sc);
1112 	} else {
1113 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1114 			goto out;
1115 
1116 		sc->hn_flags &= ~HN_FLAG_RXVF;
1117 		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1118 			hn_rxfilter_config(sc);
1119 		else
1120 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1121 	}
1122 
1123 	hn_nvs_set_datapath(sc,
1124 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1125 
1126 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1127 
1128 	if (rxvf) {
1129 		hn_suspend_mgmt(sc);
1130 		sc->hn_link_flags &=
1131 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1132 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1133 	} else {
1134 		hn_resume_mgmt(sc);
1135 	}
1136 
1137 	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1138 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1139 
1140 	if (bootverbose) {
1141 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1142 		    rxvf ? "to" : "from", ifp->if_xname);
1143 	}
1144 out:
1145 	HN_UNLOCK(sc);
1146 }
1147 
1148 static void
1149 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1150 {
1151 
1152 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1153 		return;
1154 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1155 }
1156 
1157 static void
1158 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1159 {
1160 
1161 	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1162 }
1163 
1164 static int
1165 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1166 {
1167 	struct ifnet *ifp, *vf_ifp;
1168 	uint64_t tmp;
1169 	int error;
1170 
1171 	HN_LOCK_ASSERT(sc);
1172 	ifp = sc->hn_ifp;
1173 	vf_ifp = sc->hn_vf_ifp;
1174 
1175 	/*
1176 	 * Fix up requested capabilities w/ supported capabilities,
1177 	 * since the supported capabilities could have been changed.
1178 	 */
1179 	ifr->ifr_reqcap &= ifp->if_capabilities;
1180 	/* Pass SIOCSIFCAP to VF. */
1181 	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1182 
1183 	/*
1184 	 * NOTE:
1185 	 * The error will be propagated to the callers, however, it
1186 	 * is _not_ useful here.
1187 	 */
1188 
1189 	/*
1190 	 * Merge VF's enabled capabilities.
1191 	 */
1192 	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1193 
1194 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1195 	if (ifp->if_capenable & IFCAP_TXCSUM)
1196 		ifp->if_hwassist |= tmp;
1197 	else
1198 		ifp->if_hwassist &= ~tmp;
1199 
1200 	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1201 	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1202 		ifp->if_hwassist |= tmp;
1203 	else
1204 		ifp->if_hwassist &= ~tmp;
1205 
1206 	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1207 	if (ifp->if_capenable & IFCAP_TSO4)
1208 		ifp->if_hwassist |= tmp;
1209 	else
1210 		ifp->if_hwassist &= ~tmp;
1211 
1212 	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1213 	if (ifp->if_capenable & IFCAP_TSO6)
1214 		ifp->if_hwassist |= tmp;
1215 	else
1216 		ifp->if_hwassist &= ~tmp;
1217 
1218 	return (error);
1219 }
1220 
1221 static int
1222 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1223 {
1224 	struct ifnet *vf_ifp;
1225 	struct ifreq ifr;
1226 
1227 	HN_LOCK_ASSERT(sc);
1228 	vf_ifp = sc->hn_vf_ifp;
1229 
1230 	memset(&ifr, 0, sizeof(ifr));
1231 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1232 	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1233 	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1234 	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1235 }
1236 
1237 static void
1238 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1239 {
1240 	struct ifnet *ifp = sc->hn_ifp;
1241 	int allmulti = 0;
1242 
1243 	HN_LOCK_ASSERT(sc);
1244 
1245 	/* XXX vlan(4) style mcast addr maintenance */
1246 	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1247 		allmulti = IFF_ALLMULTI;
1248 
1249 	/* Always set the VF's if_flags */
1250 	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1251 }
1252 
1253 static void
1254 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1255 {
1256 	struct rm_priotracker pt;
1257 	struct ifnet *hn_ifp = NULL;
1258 	struct mbuf *mn;
1259 
1260 	/*
1261 	 * XXX racy, if hn(4) ever detached.
1262 	 */
1263 	rm_rlock(&hn_vfmap_lock, &pt);
1264 	if (vf_ifp->if_index < hn_vfmap_size)
1265 		hn_ifp = hn_vfmap[vf_ifp->if_index];
1266 	rm_runlock(&hn_vfmap_lock, &pt);
1267 
1268 	if (hn_ifp != NULL) {
1269 		/*
1270 		 * Fix up rcvif and go through hn(4)'s if_input and
1271 		 * increase ipackets.
1272 		 */
1273 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1274 			/* Allow tapping on the VF. */
1275 			ETHER_BPF_MTAP(vf_ifp, mn);
1276 			mn->m_pkthdr.rcvif = hn_ifp;
1277 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1278 		}
1279 		hn_ifp->if_input(hn_ifp, m);
1280 	} else {
1281 		/*
1282 		 * In the middle of the transition; free this
1283 		 * mbuf chain.
1284 		 */
1285 		while (m != NULL) {
1286 			mn = m->m_nextpkt;
1287 			m->m_nextpkt = NULL;
1288 			m_freem(m);
1289 			m = mn;
1290 		}
1291 	}
1292 }
1293 
1294 static void
1295 hn_mtu_change_fixup(struct hn_softc *sc)
1296 {
1297 	struct ifnet *ifp;
1298 
1299 	HN_LOCK_ASSERT(sc);
1300 	ifp = sc->hn_ifp;
1301 
1302 	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1303 #if __FreeBSD_version >= 1100099
1304 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1305 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1306 #endif
1307 }
1308 
1309 static void
1310 hn_xpnt_vf_setready(struct hn_softc *sc)
1311 {
1312 	struct ifnet *ifp, *vf_ifp;
1313 	struct ifreq ifr;
1314 
1315 	HN_LOCK_ASSERT(sc);
1316 	ifp = sc->hn_ifp;
1317 	vf_ifp = sc->hn_vf_ifp;
1318 
1319 	/*
1320 	 * Mark the VF ready.
1321 	 */
1322 	sc->hn_vf_rdytick = 0;
1323 
1324 	/*
1325 	 * Save information for restoration.
1326 	 */
1327 	sc->hn_saved_caps = ifp->if_capabilities;
1328 	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1329 	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1330 	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1331 
1332 	/*
1333 	 * Intersect supported/enabled capabilities.
1334 	 *
1335 	 * NOTE:
1336 	 * if_hwassist is not changed here.
1337 	 */
1338 	ifp->if_capabilities &= vf_ifp->if_capabilities;
1339 	ifp->if_capenable &= ifp->if_capabilities;
1340 
1341 	/*
1342 	 * Fix TSO settings.
1343 	 */
1344 	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1345 		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1346 	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1347 		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1348 	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1349 		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1350 
1351 	/*
1352 	 * Change VF's enabled capabilities.
1353 	 */
1354 	memset(&ifr, 0, sizeof(ifr));
1355 	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1356 	ifr.ifr_reqcap = ifp->if_capenable;
1357 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1358 
1359 	if (ifp->if_mtu != ETHERMTU) {
1360 		int error;
1361 
1362 		/*
1363 		 * Change VF's MTU.
1364 		 */
1365 		memset(&ifr, 0, sizeof(ifr));
1366 		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1367 		ifr.ifr_mtu = ifp->if_mtu;
1368 		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1369 		if (error) {
1370 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1371 			    vf_ifp->if_xname, ifp->if_mtu);
1372 			if (ifp->if_mtu > ETHERMTU) {
1373 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1374 
1375 				/*
1376 				 * XXX
1377 				 * No need to adjust the synthetic parts' MTU;
1378 				 * failure of the adjustment will cause us
1379 				 * infinite headache.
1380 				 */
1381 				ifp->if_mtu = ETHERMTU;
1382 				hn_mtu_change_fixup(sc);
1383 			}
1384 		}
1385 	}
1386 }
1387 
1388 static bool
1389 hn_xpnt_vf_isready(struct hn_softc *sc)
1390 {
1391 
1392 	HN_LOCK_ASSERT(sc);
1393 
1394 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1395 		return (false);
1396 
1397 	if (sc->hn_vf_rdytick == 0)
1398 		return (true);
1399 
1400 	if (sc->hn_vf_rdytick > ticks)
1401 		return (false);
1402 
1403 	/* Mark VF as ready. */
1404 	hn_xpnt_vf_setready(sc);
1405 	return (true);
1406 }
1407 
1408 static void
1409 hn_xpnt_vf_init(struct hn_softc *sc)
1410 {
1411 	int error;
1412 
1413 	HN_LOCK_ASSERT(sc);
1414 
1415 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1416 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1417 
1418 	if (bootverbose) {
1419 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1420 		    sc->hn_vf_ifp->if_xname);
1421 	}
1422 
1423 	/*
1424 	 * Bring the VF up.
1425 	 */
1426 	hn_xpnt_vf_saveifflags(sc);
1427 	sc->hn_vf_ifp->if_flags |= IFF_UP;
1428 	error = hn_xpnt_vf_iocsetflags(sc);
1429 	if (error) {
1430 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1431 		    sc->hn_vf_ifp->if_xname, error);
1432 		return;
1433 	}
1434 
1435 	/*
1436 	 * NOTE:
1437 	 * Datapath setting must happen _after_ bringing the VF up.
1438 	 */
1439 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1440 
1441 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1442 	rm_wlock(&sc->hn_vf_lock);
1443 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1444 	rm_wunlock(&sc->hn_vf_lock);
1445 }
1446 
1447 static void
1448 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1449 {
1450 	struct hn_softc *sc = xsc;
1451 
1452 	HN_LOCK(sc);
1453 
1454 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1455 		goto done;
1456 	if (sc->hn_vf_ifp == NULL)
1457 		goto done;
1458 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1459 		goto done;
1460 
1461 	if (sc->hn_vf_rdytick != 0) {
1462 		/* Mark VF as ready. */
1463 		hn_xpnt_vf_setready(sc);
1464 	}
1465 
1466 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1467 		/*
1468 		 * Delayed VF initialization.
1469 		 */
1470 		if (bootverbose) {
1471 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1472 			    sc->hn_vf_ifp->if_xname);
1473 		}
1474 		hn_xpnt_vf_init(sc);
1475 	}
1476 done:
1477 	HN_UNLOCK(sc);
1478 }
1479 
1480 static void
1481 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1482 {
1483 	struct hn_softc *sc = xsc;
1484 
1485 	HN_LOCK(sc);
1486 
1487 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1488 		goto done;
1489 
1490 	if (!hn_ismyvf(sc, ifp))
1491 		goto done;
1492 
1493 	if (sc->hn_vf_ifp != NULL) {
1494 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1495 		    sc->hn_vf_ifp->if_xname);
1496 		goto done;
1497 	}
1498 
1499 	if (hn_xpnt_vf && ifp->if_start != NULL) {
1500 		/*
1501 		 * ifnet.if_start is _not_ supported by transparent
1502 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1503 		 */
1504 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1505 		    "in transparent VF mode.\n", ifp->if_xname);
1506 		goto done;
1507 	}
1508 
1509 	rm_wlock(&hn_vfmap_lock);
1510 
1511 	if (ifp->if_index >= hn_vfmap_size) {
1512 		struct ifnet **newmap;
1513 		int newsize;
1514 
1515 		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1516 		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1517 		    M_WAITOK | M_ZERO);
1518 
1519 		memcpy(newmap, hn_vfmap,
1520 		    sizeof(struct ifnet *) * hn_vfmap_size);
1521 		free(hn_vfmap, M_DEVBUF);
1522 		hn_vfmap = newmap;
1523 		hn_vfmap_size = newsize;
1524 	}
1525 	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1526 	    ("%s: ifindex %d was mapped to %s",
1527 	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1528 	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1529 
1530 	rm_wunlock(&hn_vfmap_lock);
1531 
1532 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1533 	rm_wlock(&sc->hn_vf_lock);
1534 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1535 	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1536 	sc->hn_vf_ifp = ifp;
1537 	rm_wunlock(&sc->hn_vf_lock);
1538 
1539 	if (hn_xpnt_vf) {
1540 		int wait_ticks;
1541 
1542 		/*
1543 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1544 		 * Save vf_ifp's current if_input for later restoration.
1545 		 */
1546 		sc->hn_vf_input = ifp->if_input;
1547 		ifp->if_input = hn_xpnt_vf_input;
1548 
1549 		/*
1550 		 * Stop link status management; use the VF's.
1551 		 */
1552 		hn_suspend_mgmt(sc);
1553 
1554 		/*
1555 		 * Give VF sometime to complete its attach routing.
1556 		 */
1557 		wait_ticks = hn_xpnt_vf_attwait * hz;
1558 		sc->hn_vf_rdytick = ticks + wait_ticks;
1559 
1560 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1561 		    wait_ticks);
1562 	}
1563 done:
1564 	HN_UNLOCK(sc);
1565 }
1566 
1567 static void
1568 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1569 {
1570 	struct hn_softc *sc = xsc;
1571 
1572 	HN_LOCK(sc);
1573 
1574 	if (sc->hn_vf_ifp == NULL)
1575 		goto done;
1576 
1577 	if (!hn_ismyvf(sc, ifp))
1578 		goto done;
1579 
1580 	if (hn_xpnt_vf) {
1581 		/*
1582 		 * Make sure that the delayed initialization is not running.
1583 		 *
1584 		 * NOTE:
1585 		 * - This lock _must_ be released, since the hn_vf_init task
1586 		 *   will try holding this lock.
1587 		 * - It is safe to release this lock here, since the
1588 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1589 		 *
1590 		 * XXX racy, if hn(4) ever detached.
1591 		 */
1592 		HN_UNLOCK(sc);
1593 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1594 		HN_LOCK(sc);
1595 
1596 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1597 		    sc->hn_ifp->if_xname));
1598 		ifp->if_input = sc->hn_vf_input;
1599 		sc->hn_vf_input = NULL;
1600 
1601 		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1602 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1603 
1604 		if (sc->hn_vf_rdytick == 0) {
1605 			/*
1606 			 * The VF was ready; restore some settings.
1607 			 */
1608 			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
1609 			/*
1610 			 * NOTE:
1611 			 * There is _no_ need to fixup if_capenable and
1612 			 * if_hwassist, since the if_capabilities before
1613 			 * restoration was an intersection of the VF's
1614 			 * if_capabilites and the synthetic device's
1615 			 * if_capabilites.
1616 			 */
1617 			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
1618 			sc->hn_ifp->if_hw_tsomaxsegcount =
1619 			    sc->hn_saved_tsosegcnt;
1620 			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
1621 		}
1622 
1623 		/*
1624 		 * Resume link status management, which was suspended
1625 		 * by hn_ifnet_attevent().
1626 		 */
1627 		hn_resume_mgmt(sc);
1628 	}
1629 
1630 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1631 	rm_wlock(&sc->hn_vf_lock);
1632 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1633 	sc->hn_vf_ifp = NULL;
1634 	rm_wunlock(&sc->hn_vf_lock);
1635 
1636 	rm_wlock(&hn_vfmap_lock);
1637 
1638 	KASSERT(ifp->if_index < hn_vfmap_size,
1639 	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
1640 	if (hn_vfmap[ifp->if_index] != NULL) {
1641 		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
1642 		    ("%s: ifindex %d was mapped to %s",
1643 		     ifp->if_xname, ifp->if_index,
1644 		     hn_vfmap[ifp->if_index]->if_xname));
1645 		hn_vfmap[ifp->if_index] = NULL;
1646 	}
1647 
1648 	rm_wunlock(&hn_vfmap_lock);
1649 done:
1650 	HN_UNLOCK(sc);
1651 }
1652 
1653 static void
1654 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
1655 {
1656 	struct hn_softc *sc = xsc;
1657 
1658 	if (sc->hn_vf_ifp == ifp)
1659 		if_link_state_change(sc->hn_ifp, link_state);
1660 }
1661 
1662 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1663 static const struct hyperv_guid g_net_vsc_device_type = {
1664 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1665 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1666 };
1667 
1668 static int
1669 hn_probe(device_t dev)
1670 {
1671 
1672 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1673 	    &g_net_vsc_device_type) == 0) {
1674 		device_set_desc(dev, "Hyper-V Network Interface");
1675 		return BUS_PROBE_DEFAULT;
1676 	}
1677 	return ENXIO;
1678 }
1679 
1680 static int
1681 hn_attach(device_t dev)
1682 {
1683 	struct hn_softc *sc = device_get_softc(dev);
1684 	struct sysctl_oid_list *child;
1685 	struct sysctl_ctx_list *ctx;
1686 	uint8_t eaddr[ETHER_ADDR_LEN];
1687 	struct ifnet *ifp = NULL;
1688 	int error, ring_cnt, tx_ring_cnt;
1689 
1690 	sc->hn_dev = dev;
1691 	sc->hn_prichan = vmbus_get_channel(dev);
1692 	HN_LOCK_INIT(sc);
1693 	rm_init(&sc->hn_vf_lock, "hnvf");
1694 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
1695 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
1696 
1697 	/*
1698 	 * Initialize these tunables once.
1699 	 */
1700 	sc->hn_agg_size = hn_tx_agg_size;
1701 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1702 
1703 	/*
1704 	 * Setup taskqueue for transmission.
1705 	 */
1706 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1707 		int i;
1708 
1709 		sc->hn_tx_taskqs =
1710 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1711 		    M_DEVBUF, M_WAITOK);
1712 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1713 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1714 			    M_WAITOK, taskqueue_thread_enqueue,
1715 			    &sc->hn_tx_taskqs[i]);
1716 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1717 			    "%s tx%d", device_get_nameunit(dev), i);
1718 		}
1719 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1720 		sc->hn_tx_taskqs = hn_tx_taskque;
1721 	}
1722 
1723 	/*
1724 	 * Setup taskqueue for mangement tasks, e.g. link status.
1725 	 */
1726 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1727 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1728 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1729 	    device_get_nameunit(dev));
1730 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1731 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1732 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1733 	    hn_netchg_status_taskfunc, sc);
1734 
1735 	if (hn_xpnt_vf) {
1736 		/*
1737 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
1738 		 */
1739 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
1740 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
1741 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
1742 		    device_get_nameunit(dev));
1743 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
1744 		    hn_xpnt_vf_init_taskfunc, sc);
1745 	}
1746 
1747 	/*
1748 	 * Allocate ifnet and setup its name earlier, so that if_printf
1749 	 * can be used by functions, which will be called after
1750 	 * ether_ifattach().
1751 	 */
1752 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1753 	ifp->if_softc = sc;
1754 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1755 
1756 	/*
1757 	 * Initialize ifmedia earlier so that it can be unconditionally
1758 	 * destroyed, if error happened later on.
1759 	 */
1760 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1761 
1762 	/*
1763 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1764 	 * to use (tx_ring_cnt).
1765 	 *
1766 	 * NOTE:
1767 	 * The # of RX rings to use is same as the # of channels to use.
1768 	 */
1769 	ring_cnt = hn_chan_cnt;
1770 	if (ring_cnt <= 0) {
1771 		/* Default */
1772 		ring_cnt = mp_ncpus;
1773 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1774 			ring_cnt = HN_RING_CNT_DEF_MAX;
1775 	} else if (ring_cnt > mp_ncpus) {
1776 		ring_cnt = mp_ncpus;
1777 	}
1778 #ifdef RSS
1779 	if (ring_cnt > rss_getnumbuckets())
1780 		ring_cnt = rss_getnumbuckets();
1781 #endif
1782 
1783 	tx_ring_cnt = hn_tx_ring_cnt;
1784 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1785 		tx_ring_cnt = ring_cnt;
1786 #ifdef HN_IFSTART_SUPPORT
1787 	if (hn_use_if_start) {
1788 		/* ifnet.if_start only needs one TX ring. */
1789 		tx_ring_cnt = 1;
1790 	}
1791 #endif
1792 
1793 	/*
1794 	 * Set the leader CPU for channels.
1795 	 */
1796 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1797 
1798 	/*
1799 	 * Create enough TX/RX rings, even if only limited number of
1800 	 * channels can be allocated.
1801 	 */
1802 	error = hn_create_tx_data(sc, tx_ring_cnt);
1803 	if (error)
1804 		goto failed;
1805 	error = hn_create_rx_data(sc, ring_cnt);
1806 	if (error)
1807 		goto failed;
1808 
1809 	/*
1810 	 * Create transaction context for NVS and RNDIS transactions.
1811 	 */
1812 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1813 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1814 	if (sc->hn_xact == NULL) {
1815 		error = ENXIO;
1816 		goto failed;
1817 	}
1818 
1819 	/*
1820 	 * Install orphan handler for the revocation of this device's
1821 	 * primary channel.
1822 	 *
1823 	 * NOTE:
1824 	 * The processing order is critical here:
1825 	 * Install the orphan handler, _before_ testing whether this
1826 	 * device's primary channel has been revoked or not.
1827 	 */
1828 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1829 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1830 		error = ENXIO;
1831 		goto failed;
1832 	}
1833 
1834 	/*
1835 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1836 	 */
1837 	error = hn_synth_attach(sc, ETHERMTU);
1838 	if (error)
1839 		goto failed;
1840 
1841 	error = hn_rndis_get_eaddr(sc, eaddr);
1842 	if (error)
1843 		goto failed;
1844 
1845 #if __FreeBSD_version >= 1100099
1846 	if (sc->hn_rx_ring_inuse > 1) {
1847 		/*
1848 		 * Reduce TCP segment aggregation limit for multiple
1849 		 * RX rings to increase ACK timeliness.
1850 		 */
1851 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1852 	}
1853 #endif
1854 
1855 	/*
1856 	 * Fixup TX stuffs after synthetic parts are attached.
1857 	 */
1858 	hn_fixup_tx_data(sc);
1859 
1860 	ctx = device_get_sysctl_ctx(dev);
1861 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1862 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1863 	    &sc->hn_nvs_ver, 0, "NVS version");
1864 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1865 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1866 	    hn_ndis_version_sysctl, "A", "NDIS version");
1867 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1868 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1869 	    hn_caps_sysctl, "A", "capabilities");
1870 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1871 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1872 	    hn_hwassist_sysctl, "A", "hwassist");
1873 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
1874 	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
1875 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
1876 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
1877 	    "max # of TSO segments");
1878 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
1879 	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
1880 	    "max size of TSO segment");
1881 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1882 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1883 	    hn_rxfilter_sysctl, "A", "rxfilter");
1884 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1885 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1886 	    hn_rss_hash_sysctl, "A", "RSS hash");
1887 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1888 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1889 #ifndef RSS
1890 	/*
1891 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1892 	 */
1893 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1894 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1895 	    hn_rss_key_sysctl, "IU", "RSS key");
1896 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1897 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1898 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1899 #endif
1900 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1901 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1902 	    "RNDIS offered packet transmission aggregation size limit");
1903 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1904 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1905 	    "RNDIS offered packet transmission aggregation count limit");
1906 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1907 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1908 	    "RNDIS packet transmission aggregation alignment");
1909 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1910 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1911 	    hn_txagg_size_sysctl, "I",
1912 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1913 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1914 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1915 	    hn_txagg_pkts_sysctl, "I",
1916 	    "Packet transmission aggregation packets, "
1917 	    "0 -- disable, -1 -- auto");
1918 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1919 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1920 	    hn_polling_sysctl, "I",
1921 	    "Polling frequency: [100,1000000], 0 disable polling");
1922 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1923 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1924 	    hn_vf_sysctl, "A", "Virtual Function's name");
1925 	if (!hn_xpnt_vf) {
1926 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
1927 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1928 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
1929 	} else {
1930 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
1931 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1932 		    hn_xpnt_vf_enabled_sysctl, "I",
1933 		    "Transparent VF enabled");
1934 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
1935 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1936 		    hn_xpnt_vf_accbpf_sysctl, "I",
1937 		    "Accurate BPF for transparent VF");
1938 	}
1939 
1940 	/*
1941 	 * Setup the ifmedia, which has been initialized earlier.
1942 	 */
1943 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1944 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1945 	/* XXX ifmedia_set really should do this for us */
1946 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1947 
1948 	/*
1949 	 * Setup the ifnet for this interface.
1950 	 */
1951 
1952 	ifp->if_baudrate = IF_Gbps(10);
1953 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1954 	ifp->if_ioctl = hn_ioctl;
1955 	ifp->if_init = hn_init;
1956 #ifdef HN_IFSTART_SUPPORT
1957 	if (hn_use_if_start) {
1958 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1959 
1960 		ifp->if_start = hn_start;
1961 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1962 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1963 		IFQ_SET_READY(&ifp->if_snd);
1964 	} else
1965 #endif
1966 	{
1967 		ifp->if_transmit = hn_transmit;
1968 		ifp->if_qflush = hn_xmit_qflush;
1969 	}
1970 
1971 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
1972 #ifdef foo
1973 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
1974 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1975 #endif
1976 	if (sc->hn_caps & HN_CAP_VLAN) {
1977 		/* XXX not sure about VLAN_MTU. */
1978 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1979 	}
1980 
1981 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1982 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1983 		ifp->if_capabilities |= IFCAP_TXCSUM;
1984 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1985 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1986 	if (sc->hn_caps & HN_CAP_TSO4) {
1987 		ifp->if_capabilities |= IFCAP_TSO4;
1988 		ifp->if_hwassist |= CSUM_IP_TSO;
1989 	}
1990 	if (sc->hn_caps & HN_CAP_TSO6) {
1991 		ifp->if_capabilities |= IFCAP_TSO6;
1992 		ifp->if_hwassist |= CSUM_IP6_TSO;
1993 	}
1994 
1995 	/* Enable all available capabilities by default. */
1996 	ifp->if_capenable = ifp->if_capabilities;
1997 
1998 	/*
1999 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2000 	 * be enabled through SIOCSIFCAP.
2001 	 */
2002 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2003 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2004 
2005 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2006 		/*
2007 		 * Lock hn_set_tso_maxsize() to simplify its
2008 		 * internal logic.
2009 		 */
2010 		HN_LOCK(sc);
2011 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2012 		HN_UNLOCK(sc);
2013 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2014 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2015 	}
2016 
2017 	ether_ifattach(ifp, eaddr);
2018 
2019 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2020 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2021 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2022 	}
2023 
2024 	/* Inform the upper layer about the long frame support. */
2025 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2026 
2027 	/*
2028 	 * Kick off link status check.
2029 	 */
2030 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2031 	hn_update_link_status(sc);
2032 
2033 	if (!hn_xpnt_vf) {
2034 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2035 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2036 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2037 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2038 	} else {
2039 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2040 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2041 	}
2042 
2043 	/*
2044 	 * NOTE:
2045 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2046 	 * since interface's LLADDR is needed; interface LLADDR is not
2047 	 * available when ifnet_arrival event is triggered.
2048 	 */
2049 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2050 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2051 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2052 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2053 
2054 	return (0);
2055 failed:
2056 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2057 		hn_synth_detach(sc);
2058 	hn_detach(dev);
2059 	return (error);
2060 }
2061 
2062 static int
2063 hn_detach(device_t dev)
2064 {
2065 	struct hn_softc *sc = device_get_softc(dev);
2066 	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2067 
2068 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2069 		/*
2070 		 * In case that the vmbus missed the orphan handler
2071 		 * installation.
2072 		 */
2073 		vmbus_xact_ctx_orphan(sc->hn_xact);
2074 	}
2075 
2076 	if (sc->hn_ifaddr_evthand != NULL)
2077 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2078 	if (sc->hn_ifnet_evthand != NULL)
2079 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2080 	if (sc->hn_ifnet_atthand != NULL) {
2081 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2082 		    sc->hn_ifnet_atthand);
2083 	}
2084 	if (sc->hn_ifnet_dethand != NULL) {
2085 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2086 		    sc->hn_ifnet_dethand);
2087 	}
2088 	if (sc->hn_ifnet_lnkhand != NULL)
2089 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2090 
2091 	vf_ifp = sc->hn_vf_ifp;
2092 	__compiler_membar();
2093 	if (vf_ifp != NULL)
2094 		hn_ifnet_detevent(sc, vf_ifp);
2095 
2096 	if (device_is_attached(dev)) {
2097 		HN_LOCK(sc);
2098 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2099 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2100 				hn_stop(sc, true);
2101 			/*
2102 			 * NOTE:
2103 			 * hn_stop() only suspends data, so managment
2104 			 * stuffs have to be suspended manually here.
2105 			 */
2106 			hn_suspend_mgmt(sc);
2107 			hn_synth_detach(sc);
2108 		}
2109 		HN_UNLOCK(sc);
2110 		ether_ifdetach(ifp);
2111 	}
2112 
2113 	ifmedia_removeall(&sc->hn_media);
2114 	hn_destroy_rx_data(sc);
2115 	hn_destroy_tx_data(sc);
2116 
2117 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2118 		int i;
2119 
2120 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2121 			taskqueue_free(sc->hn_tx_taskqs[i]);
2122 		free(sc->hn_tx_taskqs, M_DEVBUF);
2123 	}
2124 	taskqueue_free(sc->hn_mgmt_taskq0);
2125 	if (sc->hn_vf_taskq != NULL)
2126 		taskqueue_free(sc->hn_vf_taskq);
2127 
2128 	if (sc->hn_xact != NULL) {
2129 		/*
2130 		 * Uninstall the orphan handler _before_ the xact is
2131 		 * destructed.
2132 		 */
2133 		vmbus_chan_unset_orphan(sc->hn_prichan);
2134 		vmbus_xact_ctx_destroy(sc->hn_xact);
2135 	}
2136 
2137 	if_free(ifp);
2138 
2139 	HN_LOCK_DESTROY(sc);
2140 	rm_destroy(&sc->hn_vf_lock);
2141 	return (0);
2142 }
2143 
2144 static int
2145 hn_shutdown(device_t dev)
2146 {
2147 
2148 	return (0);
2149 }
2150 
2151 static void
2152 hn_link_status(struct hn_softc *sc)
2153 {
2154 	uint32_t link_status;
2155 	int error;
2156 
2157 	error = hn_rndis_get_linkstatus(sc, &link_status);
2158 	if (error) {
2159 		/* XXX what to do? */
2160 		return;
2161 	}
2162 
2163 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2164 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2165 	else
2166 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2167 	if_link_state_change(sc->hn_ifp,
2168 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2169 	    LINK_STATE_UP : LINK_STATE_DOWN);
2170 }
2171 
2172 static void
2173 hn_link_taskfunc(void *xsc, int pending __unused)
2174 {
2175 	struct hn_softc *sc = xsc;
2176 
2177 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2178 		return;
2179 	hn_link_status(sc);
2180 }
2181 
2182 static void
2183 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2184 {
2185 	struct hn_softc *sc = xsc;
2186 
2187 	/* Prevent any link status checks from running. */
2188 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2189 
2190 	/*
2191 	 * Fake up a [link down --> link up] state change; 5 seconds
2192 	 * delay is used, which closely simulates miibus reaction
2193 	 * upon link down event.
2194 	 */
2195 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2196 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2197 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2198 	    &sc->hn_netchg_status, 5 * hz);
2199 }
2200 
2201 static void
2202 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2203 {
2204 	struct hn_softc *sc = xsc;
2205 
2206 	/* Re-allow link status checks. */
2207 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2208 	hn_link_status(sc);
2209 }
2210 
2211 static void
2212 hn_update_link_status(struct hn_softc *sc)
2213 {
2214 
2215 	if (sc->hn_mgmt_taskq != NULL)
2216 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2217 }
2218 
2219 static void
2220 hn_change_network(struct hn_softc *sc)
2221 {
2222 
2223 	if (sc->hn_mgmt_taskq != NULL)
2224 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2225 }
2226 
2227 static __inline int
2228 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2229     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2230 {
2231 	struct mbuf *m = *m_head;
2232 	int error;
2233 
2234 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2235 
2236 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2237 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2238 	if (error == EFBIG) {
2239 		struct mbuf *m_new;
2240 
2241 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2242 		if (m_new == NULL)
2243 			return ENOBUFS;
2244 		else
2245 			*m_head = m = m_new;
2246 		txr->hn_tx_collapsed++;
2247 
2248 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2249 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2250 	}
2251 	if (!error) {
2252 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2253 		    BUS_DMASYNC_PREWRITE);
2254 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2255 	}
2256 	return error;
2257 }
2258 
2259 static __inline int
2260 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2261 {
2262 
2263 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2264 	    ("put an onlist txd %#x", txd->flags));
2265 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2266 	    ("put an onagg txd %#x", txd->flags));
2267 
2268 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2269 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2270 		return 0;
2271 
2272 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2273 		struct hn_txdesc *tmp_txd;
2274 
2275 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2276 			int freed;
2277 
2278 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2279 			    ("resursive aggregation on aggregated txdesc"));
2280 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2281 			    ("not aggregated txdesc"));
2282 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2283 			    ("aggregated txdesc uses dmamap"));
2284 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2285 			    ("aggregated txdesc consumes "
2286 			     "chimney sending buffer"));
2287 			KASSERT(tmp_txd->chim_size == 0,
2288 			    ("aggregated txdesc has non-zero "
2289 			     "chimney sending size"));
2290 
2291 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2292 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2293 			freed = hn_txdesc_put(txr, tmp_txd);
2294 			KASSERT(freed, ("failed to free aggregated txdesc"));
2295 		}
2296 	}
2297 
2298 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2299 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2300 		    ("chim txd uses dmamap"));
2301 		hn_chim_free(txr->hn_sc, txd->chim_index);
2302 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2303 		txd->chim_size = 0;
2304 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2305 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2306 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2307 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2308 		    txd->data_dmap);
2309 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2310 	}
2311 
2312 	if (txd->m != NULL) {
2313 		m_freem(txd->m);
2314 		txd->m = NULL;
2315 	}
2316 
2317 	txd->flags |= HN_TXD_FLAG_ONLIST;
2318 #ifndef HN_USE_TXDESC_BUFRING
2319 	mtx_lock_spin(&txr->hn_txlist_spin);
2320 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2321 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2322 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2323 	txr->hn_txdesc_avail++;
2324 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2325 	mtx_unlock_spin(&txr->hn_txlist_spin);
2326 #else	/* HN_USE_TXDESC_BUFRING */
2327 #ifdef HN_DEBUG
2328 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2329 #endif
2330 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2331 #endif	/* !HN_USE_TXDESC_BUFRING */
2332 
2333 	return 1;
2334 }
2335 
2336 static __inline struct hn_txdesc *
2337 hn_txdesc_get(struct hn_tx_ring *txr)
2338 {
2339 	struct hn_txdesc *txd;
2340 
2341 #ifndef HN_USE_TXDESC_BUFRING
2342 	mtx_lock_spin(&txr->hn_txlist_spin);
2343 	txd = SLIST_FIRST(&txr->hn_txlist);
2344 	if (txd != NULL) {
2345 		KASSERT(txr->hn_txdesc_avail > 0,
2346 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2347 		txr->hn_txdesc_avail--;
2348 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2349 	}
2350 	mtx_unlock_spin(&txr->hn_txlist_spin);
2351 #else
2352 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2353 #endif
2354 
2355 	if (txd != NULL) {
2356 #ifdef HN_USE_TXDESC_BUFRING
2357 #ifdef HN_DEBUG
2358 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2359 #endif
2360 #endif	/* HN_USE_TXDESC_BUFRING */
2361 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2362 		    STAILQ_EMPTY(&txd->agg_list) &&
2363 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2364 		    txd->chim_size == 0 &&
2365 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2366 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2367 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2368 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2369 		txd->refs = 1;
2370 	}
2371 	return txd;
2372 }
2373 
2374 static __inline void
2375 hn_txdesc_hold(struct hn_txdesc *txd)
2376 {
2377 
2378 	/* 0->1 transition will never work */
2379 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2380 	atomic_add_int(&txd->refs, 1);
2381 }
2382 
2383 static __inline void
2384 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2385 {
2386 
2387 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2388 	    ("recursive aggregation on aggregating txdesc"));
2389 
2390 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2391 	    ("already aggregated"));
2392 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2393 	    ("recursive aggregation on to-be-aggregated txdesc"));
2394 
2395 	txd->flags |= HN_TXD_FLAG_ONAGG;
2396 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2397 }
2398 
2399 static bool
2400 hn_tx_ring_pending(struct hn_tx_ring *txr)
2401 {
2402 	bool pending = false;
2403 
2404 #ifndef HN_USE_TXDESC_BUFRING
2405 	mtx_lock_spin(&txr->hn_txlist_spin);
2406 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2407 		pending = true;
2408 	mtx_unlock_spin(&txr->hn_txlist_spin);
2409 #else
2410 	if (!buf_ring_full(txr->hn_txdesc_br))
2411 		pending = true;
2412 #endif
2413 	return (pending);
2414 }
2415 
2416 static __inline void
2417 hn_txeof(struct hn_tx_ring *txr)
2418 {
2419 	txr->hn_has_txeof = 0;
2420 	txr->hn_txeof(txr);
2421 }
2422 
2423 static void
2424 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2425     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2426 {
2427 	struct hn_txdesc *txd = sndc->hn_cbarg;
2428 	struct hn_tx_ring *txr;
2429 
2430 	txr = txd->txr;
2431 	KASSERT(txr->hn_chan == chan,
2432 	    ("channel mismatch, on chan%u, should be chan%u",
2433 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2434 
2435 	txr->hn_has_txeof = 1;
2436 	hn_txdesc_put(txr, txd);
2437 
2438 	++txr->hn_txdone_cnt;
2439 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2440 		txr->hn_txdone_cnt = 0;
2441 		if (txr->hn_oactive)
2442 			hn_txeof(txr);
2443 	}
2444 }
2445 
2446 static void
2447 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2448 {
2449 #if defined(INET) || defined(INET6)
2450 	tcp_lro_flush_all(&rxr->hn_lro);
2451 #endif
2452 
2453 	/*
2454 	 * NOTE:
2455 	 * 'txr' could be NULL, if multiple channels and
2456 	 * ifnet.if_start method are enabled.
2457 	 */
2458 	if (txr == NULL || !txr->hn_has_txeof)
2459 		return;
2460 
2461 	txr->hn_txdone_cnt = 0;
2462 	hn_txeof(txr);
2463 }
2464 
2465 static __inline uint32_t
2466 hn_rndis_pktmsg_offset(uint32_t ofs)
2467 {
2468 
2469 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2470 	    ("invalid RNDIS packet msg offset %u", ofs));
2471 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2472 }
2473 
2474 static __inline void *
2475 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2476     size_t pi_dlen, uint32_t pi_type)
2477 {
2478 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2479 	struct rndis_pktinfo *pi;
2480 
2481 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2482 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2483 
2484 	/*
2485 	 * Per-packet-info does not move; it only grows.
2486 	 *
2487 	 * NOTE:
2488 	 * rm_pktinfooffset in this phase counts from the beginning
2489 	 * of rndis_packet_msg.
2490 	 */
2491 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2492 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2493 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2494 	    pkt->rm_pktinfolen);
2495 	pkt->rm_pktinfolen += pi_size;
2496 
2497 	pi->rm_size = pi_size;
2498 	pi->rm_type = pi_type;
2499 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2500 
2501 	return (pi->rm_data);
2502 }
2503 
2504 static __inline int
2505 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2506 {
2507 	struct hn_txdesc *txd;
2508 	struct mbuf *m;
2509 	int error, pkts;
2510 
2511 	txd = txr->hn_agg_txd;
2512 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2513 
2514 	/*
2515 	 * Since hn_txpkt() will reset this temporary stat, save
2516 	 * it now, so that oerrors can be updated properly, if
2517 	 * hn_txpkt() ever fails.
2518 	 */
2519 	pkts = txr->hn_stat_pkts;
2520 
2521 	/*
2522 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2523 	 * failure, save it for later freeing, if hn_txpkt() ever
2524 	 * fails.
2525 	 */
2526 	m = txd->m;
2527 	error = hn_txpkt(ifp, txr, txd);
2528 	if (__predict_false(error)) {
2529 		/* txd is freed, but m is not. */
2530 		m_freem(m);
2531 
2532 		txr->hn_flush_failed++;
2533 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2534 	}
2535 
2536 	/* Reset all aggregation states. */
2537 	txr->hn_agg_txd = NULL;
2538 	txr->hn_agg_szleft = 0;
2539 	txr->hn_agg_pktleft = 0;
2540 	txr->hn_agg_prevpkt = NULL;
2541 
2542 	return (error);
2543 }
2544 
2545 static void *
2546 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2547     int pktsize)
2548 {
2549 	void *chim;
2550 
2551 	if (txr->hn_agg_txd != NULL) {
2552 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2553 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2554 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2555 			int olen;
2556 
2557 			/*
2558 			 * Update the previous RNDIS packet's total length,
2559 			 * it can be increased due to the mandatory alignment
2560 			 * padding for this RNDIS packet.  And update the
2561 			 * aggregating txdesc's chimney sending buffer size
2562 			 * accordingly.
2563 			 *
2564 			 * XXX
2565 			 * Zero-out the padding, as required by the RNDIS spec.
2566 			 */
2567 			olen = pkt->rm_len;
2568 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2569 			agg_txd->chim_size += pkt->rm_len - olen;
2570 
2571 			/* Link this txdesc to the parent. */
2572 			hn_txdesc_agg(agg_txd, txd);
2573 
2574 			chim = (uint8_t *)pkt + pkt->rm_len;
2575 			/* Save the current packet for later fixup. */
2576 			txr->hn_agg_prevpkt = chim;
2577 
2578 			txr->hn_agg_pktleft--;
2579 			txr->hn_agg_szleft -= pktsize;
2580 			if (txr->hn_agg_szleft <=
2581 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2582 				/*
2583 				 * Probably can't aggregate more packets,
2584 				 * flush this aggregating txdesc proactively.
2585 				 */
2586 				txr->hn_agg_pktleft = 0;
2587 			}
2588 			/* Done! */
2589 			return (chim);
2590 		}
2591 		hn_flush_txagg(ifp, txr);
2592 	}
2593 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2594 
2595 	txr->hn_tx_chimney_tried++;
2596 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
2597 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2598 		return (NULL);
2599 	txr->hn_tx_chimney++;
2600 
2601 	chim = txr->hn_sc->hn_chim +
2602 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2603 
2604 	if (txr->hn_agg_pktmax > 1 &&
2605 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2606 		txr->hn_agg_txd = txd;
2607 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2608 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2609 		txr->hn_agg_prevpkt = chim;
2610 	}
2611 	return (chim);
2612 }
2613 
2614 /*
2615  * NOTE:
2616  * If this function fails, then both txd and m_head0 will be freed.
2617  */
2618 static int
2619 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2620     struct mbuf **m_head0)
2621 {
2622 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2623 	int error, nsegs, i;
2624 	struct mbuf *m_head = *m_head0;
2625 	struct rndis_packet_msg *pkt;
2626 	uint32_t *pi_data;
2627 	void *chim = NULL;
2628 	int pkt_hlen, pkt_size;
2629 
2630 	pkt = txd->rndis_pkt;
2631 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2632 	if (pkt_size < txr->hn_chim_size) {
2633 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2634 		if (chim != NULL)
2635 			pkt = chim;
2636 	} else {
2637 		if (txr->hn_agg_txd != NULL)
2638 			hn_flush_txagg(ifp, txr);
2639 	}
2640 
2641 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2642 	pkt->rm_len = m_head->m_pkthdr.len;
2643 	pkt->rm_dataoffset = 0;
2644 	pkt->rm_datalen = m_head->m_pkthdr.len;
2645 	pkt->rm_oobdataoffset = 0;
2646 	pkt->rm_oobdatalen = 0;
2647 	pkt->rm_oobdataelements = 0;
2648 	pkt->rm_pktinfooffset = sizeof(*pkt);
2649 	pkt->rm_pktinfolen = 0;
2650 	pkt->rm_vchandle = 0;
2651 	pkt->rm_reserved = 0;
2652 
2653 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2654 		/*
2655 		 * Set the hash value for this packet, so that the host could
2656 		 * dispatch the TX done event for this packet back to this TX
2657 		 * ring's channel.
2658 		 */
2659 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2660 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2661 		*pi_data = txr->hn_tx_idx;
2662 	}
2663 
2664 	if (m_head->m_flags & M_VLANTAG) {
2665 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2666 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2667 		*pi_data = NDIS_VLAN_INFO_MAKE(
2668 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2669 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2670 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2671 	}
2672 
2673 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2674 #if defined(INET6) || defined(INET)
2675 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2676 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2677 #ifdef INET
2678 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2679 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2680 			    m_head->m_pkthdr.tso_segsz);
2681 		}
2682 #endif
2683 #if defined(INET6) && defined(INET)
2684 		else
2685 #endif
2686 #ifdef INET6
2687 		{
2688 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2689 			    m_head->m_pkthdr.tso_segsz);
2690 		}
2691 #endif
2692 #endif	/* INET6 || INET */
2693 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2694 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2695 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2696 		if (m_head->m_pkthdr.csum_flags &
2697 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2698 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
2699 		} else {
2700 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
2701 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2702 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
2703 		}
2704 
2705 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2706 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2707 		else if (m_head->m_pkthdr.csum_flags &
2708 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
2709 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2710 	}
2711 
2712 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2713 	/* Fixup RNDIS packet message total length */
2714 	pkt->rm_len += pkt_hlen;
2715 	/* Convert RNDIS packet message offsets */
2716 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2717 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2718 
2719 	/*
2720 	 * Fast path: Chimney sending.
2721 	 */
2722 	if (chim != NULL) {
2723 		struct hn_txdesc *tgt_txd = txd;
2724 
2725 		if (txr->hn_agg_txd != NULL) {
2726 			tgt_txd = txr->hn_agg_txd;
2727 #ifdef INVARIANTS
2728 			*m_head0 = NULL;
2729 #endif
2730 		}
2731 
2732 		KASSERT(pkt == chim,
2733 		    ("RNDIS pkt not in chimney sending buffer"));
2734 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2735 		    ("chimney sending buffer is not used"));
2736 		tgt_txd->chim_size += pkt->rm_len;
2737 
2738 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2739 		    ((uint8_t *)chim) + pkt_hlen);
2740 
2741 		txr->hn_gpa_cnt = 0;
2742 		txr->hn_sendpkt = hn_txpkt_chim;
2743 		goto done;
2744 	}
2745 
2746 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2747 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2748 	    ("chimney buffer is used"));
2749 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2750 
2751 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2752 	if (__predict_false(error)) {
2753 		int freed;
2754 
2755 		/*
2756 		 * This mbuf is not linked w/ the txd yet, so free it now.
2757 		 */
2758 		m_freem(m_head);
2759 		*m_head0 = NULL;
2760 
2761 		freed = hn_txdesc_put(txr, txd);
2762 		KASSERT(freed != 0,
2763 		    ("fail to free txd upon txdma error"));
2764 
2765 		txr->hn_txdma_failed++;
2766 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2767 		return error;
2768 	}
2769 	*m_head0 = m_head;
2770 
2771 	/* +1 RNDIS packet message */
2772 	txr->hn_gpa_cnt = nsegs + 1;
2773 
2774 	/* send packet with page buffer */
2775 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2776 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2777 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2778 
2779 	/*
2780 	 * Fill the page buffers with mbuf info after the page
2781 	 * buffer for RNDIS packet message.
2782 	 */
2783 	for (i = 0; i < nsegs; ++i) {
2784 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2785 
2786 		gpa->gpa_page = atop(segs[i].ds_addr);
2787 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2788 		gpa->gpa_len = segs[i].ds_len;
2789 	}
2790 
2791 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2792 	txd->chim_size = 0;
2793 	txr->hn_sendpkt = hn_txpkt_sglist;
2794 done:
2795 	txd->m = m_head;
2796 
2797 	/* Set the completion routine */
2798 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2799 
2800 	/* Update temporary stats for later use. */
2801 	txr->hn_stat_pkts++;
2802 	txr->hn_stat_size += m_head->m_pkthdr.len;
2803 	if (m_head->m_flags & M_MCAST)
2804 		txr->hn_stat_mcasts++;
2805 
2806 	return 0;
2807 }
2808 
2809 /*
2810  * NOTE:
2811  * If this function fails, then txd will be freed, but the mbuf
2812  * associated w/ the txd will _not_ be freed.
2813  */
2814 static int
2815 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2816 {
2817 	int error, send_failed = 0, has_bpf;
2818 
2819 again:
2820 	has_bpf = bpf_peers_present(ifp->if_bpf);
2821 	if (has_bpf) {
2822 		/*
2823 		 * Make sure that this txd and any aggregated txds are not
2824 		 * freed before ETHER_BPF_MTAP.
2825 		 */
2826 		hn_txdesc_hold(txd);
2827 	}
2828 	error = txr->hn_sendpkt(txr, txd);
2829 	if (!error) {
2830 		if (has_bpf) {
2831 			const struct hn_txdesc *tmp_txd;
2832 
2833 			ETHER_BPF_MTAP(ifp, txd->m);
2834 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2835 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2836 		}
2837 
2838 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2839 #ifdef HN_IFSTART_SUPPORT
2840 		if (!hn_use_if_start)
2841 #endif
2842 		{
2843 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2844 			    txr->hn_stat_size);
2845 			if (txr->hn_stat_mcasts != 0) {
2846 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2847 				    txr->hn_stat_mcasts);
2848 			}
2849 		}
2850 		txr->hn_pkts += txr->hn_stat_pkts;
2851 		txr->hn_sends++;
2852 	}
2853 	if (has_bpf)
2854 		hn_txdesc_put(txr, txd);
2855 
2856 	if (__predict_false(error)) {
2857 		int freed;
2858 
2859 		/*
2860 		 * This should "really rarely" happen.
2861 		 *
2862 		 * XXX Too many RX to be acked or too many sideband
2863 		 * commands to run?  Ask netvsc_channel_rollup()
2864 		 * to kick start later.
2865 		 */
2866 		txr->hn_has_txeof = 1;
2867 		if (!send_failed) {
2868 			txr->hn_send_failed++;
2869 			send_failed = 1;
2870 			/*
2871 			 * Try sending again after set hn_has_txeof;
2872 			 * in case that we missed the last
2873 			 * netvsc_channel_rollup().
2874 			 */
2875 			goto again;
2876 		}
2877 		if_printf(ifp, "send failed\n");
2878 
2879 		/*
2880 		 * Caller will perform further processing on the
2881 		 * associated mbuf, so don't free it in hn_txdesc_put();
2882 		 * only unload it from the DMA map in hn_txdesc_put(),
2883 		 * if it was loaded.
2884 		 */
2885 		txd->m = NULL;
2886 		freed = hn_txdesc_put(txr, txd);
2887 		KASSERT(freed != 0,
2888 		    ("fail to free txd upon send error"));
2889 
2890 		txr->hn_send_failed++;
2891 	}
2892 
2893 	/* Reset temporary stats, after this sending is done. */
2894 	txr->hn_stat_size = 0;
2895 	txr->hn_stat_pkts = 0;
2896 	txr->hn_stat_mcasts = 0;
2897 
2898 	return (error);
2899 }
2900 
2901 /*
2902  * Append the specified data to the indicated mbuf chain,
2903  * Extend the mbuf chain if the new data does not fit in
2904  * existing space.
2905  *
2906  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2907  * There should be an equivalent in the kernel mbuf code,
2908  * but there does not appear to be one yet.
2909  *
2910  * Differs from m_append() in that additional mbufs are
2911  * allocated with cluster size MJUMPAGESIZE, and filled
2912  * accordingly.
2913  *
2914  * Return 1 if able to complete the job; otherwise 0.
2915  */
2916 static int
2917 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2918 {
2919 	struct mbuf *m, *n;
2920 	int remainder, space;
2921 
2922 	for (m = m0; m->m_next != NULL; m = m->m_next)
2923 		;
2924 	remainder = len;
2925 	space = M_TRAILINGSPACE(m);
2926 	if (space > 0) {
2927 		/*
2928 		 * Copy into available space.
2929 		 */
2930 		if (space > remainder)
2931 			space = remainder;
2932 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2933 		m->m_len += space;
2934 		cp += space;
2935 		remainder -= space;
2936 	}
2937 	while (remainder > 0) {
2938 		/*
2939 		 * Allocate a new mbuf; could check space
2940 		 * and allocate a cluster instead.
2941 		 */
2942 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2943 		if (n == NULL)
2944 			break;
2945 		n->m_len = min(MJUMPAGESIZE, remainder);
2946 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2947 		cp += n->m_len;
2948 		remainder -= n->m_len;
2949 		m->m_next = n;
2950 		m = n;
2951 	}
2952 	if (m0->m_flags & M_PKTHDR)
2953 		m0->m_pkthdr.len += len - remainder;
2954 
2955 	return (remainder == 0);
2956 }
2957 
2958 #if defined(INET) || defined(INET6)
2959 static __inline int
2960 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2961 {
2962 #if __FreeBSD_version >= 1100095
2963 	if (hn_lro_mbufq_depth) {
2964 		tcp_lro_queue_mbuf(lc, m);
2965 		return 0;
2966 	}
2967 #endif
2968 	return tcp_lro_rx(lc, m, 0);
2969 }
2970 #endif
2971 
2972 static int
2973 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2974     const struct hn_rxinfo *info)
2975 {
2976 	struct ifnet *ifp;
2977 	struct mbuf *m_new;
2978 	int size, do_lro = 0, do_csum = 1;
2979 	int hash_type;
2980 
2981 	/* If the VF is active, inject the packet through the VF */
2982 	ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
2983 
2984 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2985 		/*
2986 		 * NOTE:
2987 		 * See the NOTE of hn_rndis_init_fixat().  This
2988 		 * function can be reached, immediately after the
2989 		 * RNDIS is initialized but before the ifnet is
2990 		 * setup on the hn_attach() path; drop the unexpected
2991 		 * packets.
2992 		 */
2993 		return (0);
2994 	}
2995 
2996 	if (dlen <= MHLEN) {
2997 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2998 		if (m_new == NULL) {
2999 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
3000 			return (0);
3001 		}
3002 		memcpy(mtod(m_new, void *), data, dlen);
3003 		m_new->m_pkthdr.len = m_new->m_len = dlen;
3004 		rxr->hn_small_pkts++;
3005 	} else {
3006 		/*
3007 		 * Get an mbuf with a cluster.  For packets 2K or less,
3008 		 * get a standard 2K cluster.  For anything larger, get a
3009 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3010 		 * if looped around to the Hyper-V TX channel, so avoid them.
3011 		 */
3012 		size = MCLBYTES;
3013 		if (dlen > MCLBYTES) {
3014 			/* 4096 */
3015 			size = MJUMPAGESIZE;
3016 		}
3017 
3018 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3019 		if (m_new == NULL) {
3020 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
3021 			return (0);
3022 		}
3023 
3024 		hv_m_append(m_new, dlen, data);
3025 	}
3026 	m_new->m_pkthdr.rcvif = ifp;
3027 
3028 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
3029 		do_csum = 0;
3030 
3031 	/* receive side checksum offload */
3032 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3033 		/* IP csum offload */
3034 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3035 			m_new->m_pkthdr.csum_flags |=
3036 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3037 			rxr->hn_csum_ip++;
3038 		}
3039 
3040 		/* TCP/UDP csum offload */
3041 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3042 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3043 			m_new->m_pkthdr.csum_flags |=
3044 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3045 			m_new->m_pkthdr.csum_data = 0xffff;
3046 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3047 				rxr->hn_csum_tcp++;
3048 			else
3049 				rxr->hn_csum_udp++;
3050 		}
3051 
3052 		/*
3053 		 * XXX
3054 		 * As of this write (Oct 28th, 2016), host side will turn
3055 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3056 		 * the do_lro setting here is actually _not_ accurate.  We
3057 		 * depend on the RSS hash type check to reset do_lro.
3058 		 */
3059 		if ((info->csum_info &
3060 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3061 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3062 			do_lro = 1;
3063 	} else {
3064 		const struct ether_header *eh;
3065 		uint16_t etype;
3066 		int hoff;
3067 
3068 		hoff = sizeof(*eh);
3069 		if (m_new->m_len < hoff)
3070 			goto skip;
3071 		eh = mtod(m_new, struct ether_header *);
3072 		etype = ntohs(eh->ether_type);
3073 		if (etype == ETHERTYPE_VLAN) {
3074 			const struct ether_vlan_header *evl;
3075 
3076 			hoff = sizeof(*evl);
3077 			if (m_new->m_len < hoff)
3078 				goto skip;
3079 			evl = mtod(m_new, struct ether_vlan_header *);
3080 			etype = ntohs(evl->evl_proto);
3081 		}
3082 
3083 		if (etype == ETHERTYPE_IP) {
3084 			int pr;
3085 
3086 			pr = hn_check_iplen(m_new, hoff);
3087 			if (pr == IPPROTO_TCP) {
3088 				if (do_csum &&
3089 				    (rxr->hn_trust_hcsum &
3090 				     HN_TRUST_HCSUM_TCP)) {
3091 					rxr->hn_csum_trusted++;
3092 					m_new->m_pkthdr.csum_flags |=
3093 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3094 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3095 					m_new->m_pkthdr.csum_data = 0xffff;
3096 				}
3097 				do_lro = 1;
3098 			} else if (pr == IPPROTO_UDP) {
3099 				if (do_csum &&
3100 				    (rxr->hn_trust_hcsum &
3101 				     HN_TRUST_HCSUM_UDP)) {
3102 					rxr->hn_csum_trusted++;
3103 					m_new->m_pkthdr.csum_flags |=
3104 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3105 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3106 					m_new->m_pkthdr.csum_data = 0xffff;
3107 				}
3108 			} else if (pr != IPPROTO_DONE && do_csum &&
3109 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3110 				rxr->hn_csum_trusted++;
3111 				m_new->m_pkthdr.csum_flags |=
3112 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3113 			}
3114 		}
3115 	}
3116 skip:
3117 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3118 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3119 		    NDIS_VLAN_INFO_ID(info->vlan_info),
3120 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3121 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3122 		m_new->m_flags |= M_VLANTAG;
3123 	}
3124 
3125 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3126 		rxr->hn_rss_pkts++;
3127 		m_new->m_pkthdr.flowid = info->hash_value;
3128 		hash_type = M_HASHTYPE_OPAQUE_HASH;
3129 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3130 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3131 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
3132 
3133 			/*
3134 			 * NOTE:
3135 			 * do_lro is resetted, if the hash types are not TCP
3136 			 * related.  See the comment in the above csum_flags
3137 			 * setup section.
3138 			 */
3139 			switch (type) {
3140 			case NDIS_HASH_IPV4:
3141 				hash_type = M_HASHTYPE_RSS_IPV4;
3142 				do_lro = 0;
3143 				break;
3144 
3145 			case NDIS_HASH_TCP_IPV4:
3146 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3147 				break;
3148 
3149 			case NDIS_HASH_IPV6:
3150 				hash_type = M_HASHTYPE_RSS_IPV6;
3151 				do_lro = 0;
3152 				break;
3153 
3154 			case NDIS_HASH_IPV6_EX:
3155 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3156 				do_lro = 0;
3157 				break;
3158 
3159 			case NDIS_HASH_TCP_IPV6:
3160 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3161 				break;
3162 
3163 			case NDIS_HASH_TCP_IPV6_EX:
3164 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3165 				break;
3166 			}
3167 		}
3168 	} else {
3169 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3170 		hash_type = M_HASHTYPE_OPAQUE;
3171 	}
3172 	M_HASHTYPE_SET(m_new, hash_type);
3173 
3174 	/*
3175 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
3176 	 * messages (not just data messages) will trigger a response.
3177 	 */
3178 
3179 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3180 	rxr->hn_pkts++;
3181 
3182 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
3183 #if defined(INET) || defined(INET6)
3184 		struct lro_ctrl *lro = &rxr->hn_lro;
3185 
3186 		if (lro->lro_cnt) {
3187 			rxr->hn_lro_tried++;
3188 			if (hn_lro_rx(lro, m_new) == 0) {
3189 				/* DONE! */
3190 				return 0;
3191 			}
3192 		}
3193 #endif
3194 	}
3195 
3196 	/* We're not holding the lock here, so don't release it */
3197 	(*ifp->if_input)(ifp, m_new);
3198 
3199 	return (0);
3200 }
3201 
3202 static int
3203 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3204 {
3205 	struct hn_softc *sc = ifp->if_softc;
3206 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3207 	struct ifnet *vf_ifp;
3208 	int mask, error = 0;
3209 
3210 	switch (cmd) {
3211 	case SIOCSIFMTU:
3212 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3213 			error = EINVAL;
3214 			break;
3215 		}
3216 
3217 		HN_LOCK(sc);
3218 
3219 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3220 			HN_UNLOCK(sc);
3221 			break;
3222 		}
3223 
3224 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3225 			/* Can't change MTU */
3226 			HN_UNLOCK(sc);
3227 			error = EOPNOTSUPP;
3228 			break;
3229 		}
3230 
3231 		if (ifp->if_mtu == ifr->ifr_mtu) {
3232 			HN_UNLOCK(sc);
3233 			break;
3234 		}
3235 
3236 		if (hn_xpnt_vf_isready(sc)) {
3237 			vf_ifp = sc->hn_vf_ifp;
3238 			ifr_vf = *ifr;
3239 			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3240 			    sizeof(ifr_vf.ifr_name));
3241 			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3242 			    (caddr_t)&ifr_vf);
3243 			if (error) {
3244 				HN_UNLOCK(sc);
3245 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3246 				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3247 				break;
3248 			}
3249 		}
3250 
3251 		/*
3252 		 * Suspend this interface before the synthetic parts
3253 		 * are ripped.
3254 		 */
3255 		hn_suspend(sc);
3256 
3257 		/*
3258 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
3259 		 */
3260 		hn_synth_detach(sc);
3261 
3262 		/*
3263 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3264 		 * with the new MTU setting.
3265 		 */
3266 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3267 		if (error) {
3268 			HN_UNLOCK(sc);
3269 			break;
3270 		}
3271 
3272 		/*
3273 		 * Commit the requested MTU, after the synthetic parts
3274 		 * have been successfully attached.
3275 		 */
3276 		ifp->if_mtu = ifr->ifr_mtu;
3277 
3278 		/*
3279 		 * Synthetic parts' reattach may change the chimney
3280 		 * sending size; update it.
3281 		 */
3282 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3283 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3284 
3285 		/*
3286 		 * Make sure that various parameters based on MTU are
3287 		 * still valid, after the MTU change.
3288 		 */
3289 		hn_mtu_change_fixup(sc);
3290 
3291 		/*
3292 		 * All done!  Resume the interface now.
3293 		 */
3294 		hn_resume(sc);
3295 
3296 		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3297 			/*
3298 			 * Since we have reattached the NVS part,
3299 			 * change the datapath to VF again; in case
3300 			 * that it is lost, after the NVS was detached.
3301 			 */
3302 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3303 		}
3304 
3305 		HN_UNLOCK(sc);
3306 		break;
3307 
3308 	case SIOCSIFFLAGS:
3309 		HN_LOCK(sc);
3310 
3311 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3312 			HN_UNLOCK(sc);
3313 			break;
3314 		}
3315 
3316 		if (hn_xpnt_vf_isready(sc))
3317 			hn_xpnt_vf_saveifflags(sc);
3318 
3319 		if (ifp->if_flags & IFF_UP) {
3320 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3321 				/*
3322 				 * Caller meight hold mutex, e.g.
3323 				 * bpf; use busy-wait for the RNDIS
3324 				 * reply.
3325 				 */
3326 				HN_NO_SLEEPING(sc);
3327 				hn_rxfilter_config(sc);
3328 				HN_SLEEPING_OK(sc);
3329 
3330 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3331 					error = hn_xpnt_vf_iocsetflags(sc);
3332 			} else {
3333 				hn_init_locked(sc);
3334 			}
3335 		} else {
3336 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3337 				hn_stop(sc, false);
3338 		}
3339 		sc->hn_if_flags = ifp->if_flags;
3340 
3341 		HN_UNLOCK(sc);
3342 		break;
3343 
3344 	case SIOCSIFCAP:
3345 		HN_LOCK(sc);
3346 
3347 		if (hn_xpnt_vf_isready(sc)) {
3348 			ifr_vf = *ifr;
3349 			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3350 			    sizeof(ifr_vf.ifr_name));
3351 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3352 			HN_UNLOCK(sc);
3353 			break;
3354 		}
3355 
3356 		/*
3357 		 * Fix up requested capabilities w/ supported capabilities,
3358 		 * since the supported capabilities could have been changed.
3359 		 */
3360 		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3361 		    ifp->if_capenable;
3362 
3363 		if (mask & IFCAP_TXCSUM) {
3364 			ifp->if_capenable ^= IFCAP_TXCSUM;
3365 			if (ifp->if_capenable & IFCAP_TXCSUM)
3366 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3367 			else
3368 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3369 		}
3370 		if (mask & IFCAP_TXCSUM_IPV6) {
3371 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3372 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3373 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3374 			else
3375 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3376 		}
3377 
3378 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3379 		if (mask & IFCAP_RXCSUM)
3380 			ifp->if_capenable ^= IFCAP_RXCSUM;
3381 #ifdef foo
3382 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3383 		if (mask & IFCAP_RXCSUM_IPV6)
3384 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3385 #endif
3386 
3387 		if (mask & IFCAP_LRO)
3388 			ifp->if_capenable ^= IFCAP_LRO;
3389 
3390 		if (mask & IFCAP_TSO4) {
3391 			ifp->if_capenable ^= IFCAP_TSO4;
3392 			if (ifp->if_capenable & IFCAP_TSO4)
3393 				ifp->if_hwassist |= CSUM_IP_TSO;
3394 			else
3395 				ifp->if_hwassist &= ~CSUM_IP_TSO;
3396 		}
3397 		if (mask & IFCAP_TSO6) {
3398 			ifp->if_capenable ^= IFCAP_TSO6;
3399 			if (ifp->if_capenable & IFCAP_TSO6)
3400 				ifp->if_hwassist |= CSUM_IP6_TSO;
3401 			else
3402 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3403 		}
3404 
3405 		HN_UNLOCK(sc);
3406 		break;
3407 
3408 	case SIOCADDMULTI:
3409 	case SIOCDELMULTI:
3410 		HN_LOCK(sc);
3411 
3412 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3413 			HN_UNLOCK(sc);
3414 			break;
3415 		}
3416 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3417 			/*
3418 			 * Multicast uses mutex; use busy-wait for
3419 			 * the RNDIS reply.
3420 			 */
3421 			HN_NO_SLEEPING(sc);
3422 			hn_rxfilter_config(sc);
3423 			HN_SLEEPING_OK(sc);
3424 		}
3425 
3426 		/* XXX vlan(4) style mcast addr maintenance */
3427 		if (hn_xpnt_vf_isready(sc)) {
3428 			int old_if_flags;
3429 
3430 			old_if_flags = sc->hn_vf_ifp->if_flags;
3431 			hn_xpnt_vf_saveifflags(sc);
3432 
3433 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3434 			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3435 			     IFF_ALLMULTI))
3436 				error = hn_xpnt_vf_iocsetflags(sc);
3437 		}
3438 
3439 		HN_UNLOCK(sc);
3440 		break;
3441 
3442 	case SIOCSIFMEDIA:
3443 	case SIOCGIFMEDIA:
3444 		HN_LOCK(sc);
3445 		if (hn_xpnt_vf_isready(sc)) {
3446 			/*
3447 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3448 			 * create and pass ifr_vf to the VF here; just
3449 			 * replace the ifr_name.
3450 			 */
3451 			vf_ifp = sc->hn_vf_ifp;
3452 			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3453 			    sizeof(ifr->ifr_name));
3454 			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3455 			/* Restore the ifr_name. */
3456 			strlcpy(ifr->ifr_name, ifp->if_xname,
3457 			    sizeof(ifr->ifr_name));
3458 			HN_UNLOCK(sc);
3459 			break;
3460 		}
3461 		HN_UNLOCK(sc);
3462 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3463 		break;
3464 
3465 	default:
3466 		error = ether_ioctl(ifp, cmd, data);
3467 		break;
3468 	}
3469 	return (error);
3470 }
3471 
3472 static void
3473 hn_stop(struct hn_softc *sc, bool detaching)
3474 {
3475 	struct ifnet *ifp = sc->hn_ifp;
3476 	int i;
3477 
3478 	HN_LOCK_ASSERT(sc);
3479 
3480 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3481 	    ("synthetic parts were not attached"));
3482 
3483 	/* Clear RUNNING bit ASAP. */
3484 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3485 
3486 	/* Disable polling. */
3487 	hn_polling(sc, 0);
3488 
3489 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3490 		KASSERT(sc->hn_vf_ifp != NULL,
3491 		    ("%s: VF is not attached", ifp->if_xname));
3492 
3493 		/* NOTE: hn_vf_lock for hn_transmit() */
3494 		rm_wlock(&sc->hn_vf_lock);
3495 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
3496 		rm_wunlock(&sc->hn_vf_lock);
3497 
3498 		/*
3499 		 * NOTE:
3500 		 * Datapath setting must happen _before_ bringing
3501 		 * the VF down.
3502 		 */
3503 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3504 
3505 		/*
3506 		 * Bring the VF down.
3507 		 */
3508 		hn_xpnt_vf_saveifflags(sc);
3509 		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3510 		hn_xpnt_vf_iocsetflags(sc);
3511 	}
3512 
3513 	/* Suspend data transfers. */
3514 	hn_suspend_data(sc);
3515 
3516 	/* Clear OACTIVE bit. */
3517 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3518 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3519 		sc->hn_tx_ring[i].hn_oactive = 0;
3520 
3521 	/*
3522 	 * If the non-transparent mode VF is active, make sure
3523 	 * that the RX filter still allows packet reception.
3524 	 */
3525 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3526 		hn_rxfilter_config(sc);
3527 }
3528 
3529 static void
3530 hn_init_locked(struct hn_softc *sc)
3531 {
3532 	struct ifnet *ifp = sc->hn_ifp;
3533 	int i;
3534 
3535 	HN_LOCK_ASSERT(sc);
3536 
3537 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3538 		return;
3539 
3540 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3541 		return;
3542 
3543 	/* Configure RX filter */
3544 	hn_rxfilter_config(sc);
3545 
3546 	/* Clear OACTIVE bit. */
3547 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3548 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3549 		sc->hn_tx_ring[i].hn_oactive = 0;
3550 
3551 	/* Clear TX 'suspended' bit. */
3552 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3553 
3554 	if (hn_xpnt_vf_isready(sc)) {
3555 		/* Initialize transparent VF. */
3556 		hn_xpnt_vf_init(sc);
3557 	}
3558 
3559 	/* Everything is ready; unleash! */
3560 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3561 
3562 	/* Re-enable polling if requested. */
3563 	if (sc->hn_pollhz > 0)
3564 		hn_polling(sc, sc->hn_pollhz);
3565 }
3566 
3567 static void
3568 hn_init(void *xsc)
3569 {
3570 	struct hn_softc *sc = xsc;
3571 
3572 	HN_LOCK(sc);
3573 	hn_init_locked(sc);
3574 	HN_UNLOCK(sc);
3575 }
3576 
3577 #if __FreeBSD_version >= 1100099
3578 
3579 static int
3580 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3581 {
3582 	struct hn_softc *sc = arg1;
3583 	unsigned int lenlim;
3584 	int error;
3585 
3586 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
3587 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
3588 	if (error || req->newptr == NULL)
3589 		return error;
3590 
3591 	HN_LOCK(sc);
3592 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
3593 	    lenlim > TCP_LRO_LENGTH_MAX) {
3594 		HN_UNLOCK(sc);
3595 		return EINVAL;
3596 	}
3597 	hn_set_lro_lenlim(sc, lenlim);
3598 	HN_UNLOCK(sc);
3599 
3600 	return 0;
3601 }
3602 
3603 static int
3604 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
3605 {
3606 	struct hn_softc *sc = arg1;
3607 	int ackcnt, error, i;
3608 
3609 	/*
3610 	 * lro_ackcnt_lim is append count limit,
3611 	 * +1 to turn it into aggregation limit.
3612 	 */
3613 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3614 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3615 	if (error || req->newptr == NULL)
3616 		return error;
3617 
3618 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3619 		return EINVAL;
3620 
3621 	/*
3622 	 * Convert aggregation limit back to append
3623 	 * count limit.
3624 	 */
3625 	--ackcnt;
3626 	HN_LOCK(sc);
3627 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3628 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
3629 	HN_UNLOCK(sc);
3630 	return 0;
3631 }
3632 
3633 #endif
3634 
3635 static int
3636 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3637 {
3638 	struct hn_softc *sc = arg1;
3639 	int hcsum = arg2;
3640 	int on, error, i;
3641 
3642 	on = 0;
3643 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3644 		on = 1;
3645 
3646 	error = sysctl_handle_int(oidp, &on, 0, req);
3647 	if (error || req->newptr == NULL)
3648 		return error;
3649 
3650 	HN_LOCK(sc);
3651 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3652 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3653 
3654 		if (on)
3655 			rxr->hn_trust_hcsum |= hcsum;
3656 		else
3657 			rxr->hn_trust_hcsum &= ~hcsum;
3658 	}
3659 	HN_UNLOCK(sc);
3660 	return 0;
3661 }
3662 
3663 static int
3664 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3665 {
3666 	struct hn_softc *sc = arg1;
3667 	int chim_size, error;
3668 
3669 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
3670 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
3671 	if (error || req->newptr == NULL)
3672 		return error;
3673 
3674 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3675 		return EINVAL;
3676 
3677 	HN_LOCK(sc);
3678 	hn_set_chim_size(sc, chim_size);
3679 	HN_UNLOCK(sc);
3680 	return 0;
3681 }
3682 
3683 #if __FreeBSD_version < 1100095
3684 static int
3685 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3686 {
3687 	struct hn_softc *sc = arg1;
3688 	int ofs = arg2, i, error;
3689 	struct hn_rx_ring *rxr;
3690 	uint64_t stat;
3691 
3692 	stat = 0;
3693 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3694 		rxr = &sc->hn_rx_ring[i];
3695 		stat += *((int *)((uint8_t *)rxr + ofs));
3696 	}
3697 
3698 	error = sysctl_handle_64(oidp, &stat, 0, req);
3699 	if (error || req->newptr == NULL)
3700 		return error;
3701 
3702 	/* Zero out this stat. */
3703 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3704 		rxr = &sc->hn_rx_ring[i];
3705 		*((int *)((uint8_t *)rxr + ofs)) = 0;
3706 	}
3707 	return 0;
3708 }
3709 #else
3710 static int
3711 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3712 {
3713 	struct hn_softc *sc = arg1;
3714 	int ofs = arg2, i, error;
3715 	struct hn_rx_ring *rxr;
3716 	uint64_t stat;
3717 
3718 	stat = 0;
3719 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3720 		rxr = &sc->hn_rx_ring[i];
3721 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3722 	}
3723 
3724 	error = sysctl_handle_64(oidp, &stat, 0, req);
3725 	if (error || req->newptr == NULL)
3726 		return error;
3727 
3728 	/* Zero out this stat. */
3729 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3730 		rxr = &sc->hn_rx_ring[i];
3731 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3732 	}
3733 	return 0;
3734 }
3735 
3736 #endif
3737 
3738 static int
3739 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3740 {
3741 	struct hn_softc *sc = arg1;
3742 	int ofs = arg2, i, error;
3743 	struct hn_rx_ring *rxr;
3744 	u_long stat;
3745 
3746 	stat = 0;
3747 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3748 		rxr = &sc->hn_rx_ring[i];
3749 		stat += *((u_long *)((uint8_t *)rxr + ofs));
3750 	}
3751 
3752 	error = sysctl_handle_long(oidp, &stat, 0, req);
3753 	if (error || req->newptr == NULL)
3754 		return error;
3755 
3756 	/* Zero out this stat. */
3757 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3758 		rxr = &sc->hn_rx_ring[i];
3759 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
3760 	}
3761 	return 0;
3762 }
3763 
3764 static int
3765 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3766 {
3767 	struct hn_softc *sc = arg1;
3768 	int ofs = arg2, i, error;
3769 	struct hn_tx_ring *txr;
3770 	u_long stat;
3771 
3772 	stat = 0;
3773 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3774 		txr = &sc->hn_tx_ring[i];
3775 		stat += *((u_long *)((uint8_t *)txr + ofs));
3776 	}
3777 
3778 	error = sysctl_handle_long(oidp, &stat, 0, req);
3779 	if (error || req->newptr == NULL)
3780 		return error;
3781 
3782 	/* Zero out this stat. */
3783 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3784 		txr = &sc->hn_tx_ring[i];
3785 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
3786 	}
3787 	return 0;
3788 }
3789 
3790 static int
3791 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3792 {
3793 	struct hn_softc *sc = arg1;
3794 	int ofs = arg2, i, error, conf;
3795 	struct hn_tx_ring *txr;
3796 
3797 	txr = &sc->hn_tx_ring[0];
3798 	conf = *((int *)((uint8_t *)txr + ofs));
3799 
3800 	error = sysctl_handle_int(oidp, &conf, 0, req);
3801 	if (error || req->newptr == NULL)
3802 		return error;
3803 
3804 	HN_LOCK(sc);
3805 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3806 		txr = &sc->hn_tx_ring[i];
3807 		*((int *)((uint8_t *)txr + ofs)) = conf;
3808 	}
3809 	HN_UNLOCK(sc);
3810 
3811 	return 0;
3812 }
3813 
3814 static int
3815 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3816 {
3817 	struct hn_softc *sc = arg1;
3818 	int error, size;
3819 
3820 	size = sc->hn_agg_size;
3821 	error = sysctl_handle_int(oidp, &size, 0, req);
3822 	if (error || req->newptr == NULL)
3823 		return (error);
3824 
3825 	HN_LOCK(sc);
3826 	sc->hn_agg_size = size;
3827 	hn_set_txagg(sc);
3828 	HN_UNLOCK(sc);
3829 
3830 	return (0);
3831 }
3832 
3833 static int
3834 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3835 {
3836 	struct hn_softc *sc = arg1;
3837 	int error, pkts;
3838 
3839 	pkts = sc->hn_agg_pkts;
3840 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3841 	if (error || req->newptr == NULL)
3842 		return (error);
3843 
3844 	HN_LOCK(sc);
3845 	sc->hn_agg_pkts = pkts;
3846 	hn_set_txagg(sc);
3847 	HN_UNLOCK(sc);
3848 
3849 	return (0);
3850 }
3851 
3852 static int
3853 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3854 {
3855 	struct hn_softc *sc = arg1;
3856 	int pkts;
3857 
3858 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3859 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3860 }
3861 
3862 static int
3863 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3864 {
3865 	struct hn_softc *sc = arg1;
3866 	int align;
3867 
3868 	align = sc->hn_tx_ring[0].hn_agg_align;
3869 	return (sysctl_handle_int(oidp, &align, 0, req));
3870 }
3871 
3872 static void
3873 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3874 {
3875 	if (pollhz == 0)
3876 		vmbus_chan_poll_disable(chan);
3877 	else
3878 		vmbus_chan_poll_enable(chan, pollhz);
3879 }
3880 
3881 static void
3882 hn_polling(struct hn_softc *sc, u_int pollhz)
3883 {
3884 	int nsubch = sc->hn_rx_ring_inuse - 1;
3885 
3886 	HN_LOCK_ASSERT(sc);
3887 
3888 	if (nsubch > 0) {
3889 		struct vmbus_channel **subch;
3890 		int i;
3891 
3892 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3893 		for (i = 0; i < nsubch; ++i)
3894 			hn_chan_polling(subch[i], pollhz);
3895 		vmbus_subchan_rel(subch, nsubch);
3896 	}
3897 	hn_chan_polling(sc->hn_prichan, pollhz);
3898 }
3899 
3900 static int
3901 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3902 {
3903 	struct hn_softc *sc = arg1;
3904 	int pollhz, error;
3905 
3906 	pollhz = sc->hn_pollhz;
3907 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3908 	if (error || req->newptr == NULL)
3909 		return (error);
3910 
3911 	if (pollhz != 0 &&
3912 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3913 		return (EINVAL);
3914 
3915 	HN_LOCK(sc);
3916 	if (sc->hn_pollhz != pollhz) {
3917 		sc->hn_pollhz = pollhz;
3918 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3919 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3920 			hn_polling(sc, sc->hn_pollhz);
3921 	}
3922 	HN_UNLOCK(sc);
3923 
3924 	return (0);
3925 }
3926 
3927 static int
3928 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3929 {
3930 	struct hn_softc *sc = arg1;
3931 	char verstr[16];
3932 
3933 	snprintf(verstr, sizeof(verstr), "%u.%u",
3934 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3935 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3936 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3937 }
3938 
3939 static int
3940 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3941 {
3942 	struct hn_softc *sc = arg1;
3943 	char caps_str[128];
3944 	uint32_t caps;
3945 
3946 	HN_LOCK(sc);
3947 	caps = sc->hn_caps;
3948 	HN_UNLOCK(sc);
3949 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3950 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3951 }
3952 
3953 static int
3954 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3955 {
3956 	struct hn_softc *sc = arg1;
3957 	char assist_str[128];
3958 	uint32_t hwassist;
3959 
3960 	HN_LOCK(sc);
3961 	hwassist = sc->hn_ifp->if_hwassist;
3962 	HN_UNLOCK(sc);
3963 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3964 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3965 }
3966 
3967 static int
3968 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3969 {
3970 	struct hn_softc *sc = arg1;
3971 	char filter_str[128];
3972 	uint32_t filter;
3973 
3974 	HN_LOCK(sc);
3975 	filter = sc->hn_rx_filter;
3976 	HN_UNLOCK(sc);
3977 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3978 	    NDIS_PACKET_TYPES);
3979 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3980 }
3981 
3982 #ifndef RSS
3983 
3984 static int
3985 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3986 {
3987 	struct hn_softc *sc = arg1;
3988 	int error;
3989 
3990 	HN_LOCK(sc);
3991 
3992 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3993 	if (error || req->newptr == NULL)
3994 		goto back;
3995 
3996 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3997 	if (error)
3998 		goto back;
3999 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4000 
4001 	if (sc->hn_rx_ring_inuse > 1) {
4002 		error = hn_rss_reconfig(sc);
4003 	} else {
4004 		/* Not RSS capable, at least for now; just save the RSS key. */
4005 		error = 0;
4006 	}
4007 back:
4008 	HN_UNLOCK(sc);
4009 	return (error);
4010 }
4011 
4012 static int
4013 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4014 {
4015 	struct hn_softc *sc = arg1;
4016 	int error;
4017 
4018 	HN_LOCK(sc);
4019 
4020 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4021 	if (error || req->newptr == NULL)
4022 		goto back;
4023 
4024 	/*
4025 	 * Don't allow RSS indirect table change, if this interface is not
4026 	 * RSS capable currently.
4027 	 */
4028 	if (sc->hn_rx_ring_inuse == 1) {
4029 		error = EOPNOTSUPP;
4030 		goto back;
4031 	}
4032 
4033 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4034 	if (error)
4035 		goto back;
4036 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4037 
4038 	hn_rss_ind_fixup(sc);
4039 	error = hn_rss_reconfig(sc);
4040 back:
4041 	HN_UNLOCK(sc);
4042 	return (error);
4043 }
4044 
4045 #endif	/* !RSS */
4046 
4047 static int
4048 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4049 {
4050 	struct hn_softc *sc = arg1;
4051 	char hash_str[128];
4052 	uint32_t hash;
4053 
4054 	HN_LOCK(sc);
4055 	hash = sc->hn_rss_hash;
4056 	HN_UNLOCK(sc);
4057 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4058 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4059 }
4060 
4061 static int
4062 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4063 {
4064 	struct hn_softc *sc = arg1;
4065 	char vf_name[IFNAMSIZ + 1];
4066 	struct ifnet *vf_ifp;
4067 
4068 	HN_LOCK(sc);
4069 	vf_name[0] = '\0';
4070 	vf_ifp = sc->hn_vf_ifp;
4071 	if (vf_ifp != NULL)
4072 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4073 	HN_UNLOCK(sc);
4074 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4075 }
4076 
4077 static int
4078 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4079 {
4080 	struct hn_softc *sc = arg1;
4081 	char vf_name[IFNAMSIZ + 1];
4082 	struct ifnet *vf_ifp;
4083 
4084 	HN_LOCK(sc);
4085 	vf_name[0] = '\0';
4086 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4087 	if (vf_ifp != NULL)
4088 		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4089 	HN_UNLOCK(sc);
4090 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4091 }
4092 
4093 static int
4094 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4095 {
4096 	struct rm_priotracker pt;
4097 	struct sbuf *sb;
4098 	int error, i;
4099 	bool first;
4100 
4101 	error = sysctl_wire_old_buffer(req, 0);
4102 	if (error != 0)
4103 		return (error);
4104 
4105 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4106 	if (sb == NULL)
4107 		return (ENOMEM);
4108 
4109 	rm_rlock(&hn_vfmap_lock, &pt);
4110 
4111 	first = true;
4112 	for (i = 0; i < hn_vfmap_size; ++i) {
4113 		struct ifnet *ifp;
4114 
4115 		if (hn_vfmap[i] == NULL)
4116 			continue;
4117 
4118 		ifp = ifnet_byindex(i);
4119 		if (ifp != NULL) {
4120 			if (first)
4121 				sbuf_printf(sb, "%s", ifp->if_xname);
4122 			else
4123 				sbuf_printf(sb, " %s", ifp->if_xname);
4124 			first = false;
4125 		}
4126 	}
4127 
4128 	rm_runlock(&hn_vfmap_lock, &pt);
4129 
4130 	error = sbuf_finish(sb);
4131 	sbuf_delete(sb);
4132 	return (error);
4133 }
4134 
4135 static int
4136 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4137 {
4138 	struct rm_priotracker pt;
4139 	struct sbuf *sb;
4140 	int error, i;
4141 	bool first;
4142 
4143 	error = sysctl_wire_old_buffer(req, 0);
4144 	if (error != 0)
4145 		return (error);
4146 
4147 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4148 	if (sb == NULL)
4149 		return (ENOMEM);
4150 
4151 	rm_rlock(&hn_vfmap_lock, &pt);
4152 
4153 	first = true;
4154 	for (i = 0; i < hn_vfmap_size; ++i) {
4155 		struct ifnet *ifp, *hn_ifp;
4156 
4157 		hn_ifp = hn_vfmap[i];
4158 		if (hn_ifp == NULL)
4159 			continue;
4160 
4161 		ifp = ifnet_byindex(i);
4162 		if (ifp != NULL) {
4163 			if (first) {
4164 				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4165 				    hn_ifp->if_xname);
4166 			} else {
4167 				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4168 				    hn_ifp->if_xname);
4169 			}
4170 			first = false;
4171 		}
4172 	}
4173 
4174 	rm_runlock(&hn_vfmap_lock, &pt);
4175 
4176 	error = sbuf_finish(sb);
4177 	sbuf_delete(sb);
4178 	return (error);
4179 }
4180 
4181 static int
4182 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4183 {
4184 	struct hn_softc *sc = arg1;
4185 	int error, onoff = 0;
4186 
4187 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4188 		onoff = 1;
4189 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4190 	if (error || req->newptr == NULL)
4191 		return (error);
4192 
4193 	HN_LOCK(sc);
4194 	/* NOTE: hn_vf_lock for hn_transmit() */
4195 	rm_wlock(&sc->hn_vf_lock);
4196 	if (onoff)
4197 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4198 	else
4199 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4200 	rm_wunlock(&sc->hn_vf_lock);
4201 	HN_UNLOCK(sc);
4202 
4203 	return (0);
4204 }
4205 
4206 static int
4207 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4208 {
4209 	struct hn_softc *sc = arg1;
4210 	int enabled = 0;
4211 
4212 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4213 		enabled = 1;
4214 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4215 }
4216 
4217 static int
4218 hn_check_iplen(const struct mbuf *m, int hoff)
4219 {
4220 	const struct ip *ip;
4221 	int len, iphlen, iplen;
4222 	const struct tcphdr *th;
4223 	int thoff;				/* TCP data offset */
4224 
4225 	len = hoff + sizeof(struct ip);
4226 
4227 	/* The packet must be at least the size of an IP header. */
4228 	if (m->m_pkthdr.len < len)
4229 		return IPPROTO_DONE;
4230 
4231 	/* The fixed IP header must reside completely in the first mbuf. */
4232 	if (m->m_len < len)
4233 		return IPPROTO_DONE;
4234 
4235 	ip = mtodo(m, hoff);
4236 
4237 	/* Bound check the packet's stated IP header length. */
4238 	iphlen = ip->ip_hl << 2;
4239 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4240 		return IPPROTO_DONE;
4241 
4242 	/* The full IP header must reside completely in the one mbuf. */
4243 	if (m->m_len < hoff + iphlen)
4244 		return IPPROTO_DONE;
4245 
4246 	iplen = ntohs(ip->ip_len);
4247 
4248 	/*
4249 	 * Check that the amount of data in the buffers is as
4250 	 * at least much as the IP header would have us expect.
4251 	 */
4252 	if (m->m_pkthdr.len < hoff + iplen)
4253 		return IPPROTO_DONE;
4254 
4255 	/*
4256 	 * Ignore IP fragments.
4257 	 */
4258 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4259 		return IPPROTO_DONE;
4260 
4261 	/*
4262 	 * The TCP/IP or UDP/IP header must be entirely contained within
4263 	 * the first fragment of a packet.
4264 	 */
4265 	switch (ip->ip_p) {
4266 	case IPPROTO_TCP:
4267 		if (iplen < iphlen + sizeof(struct tcphdr))
4268 			return IPPROTO_DONE;
4269 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4270 			return IPPROTO_DONE;
4271 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4272 		thoff = th->th_off << 2;
4273 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4274 			return IPPROTO_DONE;
4275 		if (m->m_len < hoff + iphlen + thoff)
4276 			return IPPROTO_DONE;
4277 		break;
4278 	case IPPROTO_UDP:
4279 		if (iplen < iphlen + sizeof(struct udphdr))
4280 			return IPPROTO_DONE;
4281 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4282 			return IPPROTO_DONE;
4283 		break;
4284 	default:
4285 		if (iplen < iphlen)
4286 			return IPPROTO_DONE;
4287 		break;
4288 	}
4289 	return ip->ip_p;
4290 }
4291 
4292 static int
4293 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4294 {
4295 	struct sysctl_oid_list *child;
4296 	struct sysctl_ctx_list *ctx;
4297 	device_t dev = sc->hn_dev;
4298 #if defined(INET) || defined(INET6)
4299 #if __FreeBSD_version >= 1100095
4300 	int lroent_cnt;
4301 #endif
4302 #endif
4303 	int i;
4304 
4305 	/*
4306 	 * Create RXBUF for reception.
4307 	 *
4308 	 * NOTE:
4309 	 * - It is shared by all channels.
4310 	 * - A large enough buffer is allocated, certain version of NVSes
4311 	 *   may further limit the usable space.
4312 	 */
4313 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4314 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4315 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4316 	if (sc->hn_rxbuf == NULL) {
4317 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4318 		return (ENOMEM);
4319 	}
4320 
4321 	sc->hn_rx_ring_cnt = ring_cnt;
4322 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4323 
4324 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4325 	    M_DEVBUF, M_WAITOK | M_ZERO);
4326 
4327 #if defined(INET) || defined(INET6)
4328 #if __FreeBSD_version >= 1100095
4329 	lroent_cnt = hn_lro_entry_count;
4330 	if (lroent_cnt < TCP_LRO_ENTRIES)
4331 		lroent_cnt = TCP_LRO_ENTRIES;
4332 	if (bootverbose)
4333 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4334 #endif
4335 #endif	/* INET || INET6 */
4336 
4337 	ctx = device_get_sysctl_ctx(dev);
4338 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4339 
4340 	/* Create dev.hn.UNIT.rx sysctl tree */
4341 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4342 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4343 
4344 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4345 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4346 
4347 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4348 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4349 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4350 		if (rxr->hn_br == NULL) {
4351 			device_printf(dev, "allocate bufring failed\n");
4352 			return (ENOMEM);
4353 		}
4354 
4355 		if (hn_trust_hosttcp)
4356 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4357 		if (hn_trust_hostudp)
4358 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4359 		if (hn_trust_hostip)
4360 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4361 		rxr->hn_ifp = sc->hn_ifp;
4362 		if (i < sc->hn_tx_ring_cnt)
4363 			rxr->hn_txr = &sc->hn_tx_ring[i];
4364 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4365 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4366 		rxr->hn_rx_idx = i;
4367 		rxr->hn_rxbuf = sc->hn_rxbuf;
4368 
4369 		/*
4370 		 * Initialize LRO.
4371 		 */
4372 #if defined(INET) || defined(INET6)
4373 #if __FreeBSD_version >= 1100095
4374 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4375 		    hn_lro_mbufq_depth);
4376 #else
4377 		tcp_lro_init(&rxr->hn_lro);
4378 		rxr->hn_lro.ifp = sc->hn_ifp;
4379 #endif
4380 #if __FreeBSD_version >= 1100099
4381 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4382 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4383 #endif
4384 #endif	/* INET || INET6 */
4385 
4386 		if (sc->hn_rx_sysctl_tree != NULL) {
4387 			char name[16];
4388 
4389 			/*
4390 			 * Create per RX ring sysctl tree:
4391 			 * dev.hn.UNIT.rx.RINGID
4392 			 */
4393 			snprintf(name, sizeof(name), "%d", i);
4394 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4395 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4396 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4397 
4398 			if (rxr->hn_rx_sysctl_tree != NULL) {
4399 				SYSCTL_ADD_ULONG(ctx,
4400 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4401 				    OID_AUTO, "packets", CTLFLAG_RW,
4402 				    &rxr->hn_pkts, "# of packets received");
4403 				SYSCTL_ADD_ULONG(ctx,
4404 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4405 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
4406 				    &rxr->hn_rss_pkts,
4407 				    "# of packets w/ RSS info received");
4408 				SYSCTL_ADD_INT(ctx,
4409 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4410 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4411 				    &rxr->hn_pktbuf_len, 0,
4412 				    "Temporary channel packet buffer length");
4413 			}
4414 		}
4415 	}
4416 
4417 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4418 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4419 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4420 #if __FreeBSD_version < 1100095
4421 	    hn_rx_stat_int_sysctl,
4422 #else
4423 	    hn_rx_stat_u64_sysctl,
4424 #endif
4425 	    "LU", "LRO queued");
4426 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4427 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4428 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4429 #if __FreeBSD_version < 1100095
4430 	    hn_rx_stat_int_sysctl,
4431 #else
4432 	    hn_rx_stat_u64_sysctl,
4433 #endif
4434 	    "LU", "LRO flushed");
4435 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4436 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4437 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
4438 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4439 #if __FreeBSD_version >= 1100099
4440 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4441 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4442 	    hn_lro_lenlim_sysctl, "IU",
4443 	    "Max # of data bytes to be aggregated by LRO");
4444 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4445 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4446 	    hn_lro_ackcnt_sysctl, "I",
4447 	    "Max # of ACKs to be aggregated by LRO");
4448 #endif
4449 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4450 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4451 	    hn_trust_hcsum_sysctl, "I",
4452 	    "Trust tcp segement verification on host side, "
4453 	    "when csum info is missing");
4454 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4455 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4456 	    hn_trust_hcsum_sysctl, "I",
4457 	    "Trust udp datagram verification on host side, "
4458 	    "when csum info is missing");
4459 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4460 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4461 	    hn_trust_hcsum_sysctl, "I",
4462 	    "Trust ip packet verification on host side, "
4463 	    "when csum info is missing");
4464 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4465 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4466 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
4467 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4468 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4469 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4470 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
4471 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4472 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4473 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4474 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
4475 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4476 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4477 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4478 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
4479 	    hn_rx_stat_ulong_sysctl, "LU",
4480 	    "# of packets that we trust host's csum verification");
4481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4482 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4483 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
4484 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4485 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4486 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4487 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
4488 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4489 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4490 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4491 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4492 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4493 
4494 	return (0);
4495 }
4496 
4497 static void
4498 hn_destroy_rx_data(struct hn_softc *sc)
4499 {
4500 	int i;
4501 
4502 	if (sc->hn_rxbuf != NULL) {
4503 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4504 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4505 		else
4506 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
4507 		sc->hn_rxbuf = NULL;
4508 	}
4509 
4510 	if (sc->hn_rx_ring_cnt == 0)
4511 		return;
4512 
4513 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4514 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4515 
4516 		if (rxr->hn_br == NULL)
4517 			continue;
4518 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4519 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4520 		} else {
4521 			device_printf(sc->hn_dev,
4522 			    "%dth channel bufring is referenced", i);
4523 		}
4524 		rxr->hn_br = NULL;
4525 
4526 #if defined(INET) || defined(INET6)
4527 		tcp_lro_free(&rxr->hn_lro);
4528 #endif
4529 		free(rxr->hn_pktbuf, M_DEVBUF);
4530 	}
4531 	free(sc->hn_rx_ring, M_DEVBUF);
4532 	sc->hn_rx_ring = NULL;
4533 
4534 	sc->hn_rx_ring_cnt = 0;
4535 	sc->hn_rx_ring_inuse = 0;
4536 }
4537 
4538 static int
4539 hn_tx_ring_create(struct hn_softc *sc, int id)
4540 {
4541 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4542 	device_t dev = sc->hn_dev;
4543 	bus_dma_tag_t parent_dtag;
4544 	int error, i;
4545 
4546 	txr->hn_sc = sc;
4547 	txr->hn_tx_idx = id;
4548 
4549 #ifndef HN_USE_TXDESC_BUFRING
4550 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
4551 #endif
4552 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
4553 
4554 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
4555 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
4556 	    M_DEVBUF, M_WAITOK | M_ZERO);
4557 #ifndef HN_USE_TXDESC_BUFRING
4558 	SLIST_INIT(&txr->hn_txlist);
4559 #else
4560 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
4561 	    M_WAITOK, &txr->hn_tx_lock);
4562 #endif
4563 
4564 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
4565 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
4566 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
4567 	} else {
4568 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
4569 	}
4570 
4571 #ifdef HN_IFSTART_SUPPORT
4572 	if (hn_use_if_start) {
4573 		txr->hn_txeof = hn_start_txeof;
4574 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
4575 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
4576 	} else
4577 #endif
4578 	{
4579 		int br_depth;
4580 
4581 		txr->hn_txeof = hn_xmit_txeof;
4582 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
4583 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
4584 
4585 		br_depth = hn_get_txswq_depth(txr);
4586 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
4587 		    M_WAITOK, &txr->hn_tx_lock);
4588 	}
4589 
4590 	txr->hn_direct_tx_size = hn_direct_tx_size;
4591 
4592 	/*
4593 	 * Always schedule transmission instead of trying to do direct
4594 	 * transmission.  This one gives the best performance so far.
4595 	 */
4596 	txr->hn_sched_tx = 1;
4597 
4598 	parent_dtag = bus_get_dma_tag(dev);
4599 
4600 	/* DMA tag for RNDIS packet messages. */
4601 	error = bus_dma_tag_create(parent_dtag, /* parent */
4602 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
4603 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
4604 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4605 	    BUS_SPACE_MAXADDR,		/* highaddr */
4606 	    NULL, NULL,			/* filter, filterarg */
4607 	    HN_RNDIS_PKT_LEN,		/* maxsize */
4608 	    1,				/* nsegments */
4609 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
4610 	    0,				/* flags */
4611 	    NULL,			/* lockfunc */
4612 	    NULL,			/* lockfuncarg */
4613 	    &txr->hn_tx_rndis_dtag);
4614 	if (error) {
4615 		device_printf(dev, "failed to create rndis dmatag\n");
4616 		return error;
4617 	}
4618 
4619 	/* DMA tag for data. */
4620 	error = bus_dma_tag_create(parent_dtag, /* parent */
4621 	    1,				/* alignment */
4622 	    HN_TX_DATA_BOUNDARY,	/* boundary */
4623 	    BUS_SPACE_MAXADDR,		/* lowaddr */
4624 	    BUS_SPACE_MAXADDR,		/* highaddr */
4625 	    NULL, NULL,			/* filter, filterarg */
4626 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
4627 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
4628 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
4629 	    0,				/* flags */
4630 	    NULL,			/* lockfunc */
4631 	    NULL,			/* lockfuncarg */
4632 	    &txr->hn_tx_data_dtag);
4633 	if (error) {
4634 		device_printf(dev, "failed to create data dmatag\n");
4635 		return error;
4636 	}
4637 
4638 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
4639 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
4640 
4641 		txd->txr = txr;
4642 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
4643 		STAILQ_INIT(&txd->agg_list);
4644 
4645 		/*
4646 		 * Allocate and load RNDIS packet message.
4647 		 */
4648         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4649 		    (void **)&txd->rndis_pkt,
4650 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4651 		    &txd->rndis_pkt_dmap);
4652 		if (error) {
4653 			device_printf(dev,
4654 			    "failed to allocate rndis_packet_msg, %d\n", i);
4655 			return error;
4656 		}
4657 
4658 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4659 		    txd->rndis_pkt_dmap,
4660 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4661 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4662 		    BUS_DMA_NOWAIT);
4663 		if (error) {
4664 			device_printf(dev,
4665 			    "failed to load rndis_packet_msg, %d\n", i);
4666 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4667 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4668 			return error;
4669 		}
4670 
4671 		/* DMA map for TX data. */
4672 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4673 		    &txd->data_dmap);
4674 		if (error) {
4675 			device_printf(dev,
4676 			    "failed to allocate tx data dmamap\n");
4677 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4678 			    txd->rndis_pkt_dmap);
4679 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
4680 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
4681 			return error;
4682 		}
4683 
4684 		/* All set, put it to list */
4685 		txd->flags |= HN_TXD_FLAG_ONLIST;
4686 #ifndef HN_USE_TXDESC_BUFRING
4687 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4688 #else
4689 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
4690 #endif
4691 	}
4692 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4693 
4694 	if (sc->hn_tx_sysctl_tree != NULL) {
4695 		struct sysctl_oid_list *child;
4696 		struct sysctl_ctx_list *ctx;
4697 		char name[16];
4698 
4699 		/*
4700 		 * Create per TX ring sysctl tree:
4701 		 * dev.hn.UNIT.tx.RINGID
4702 		 */
4703 		ctx = device_get_sysctl_ctx(dev);
4704 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4705 
4706 		snprintf(name, sizeof(name), "%d", id);
4707 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4708 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4709 
4710 		if (txr->hn_tx_sysctl_tree != NULL) {
4711 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4712 
4713 #ifdef HN_DEBUG
4714 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4715 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4716 			    "# of available TX descs");
4717 #endif
4718 #ifdef HN_IFSTART_SUPPORT
4719 			if (!hn_use_if_start)
4720 #endif
4721 			{
4722 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4723 				    CTLFLAG_RD, &txr->hn_oactive, 0,
4724 				    "over active");
4725 			}
4726 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4727 			    CTLFLAG_RW, &txr->hn_pkts,
4728 			    "# of packets transmitted");
4729 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4730 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
4731 		}
4732 	}
4733 
4734 	return 0;
4735 }
4736 
4737 static void
4738 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4739 {
4740 	struct hn_tx_ring *txr = txd->txr;
4741 
4742 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
4743 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4744 
4745 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4746 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4747 	    txd->rndis_pkt_dmap);
4748 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4749 }
4750 
4751 static void
4752 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4753 {
4754 
4755 	KASSERT(txd->refs == 0 || txd->refs == 1,
4756 	    ("invalid txd refs %d", txd->refs));
4757 
4758 	/* Aggregated txds will be freed by their aggregating txd. */
4759 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4760 		int freed;
4761 
4762 		freed = hn_txdesc_put(txr, txd);
4763 		KASSERT(freed, ("can't free txdesc"));
4764 	}
4765 }
4766 
4767 static void
4768 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4769 {
4770 	int i;
4771 
4772 	if (txr->hn_txdesc == NULL)
4773 		return;
4774 
4775 	/*
4776 	 * NOTE:
4777 	 * Because the freeing of aggregated txds will be deferred
4778 	 * to the aggregating txd, two passes are used here:
4779 	 * - The first pass GCes any pending txds.  This GC is necessary,
4780 	 *   since if the channels are revoked, hypervisor will not
4781 	 *   deliver send-done for all pending txds.
4782 	 * - The second pass frees the busdma stuffs, i.e. after all txds
4783 	 *   were freed.
4784 	 */
4785 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4786 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4787 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4788 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4789 
4790 	if (txr->hn_tx_data_dtag != NULL)
4791 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4792 	if (txr->hn_tx_rndis_dtag != NULL)
4793 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4794 
4795 #ifdef HN_USE_TXDESC_BUFRING
4796 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4797 #endif
4798 
4799 	free(txr->hn_txdesc, M_DEVBUF);
4800 	txr->hn_txdesc = NULL;
4801 
4802 	if (txr->hn_mbuf_br != NULL)
4803 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4804 
4805 #ifndef HN_USE_TXDESC_BUFRING
4806 	mtx_destroy(&txr->hn_txlist_spin);
4807 #endif
4808 	mtx_destroy(&txr->hn_tx_lock);
4809 }
4810 
4811 static int
4812 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4813 {
4814 	struct sysctl_oid_list *child;
4815 	struct sysctl_ctx_list *ctx;
4816 	int i;
4817 
4818 	/*
4819 	 * Create TXBUF for chimney sending.
4820 	 *
4821 	 * NOTE: It is shared by all channels.
4822 	 */
4823 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4824 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4825 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4826 	if (sc->hn_chim == NULL) {
4827 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
4828 		return (ENOMEM);
4829 	}
4830 
4831 	sc->hn_tx_ring_cnt = ring_cnt;
4832 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4833 
4834 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4835 	    M_DEVBUF, M_WAITOK | M_ZERO);
4836 
4837 	ctx = device_get_sysctl_ctx(sc->hn_dev);
4838 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4839 
4840 	/* Create dev.hn.UNIT.tx sysctl tree */
4841 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4842 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4843 
4844 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4845 		int error;
4846 
4847 		error = hn_tx_ring_create(sc, i);
4848 		if (error)
4849 			return error;
4850 	}
4851 
4852 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4853 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4854 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
4855 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4856 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4857 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4858 	    __offsetof(struct hn_tx_ring, hn_send_failed),
4859 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
4860 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4861 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4862 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
4863 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
4864 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4865 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4866 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
4867 	    hn_tx_stat_ulong_sysctl, "LU",
4868 	    "# of packet transmission aggregation flush failure");
4869 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4870 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4871 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4872 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
4873 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4874 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4875 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
4876 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
4877 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4878 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4879 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4880 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4881 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4882 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4883 	    "# of total TX descs");
4884 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4885 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4886 	    "Chimney send packet size upper boundary");
4887 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4888 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4889 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4890 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
4891 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4892 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
4893 	    hn_tx_conf_int_sysctl, "I",
4894 	    "Size of the packet for direct transmission");
4895 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
4896 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4897 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
4898 	    hn_tx_conf_int_sysctl, "I",
4899 	    "Always schedule transmission "
4900 	    "instead of doing direct transmission");
4901 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4902 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4903 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4904 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4905 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4906 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4907 	    "Applied packet transmission aggregation size");
4908 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4909 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4910 	    hn_txagg_pktmax_sysctl, "I",
4911 	    "Applied packet transmission aggregation packets");
4912 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4913 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4914 	    hn_txagg_align_sysctl, "I",
4915 	    "Applied packet transmission aggregation alignment");
4916 
4917 	return 0;
4918 }
4919 
4920 static void
4921 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4922 {
4923 	int i;
4924 
4925 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4926 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
4927 }
4928 
4929 static void
4930 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4931 {
4932 	struct ifnet *ifp = sc->hn_ifp;
4933 	u_int hw_tsomax;
4934 	int tso_minlen;
4935 
4936 	HN_LOCK_ASSERT(sc);
4937 
4938 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4939 		return;
4940 
4941 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4942 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4943 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4944 
4945 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4946 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4947 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4948 
4949 	if (tso_maxlen < tso_minlen)
4950 		tso_maxlen = tso_minlen;
4951 	else if (tso_maxlen > IP_MAXPACKET)
4952 		tso_maxlen = IP_MAXPACKET;
4953 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
4954 		tso_maxlen = sc->hn_ndis_tso_szmax;
4955 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4956 
4957 	if (hn_xpnt_vf_isready(sc)) {
4958 		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
4959 			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
4960 	}
4961 	ifp->if_hw_tsomax = hw_tsomax;
4962 	if (bootverbose)
4963 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
4964 }
4965 
4966 static void
4967 hn_fixup_tx_data(struct hn_softc *sc)
4968 {
4969 	uint64_t csum_assist;
4970 	int i;
4971 
4972 	hn_set_chim_size(sc, sc->hn_chim_szmax);
4973 	if (hn_tx_chimney_size > 0 &&
4974 	    hn_tx_chimney_size < sc->hn_chim_szmax)
4975 		hn_set_chim_size(sc, hn_tx_chimney_size);
4976 
4977 	csum_assist = 0;
4978 	if (sc->hn_caps & HN_CAP_IPCS)
4979 		csum_assist |= CSUM_IP;
4980 	if (sc->hn_caps & HN_CAP_TCP4CS)
4981 		csum_assist |= CSUM_IP_TCP;
4982 	if (sc->hn_caps & HN_CAP_UDP4CS)
4983 		csum_assist |= CSUM_IP_UDP;
4984 	if (sc->hn_caps & HN_CAP_TCP6CS)
4985 		csum_assist |= CSUM_IP6_TCP;
4986 	if (sc->hn_caps & HN_CAP_UDP6CS)
4987 		csum_assist |= CSUM_IP6_UDP;
4988 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4989 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4990 
4991 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4992 		/*
4993 		 * Support HASHVAL pktinfo on TX path.
4994 		 */
4995 		if (bootverbose)
4996 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4997 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4998 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4999 	}
5000 }
5001 
5002 static void
5003 hn_destroy_tx_data(struct hn_softc *sc)
5004 {
5005 	int i;
5006 
5007 	if (sc->hn_chim != NULL) {
5008 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5009 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5010 		} else {
5011 			device_printf(sc->hn_dev,
5012 			    "chimney sending buffer is referenced");
5013 		}
5014 		sc->hn_chim = NULL;
5015 	}
5016 
5017 	if (sc->hn_tx_ring_cnt == 0)
5018 		return;
5019 
5020 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5021 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5022 
5023 	free(sc->hn_tx_ring, M_DEVBUF);
5024 	sc->hn_tx_ring = NULL;
5025 
5026 	sc->hn_tx_ring_cnt = 0;
5027 	sc->hn_tx_ring_inuse = 0;
5028 }
5029 
5030 #ifdef HN_IFSTART_SUPPORT
5031 
5032 static void
5033 hn_start_taskfunc(void *xtxr, int pending __unused)
5034 {
5035 	struct hn_tx_ring *txr = xtxr;
5036 
5037 	mtx_lock(&txr->hn_tx_lock);
5038 	hn_start_locked(txr, 0);
5039 	mtx_unlock(&txr->hn_tx_lock);
5040 }
5041 
5042 static int
5043 hn_start_locked(struct hn_tx_ring *txr, int len)
5044 {
5045 	struct hn_softc *sc = txr->hn_sc;
5046 	struct ifnet *ifp = sc->hn_ifp;
5047 	int sched = 0;
5048 
5049 	KASSERT(hn_use_if_start,
5050 	    ("hn_start_locked is called, when if_start is disabled"));
5051 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5052 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5053 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5054 
5055 	if (__predict_false(txr->hn_suspended))
5056 		return (0);
5057 
5058 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5059 	    IFF_DRV_RUNNING)
5060 		return (0);
5061 
5062 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5063 		struct hn_txdesc *txd;
5064 		struct mbuf *m_head;
5065 		int error;
5066 
5067 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5068 		if (m_head == NULL)
5069 			break;
5070 
5071 		if (len > 0 && m_head->m_pkthdr.len > len) {
5072 			/*
5073 			 * This sending could be time consuming; let callers
5074 			 * dispatch this packet sending (and sending of any
5075 			 * following up packets) to tx taskqueue.
5076 			 */
5077 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5078 			sched = 1;
5079 			break;
5080 		}
5081 
5082 #if defined(INET6) || defined(INET)
5083 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5084 			m_head = hn_tso_fixup(m_head);
5085 			if (__predict_false(m_head == NULL)) {
5086 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5087 				continue;
5088 			}
5089 		}
5090 #endif
5091 
5092 		txd = hn_txdesc_get(txr);
5093 		if (txd == NULL) {
5094 			txr->hn_no_txdescs++;
5095 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5096 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5097 			break;
5098 		}
5099 
5100 		error = hn_encap(ifp, txr, txd, &m_head);
5101 		if (error) {
5102 			/* Both txd and m_head are freed */
5103 			KASSERT(txr->hn_agg_txd == NULL,
5104 			    ("encap failed w/ pending aggregating txdesc"));
5105 			continue;
5106 		}
5107 
5108 		if (txr->hn_agg_pktleft == 0) {
5109 			if (txr->hn_agg_txd != NULL) {
5110 				KASSERT(m_head == NULL,
5111 				    ("pending mbuf for aggregating txdesc"));
5112 				error = hn_flush_txagg(ifp, txr);
5113 				if (__predict_false(error)) {
5114 					atomic_set_int(&ifp->if_drv_flags,
5115 					    IFF_DRV_OACTIVE);
5116 					break;
5117 				}
5118 			} else {
5119 				KASSERT(m_head != NULL, ("mbuf was freed"));
5120 				error = hn_txpkt(ifp, txr, txd);
5121 				if (__predict_false(error)) {
5122 					/* txd is freed, but m_head is not */
5123 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5124 					atomic_set_int(&ifp->if_drv_flags,
5125 					    IFF_DRV_OACTIVE);
5126 					break;
5127 				}
5128 			}
5129 		}
5130 #ifdef INVARIANTS
5131 		else {
5132 			KASSERT(txr->hn_agg_txd != NULL,
5133 			    ("no aggregating txdesc"));
5134 			KASSERT(m_head == NULL,
5135 			    ("pending mbuf for aggregating txdesc"));
5136 		}
5137 #endif
5138 	}
5139 
5140 	/* Flush pending aggerated transmission. */
5141 	if (txr->hn_agg_txd != NULL)
5142 		hn_flush_txagg(ifp, txr);
5143 	return (sched);
5144 }
5145 
5146 static void
5147 hn_start(struct ifnet *ifp)
5148 {
5149 	struct hn_softc *sc = ifp->if_softc;
5150 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5151 
5152 	if (txr->hn_sched_tx)
5153 		goto do_sched;
5154 
5155 	if (mtx_trylock(&txr->hn_tx_lock)) {
5156 		int sched;
5157 
5158 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5159 		mtx_unlock(&txr->hn_tx_lock);
5160 		if (!sched)
5161 			return;
5162 	}
5163 do_sched:
5164 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5165 }
5166 
5167 static void
5168 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5169 {
5170 	struct hn_tx_ring *txr = xtxr;
5171 
5172 	mtx_lock(&txr->hn_tx_lock);
5173 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5174 	hn_start_locked(txr, 0);
5175 	mtx_unlock(&txr->hn_tx_lock);
5176 }
5177 
5178 static void
5179 hn_start_txeof(struct hn_tx_ring *txr)
5180 {
5181 	struct hn_softc *sc = txr->hn_sc;
5182 	struct ifnet *ifp = sc->hn_ifp;
5183 
5184 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5185 
5186 	if (txr->hn_sched_tx)
5187 		goto do_sched;
5188 
5189 	if (mtx_trylock(&txr->hn_tx_lock)) {
5190 		int sched;
5191 
5192 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5193 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5194 		mtx_unlock(&txr->hn_tx_lock);
5195 		if (sched) {
5196 			taskqueue_enqueue(txr->hn_tx_taskq,
5197 			    &txr->hn_tx_task);
5198 		}
5199 	} else {
5200 do_sched:
5201 		/*
5202 		 * Release the OACTIVE earlier, with the hope, that
5203 		 * others could catch up.  The task will clear the
5204 		 * flag again with the hn_tx_lock to avoid possible
5205 		 * races.
5206 		 */
5207 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5208 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5209 	}
5210 }
5211 
5212 #endif	/* HN_IFSTART_SUPPORT */
5213 
5214 static int
5215 hn_xmit(struct hn_tx_ring *txr, int len)
5216 {
5217 	struct hn_softc *sc = txr->hn_sc;
5218 	struct ifnet *ifp = sc->hn_ifp;
5219 	struct mbuf *m_head;
5220 	int sched = 0;
5221 
5222 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5223 #ifdef HN_IFSTART_SUPPORT
5224 	KASSERT(hn_use_if_start == 0,
5225 	    ("hn_xmit is called, when if_start is enabled"));
5226 #endif
5227 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5228 
5229 	if (__predict_false(txr->hn_suspended))
5230 		return (0);
5231 
5232 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5233 		return (0);
5234 
5235 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5236 		struct hn_txdesc *txd;
5237 		int error;
5238 
5239 		if (len > 0 && m_head->m_pkthdr.len > len) {
5240 			/*
5241 			 * This sending could be time consuming; let callers
5242 			 * dispatch this packet sending (and sending of any
5243 			 * following up packets) to tx taskqueue.
5244 			 */
5245 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5246 			sched = 1;
5247 			break;
5248 		}
5249 
5250 		txd = hn_txdesc_get(txr);
5251 		if (txd == NULL) {
5252 			txr->hn_no_txdescs++;
5253 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5254 			txr->hn_oactive = 1;
5255 			break;
5256 		}
5257 
5258 		error = hn_encap(ifp, txr, txd, &m_head);
5259 		if (error) {
5260 			/* Both txd and m_head are freed; discard */
5261 			KASSERT(txr->hn_agg_txd == NULL,
5262 			    ("encap failed w/ pending aggregating txdesc"));
5263 			drbr_advance(ifp, txr->hn_mbuf_br);
5264 			continue;
5265 		}
5266 
5267 		if (txr->hn_agg_pktleft == 0) {
5268 			if (txr->hn_agg_txd != NULL) {
5269 				KASSERT(m_head == NULL,
5270 				    ("pending mbuf for aggregating txdesc"));
5271 				error = hn_flush_txagg(ifp, txr);
5272 				if (__predict_false(error)) {
5273 					txr->hn_oactive = 1;
5274 					break;
5275 				}
5276 			} else {
5277 				KASSERT(m_head != NULL, ("mbuf was freed"));
5278 				error = hn_txpkt(ifp, txr, txd);
5279 				if (__predict_false(error)) {
5280 					/* txd is freed, but m_head is not */
5281 					drbr_putback(ifp, txr->hn_mbuf_br,
5282 					    m_head);
5283 					txr->hn_oactive = 1;
5284 					break;
5285 				}
5286 			}
5287 		}
5288 #ifdef INVARIANTS
5289 		else {
5290 			KASSERT(txr->hn_agg_txd != NULL,
5291 			    ("no aggregating txdesc"));
5292 			KASSERT(m_head == NULL,
5293 			    ("pending mbuf for aggregating txdesc"));
5294 		}
5295 #endif
5296 
5297 		/* Sent */
5298 		drbr_advance(ifp, txr->hn_mbuf_br);
5299 	}
5300 
5301 	/* Flush pending aggerated transmission. */
5302 	if (txr->hn_agg_txd != NULL)
5303 		hn_flush_txagg(ifp, txr);
5304 	return (sched);
5305 }
5306 
5307 static int
5308 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5309 {
5310 	struct hn_softc *sc = ifp->if_softc;
5311 	struct hn_tx_ring *txr;
5312 	int error, idx = 0;
5313 
5314 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5315 		struct rm_priotracker pt;
5316 
5317 		rm_rlock(&sc->hn_vf_lock, &pt);
5318 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5319 			struct mbuf *m_bpf = NULL;
5320 			int obytes, omcast;
5321 
5322 			obytes = m->m_pkthdr.len;
5323 			if (m->m_flags & M_MCAST)
5324 				omcast = 1;
5325 
5326 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5327 				if (bpf_peers_present(ifp->if_bpf)) {
5328 					m_bpf = m_copypacket(m, M_NOWAIT);
5329 					if (m_bpf == NULL) {
5330 						/*
5331 						 * Failed to grab a shallow
5332 						 * copy; tap now.
5333 						 */
5334 						ETHER_BPF_MTAP(ifp, m);
5335 					}
5336 				}
5337 			} else {
5338 				ETHER_BPF_MTAP(ifp, m);
5339 			}
5340 
5341 			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5342 			rm_runlock(&sc->hn_vf_lock, &pt);
5343 
5344 			if (m_bpf != NULL) {
5345 				if (!error)
5346 					ETHER_BPF_MTAP(ifp, m_bpf);
5347 				m_freem(m_bpf);
5348 			}
5349 
5350 			if (error == ENOBUFS) {
5351 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5352 			} else if (error) {
5353 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5354 			} else {
5355 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5356 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5357 				if (omcast) {
5358 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5359 					    omcast);
5360 				}
5361 			}
5362 			return (error);
5363 		}
5364 		rm_runlock(&sc->hn_vf_lock, &pt);
5365 	}
5366 
5367 #if defined(INET6) || defined(INET)
5368 	/*
5369 	 * Perform TSO packet header fixup now, since the TSO
5370 	 * packet header should be cache-hot.
5371 	 */
5372 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5373 		m = hn_tso_fixup(m);
5374 		if (__predict_false(m == NULL)) {
5375 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5376 			return EIO;
5377 		}
5378 	}
5379 #endif
5380 
5381 	/*
5382 	 * Select the TX ring based on flowid
5383 	 */
5384 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5385 #ifdef RSS
5386 		uint32_t bid;
5387 
5388 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5389 		    &bid) == 0)
5390 			idx = bid % sc->hn_tx_ring_inuse;
5391 		else
5392 #endif
5393 		{
5394 #if defined(INET6) || defined(INET)
5395 			int tcpsyn = 0;
5396 
5397 			if (m->m_pkthdr.len < 128 &&
5398 			    (m->m_pkthdr.csum_flags &
5399 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5400 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5401 				m = hn_check_tcpsyn(m, &tcpsyn);
5402 				if (__predict_false(m == NULL)) {
5403 					if_inc_counter(ifp,
5404 					    IFCOUNTER_OERRORS, 1);
5405 					return (EIO);
5406 				}
5407 			}
5408 #else
5409 			const int tcpsyn = 0;
5410 #endif
5411 			if (tcpsyn)
5412 				idx = 0;
5413 			else
5414 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5415 		}
5416 	}
5417 	txr = &sc->hn_tx_ring[idx];
5418 
5419 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5420 	if (error) {
5421 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5422 		return error;
5423 	}
5424 
5425 	if (txr->hn_oactive)
5426 		return 0;
5427 
5428 	if (txr->hn_sched_tx)
5429 		goto do_sched;
5430 
5431 	if (mtx_trylock(&txr->hn_tx_lock)) {
5432 		int sched;
5433 
5434 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5435 		mtx_unlock(&txr->hn_tx_lock);
5436 		if (!sched)
5437 			return 0;
5438 	}
5439 do_sched:
5440 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5441 	return 0;
5442 }
5443 
5444 static void
5445 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5446 {
5447 	struct mbuf *m;
5448 
5449 	mtx_lock(&txr->hn_tx_lock);
5450 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5451 		m_freem(m);
5452 	mtx_unlock(&txr->hn_tx_lock);
5453 }
5454 
5455 static void
5456 hn_xmit_qflush(struct ifnet *ifp)
5457 {
5458 	struct hn_softc *sc = ifp->if_softc;
5459 	struct rm_priotracker pt;
5460 	int i;
5461 
5462 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5463 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5464 	if_qflush(ifp);
5465 
5466 	rm_rlock(&sc->hn_vf_lock, &pt);
5467 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5468 		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5469 	rm_runlock(&sc->hn_vf_lock, &pt);
5470 }
5471 
5472 static void
5473 hn_xmit_txeof(struct hn_tx_ring *txr)
5474 {
5475 
5476 	if (txr->hn_sched_tx)
5477 		goto do_sched;
5478 
5479 	if (mtx_trylock(&txr->hn_tx_lock)) {
5480 		int sched;
5481 
5482 		txr->hn_oactive = 0;
5483 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
5484 		mtx_unlock(&txr->hn_tx_lock);
5485 		if (sched) {
5486 			taskqueue_enqueue(txr->hn_tx_taskq,
5487 			    &txr->hn_tx_task);
5488 		}
5489 	} else {
5490 do_sched:
5491 		/*
5492 		 * Release the oactive earlier, with the hope, that
5493 		 * others could catch up.  The task will clear the
5494 		 * oactive again with the hn_tx_lock to avoid possible
5495 		 * races.
5496 		 */
5497 		txr->hn_oactive = 0;
5498 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5499 	}
5500 }
5501 
5502 static void
5503 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5504 {
5505 	struct hn_tx_ring *txr = xtxr;
5506 
5507 	mtx_lock(&txr->hn_tx_lock);
5508 	hn_xmit(txr, 0);
5509 	mtx_unlock(&txr->hn_tx_lock);
5510 }
5511 
5512 static void
5513 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5514 {
5515 	struct hn_tx_ring *txr = xtxr;
5516 
5517 	mtx_lock(&txr->hn_tx_lock);
5518 	txr->hn_oactive = 0;
5519 	hn_xmit(txr, 0);
5520 	mtx_unlock(&txr->hn_tx_lock);
5521 }
5522 
5523 static int
5524 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5525 {
5526 	struct vmbus_chan_br cbr;
5527 	struct hn_rx_ring *rxr;
5528 	struct hn_tx_ring *txr = NULL;
5529 	int idx, error;
5530 
5531 	idx = vmbus_chan_subidx(chan);
5532 
5533 	/*
5534 	 * Link this channel to RX/TX ring.
5535 	 */
5536 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5537 	    ("invalid channel index %d, should > 0 && < %d",
5538 	     idx, sc->hn_rx_ring_inuse));
5539 	rxr = &sc->hn_rx_ring[idx];
5540 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5541 	    ("RX ring %d already attached", idx));
5542 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5543 	rxr->hn_chan = chan;
5544 
5545 	if (bootverbose) {
5546 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
5547 		    idx, vmbus_chan_id(chan));
5548 	}
5549 
5550 	if (idx < sc->hn_tx_ring_inuse) {
5551 		txr = &sc->hn_tx_ring[idx];
5552 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
5553 		    ("TX ring %d already attached", idx));
5554 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
5555 
5556 		txr->hn_chan = chan;
5557 		if (bootverbose) {
5558 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
5559 			    idx, vmbus_chan_id(chan));
5560 		}
5561 	}
5562 
5563 	/* Bind this channel to a proper CPU. */
5564 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
5565 
5566 	/*
5567 	 * Open this channel
5568 	 */
5569 	cbr.cbr = rxr->hn_br;
5570 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
5571 	cbr.cbr_txsz = HN_TXBR_SIZE;
5572 	cbr.cbr_rxsz = HN_RXBR_SIZE;
5573 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
5574 	if (error) {
5575 		if (error == EISCONN) {
5576 			if_printf(sc->hn_ifp, "bufring is connected after "
5577 			    "chan%u open failure\n", vmbus_chan_id(chan));
5578 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5579 		} else {
5580 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
5581 			    vmbus_chan_id(chan), error);
5582 		}
5583 	}
5584 	return (error);
5585 }
5586 
5587 static void
5588 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
5589 {
5590 	struct hn_rx_ring *rxr;
5591 	int idx, error;
5592 
5593 	idx = vmbus_chan_subidx(chan);
5594 
5595 	/*
5596 	 * Link this channel to RX/TX ring.
5597 	 */
5598 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5599 	    ("invalid channel index %d, should > 0 && < %d",
5600 	     idx, sc->hn_rx_ring_inuse));
5601 	rxr = &sc->hn_rx_ring[idx];
5602 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
5603 	    ("RX ring %d is not attached", idx));
5604 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
5605 
5606 	if (idx < sc->hn_tx_ring_inuse) {
5607 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
5608 
5609 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
5610 		    ("TX ring %d is not attached attached", idx));
5611 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
5612 	}
5613 
5614 	/*
5615 	 * Close this channel.
5616 	 *
5617 	 * NOTE:
5618 	 * Channel closing does _not_ destroy the target channel.
5619 	 */
5620 	error = vmbus_chan_close_direct(chan);
5621 	if (error == EISCONN) {
5622 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
5623 		    "after being closed\n", vmbus_chan_id(chan));
5624 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5625 	} else if (error) {
5626 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
5627 		    vmbus_chan_id(chan), error);
5628 	}
5629 }
5630 
5631 static int
5632 hn_attach_subchans(struct hn_softc *sc)
5633 {
5634 	struct vmbus_channel **subchans;
5635 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5636 	int i, error = 0;
5637 
5638 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
5639 
5640 	/* Attach the sub-channels. */
5641 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5642 	for (i = 0; i < subchan_cnt; ++i) {
5643 		int error1;
5644 
5645 		error1 = hn_chan_attach(sc, subchans[i]);
5646 		if (error1) {
5647 			error = error1;
5648 			/* Move on; all channels will be detached later. */
5649 		}
5650 	}
5651 	vmbus_subchan_rel(subchans, subchan_cnt);
5652 
5653 	if (error) {
5654 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
5655 	} else {
5656 		if (bootverbose) {
5657 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
5658 			    subchan_cnt);
5659 		}
5660 	}
5661 	return (error);
5662 }
5663 
5664 static void
5665 hn_detach_allchans(struct hn_softc *sc)
5666 {
5667 	struct vmbus_channel **subchans;
5668 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5669 	int i;
5670 
5671 	if (subchan_cnt == 0)
5672 		goto back;
5673 
5674 	/* Detach the sub-channels. */
5675 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5676 	for (i = 0; i < subchan_cnt; ++i)
5677 		hn_chan_detach(sc, subchans[i]);
5678 	vmbus_subchan_rel(subchans, subchan_cnt);
5679 
5680 back:
5681 	/*
5682 	 * Detach the primary channel, _after_ all sub-channels
5683 	 * are detached.
5684 	 */
5685 	hn_chan_detach(sc, sc->hn_prichan);
5686 
5687 	/* Wait for sub-channels to be destroyed, if any. */
5688 	vmbus_subchan_drain(sc->hn_prichan);
5689 
5690 #ifdef INVARIANTS
5691 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5692 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
5693 		    HN_RX_FLAG_ATTACHED) == 0,
5694 		    ("%dth RX ring is still attached", i));
5695 	}
5696 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5697 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
5698 		    HN_TX_FLAG_ATTACHED) == 0,
5699 		    ("%dth TX ring is still attached", i));
5700 	}
5701 #endif
5702 }
5703 
5704 static int
5705 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
5706 {
5707 	struct vmbus_channel **subchans;
5708 	int nchan, rxr_cnt, error;
5709 
5710 	nchan = *nsubch + 1;
5711 	if (nchan == 1) {
5712 		/*
5713 		 * Multiple RX/TX rings are not requested.
5714 		 */
5715 		*nsubch = 0;
5716 		return (0);
5717 	}
5718 
5719 	/*
5720 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
5721 	 * table entries.
5722 	 */
5723 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5724 	if (error) {
5725 		/* No RSS; this is benign. */
5726 		*nsubch = 0;
5727 		return (0);
5728 	}
5729 	if (bootverbose) {
5730 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5731 		    rxr_cnt, nchan);
5732 	}
5733 
5734 	if (nchan > rxr_cnt)
5735 		nchan = rxr_cnt;
5736 	if (nchan == 1) {
5737 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5738 		*nsubch = 0;
5739 		return (0);
5740 	}
5741 
5742 	/*
5743 	 * Allocate sub-channels from NVS.
5744 	 */
5745 	*nsubch = nchan - 1;
5746 	error = hn_nvs_alloc_subchans(sc, nsubch);
5747 	if (error || *nsubch == 0) {
5748 		/* Failed to allocate sub-channels. */
5749 		*nsubch = 0;
5750 		return (0);
5751 	}
5752 
5753 	/*
5754 	 * Wait for all sub-channels to become ready before moving on.
5755 	 */
5756 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5757 	vmbus_subchan_rel(subchans, *nsubch);
5758 	return (0);
5759 }
5760 
5761 static bool
5762 hn_synth_attachable(const struct hn_softc *sc)
5763 {
5764 	int i;
5765 
5766 	if (sc->hn_flags & HN_FLAG_ERRORS)
5767 		return (false);
5768 
5769 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5770 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5771 
5772 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5773 			return (false);
5774 	}
5775 	return (true);
5776 }
5777 
5778 /*
5779  * Make sure that the RX filter is zero after the successful
5780  * RNDIS initialization.
5781  *
5782  * NOTE:
5783  * Under certain conditions on certain versions of Hyper-V,
5784  * the RNDIS rxfilter is _not_ zero on the hypervisor side
5785  * after the successful RNDIS initialization, which breaks
5786  * the assumption of any following code (well, it breaks the
5787  * RNDIS API contract actually).  Clear the RNDIS rxfilter
5788  * explicitly, drain packets sneaking through, and drain the
5789  * interrupt taskqueues scheduled due to the stealth packets.
5790  */
5791 static void
5792 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5793 {
5794 
5795 	hn_disable_rx(sc);
5796 	hn_drain_rxtx(sc, nchan);
5797 }
5798 
5799 static int
5800 hn_synth_attach(struct hn_softc *sc, int mtu)
5801 {
5802 #define ATTACHED_NVS		0x0002
5803 #define ATTACHED_RNDIS		0x0004
5804 
5805 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5806 	int error, nsubch, nchan = 1, i, rndis_inited;
5807 	uint32_t old_caps, attached = 0;
5808 
5809 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5810 	    ("synthetic parts were attached"));
5811 
5812 	if (!hn_synth_attachable(sc))
5813 		return (ENXIO);
5814 
5815 	/* Save capabilities for later verification. */
5816 	old_caps = sc->hn_caps;
5817 	sc->hn_caps = 0;
5818 
5819 	/* Clear RSS stuffs. */
5820 	sc->hn_rss_ind_size = 0;
5821 	sc->hn_rss_hash = 0;
5822 
5823 	/*
5824 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
5825 	 */
5826 	error = hn_chan_attach(sc, sc->hn_prichan);
5827 	if (error)
5828 		goto failed;
5829 
5830 	/*
5831 	 * Attach NVS.
5832 	 */
5833 	error = hn_nvs_attach(sc, mtu);
5834 	if (error)
5835 		goto failed;
5836 	attached |= ATTACHED_NVS;
5837 
5838 	/*
5839 	 * Attach RNDIS _after_ NVS is attached.
5840 	 */
5841 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
5842 	if (rndis_inited)
5843 		attached |= ATTACHED_RNDIS;
5844 	if (error)
5845 		goto failed;
5846 
5847 	/*
5848 	 * Make sure capabilities are not changed.
5849 	 */
5850 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5851 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5852 		    old_caps, sc->hn_caps);
5853 		error = ENXIO;
5854 		goto failed;
5855 	}
5856 
5857 	/*
5858 	 * Allocate sub-channels for multi-TX/RX rings.
5859 	 *
5860 	 * NOTE:
5861 	 * The # of RX rings that can be used is equivalent to the # of
5862 	 * channels to be requested.
5863 	 */
5864 	nsubch = sc->hn_rx_ring_cnt - 1;
5865 	error = hn_synth_alloc_subchans(sc, &nsubch);
5866 	if (error)
5867 		goto failed;
5868 	/* NOTE: _Full_ synthetic parts detach is required now. */
5869 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5870 
5871 	/*
5872 	 * Set the # of TX/RX rings that could be used according to
5873 	 * the # of channels that NVS offered.
5874 	 */
5875 	nchan = nsubch + 1;
5876 	hn_set_ring_inuse(sc, nchan);
5877 	if (nchan == 1) {
5878 		/* Only the primary channel can be used; done */
5879 		goto back;
5880 	}
5881 
5882 	/*
5883 	 * Attach the sub-channels.
5884 	 *
5885 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
5886 	 */
5887 	error = hn_attach_subchans(sc);
5888 	if (error)
5889 		goto failed;
5890 
5891 	/*
5892 	 * Configure RSS key and indirect table _after_ all sub-channels
5893 	 * are attached.
5894 	 */
5895 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
5896 		/*
5897 		 * RSS key is not set yet; set it to the default RSS key.
5898 		 */
5899 		if (bootverbose)
5900 			if_printf(sc->hn_ifp, "setup default RSS key\n");
5901 #ifdef RSS
5902 		rss_getkey(rss->rss_key);
5903 #else
5904 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
5905 #endif
5906 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
5907 	}
5908 
5909 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
5910 		/*
5911 		 * RSS indirect table is not set yet; set it up in round-
5912 		 * robin fashion.
5913 		 */
5914 		if (bootverbose) {
5915 			if_printf(sc->hn_ifp, "setup default RSS indirect "
5916 			    "table\n");
5917 		}
5918 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
5919 			uint32_t subidx;
5920 
5921 #ifdef RSS
5922 			subidx = rss_get_indirection_to_bucket(i);
5923 #else
5924 			subidx = i;
5925 #endif
5926 			rss->rss_ind[i] = subidx % nchan;
5927 		}
5928 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
5929 	} else {
5930 		/*
5931 		 * # of usable channels may be changed, so we have to
5932 		 * make sure that all entries in RSS indirect table
5933 		 * are valid.
5934 		 *
5935 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
5936 		 */
5937 		hn_rss_ind_fixup(sc);
5938 	}
5939 
5940 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
5941 	if (error)
5942 		goto failed;
5943 back:
5944 	/*
5945 	 * Fixup transmission aggregation setup.
5946 	 */
5947 	hn_set_txagg(sc);
5948 	hn_rndis_init_fixat(sc, nchan);
5949 	return (0);
5950 
5951 failed:
5952 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
5953 		hn_rndis_init_fixat(sc, nchan);
5954 		hn_synth_detach(sc);
5955 	} else {
5956 		if (attached & ATTACHED_RNDIS) {
5957 			hn_rndis_init_fixat(sc, nchan);
5958 			hn_rndis_detach(sc);
5959 		}
5960 		if (attached & ATTACHED_NVS)
5961 			hn_nvs_detach(sc);
5962 		hn_chan_detach(sc, sc->hn_prichan);
5963 		/* Restore old capabilities. */
5964 		sc->hn_caps = old_caps;
5965 	}
5966 	return (error);
5967 
5968 #undef ATTACHED_RNDIS
5969 #undef ATTACHED_NVS
5970 }
5971 
5972 /*
5973  * NOTE:
5974  * The interface must have been suspended though hn_suspend(), before
5975  * this function get called.
5976  */
5977 static void
5978 hn_synth_detach(struct hn_softc *sc)
5979 {
5980 
5981 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5982 	    ("synthetic parts were not attached"));
5983 
5984 	/* Detach the RNDIS first. */
5985 	hn_rndis_detach(sc);
5986 
5987 	/* Detach NVS. */
5988 	hn_nvs_detach(sc);
5989 
5990 	/* Detach all of the channels. */
5991 	hn_detach_allchans(sc);
5992 
5993 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
5994 }
5995 
5996 static void
5997 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5998 {
5999 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6000 	    ("invalid ring count %d", ring_cnt));
6001 
6002 	if (sc->hn_tx_ring_cnt > ring_cnt)
6003 		sc->hn_tx_ring_inuse = ring_cnt;
6004 	else
6005 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6006 	sc->hn_rx_ring_inuse = ring_cnt;
6007 
6008 #ifdef RSS
6009 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6010 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6011 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6012 		    rss_getnumbuckets());
6013 	}
6014 #endif
6015 
6016 	if (bootverbose) {
6017 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6018 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6019 	}
6020 }
6021 
6022 static void
6023 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6024 {
6025 
6026 	/*
6027 	 * NOTE:
6028 	 * The TX bufring will not be drained by the hypervisor,
6029 	 * if the primary channel is revoked.
6030 	 */
6031 	while (!vmbus_chan_rx_empty(chan) ||
6032 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6033 	     !vmbus_chan_tx_empty(chan)))
6034 		pause("waitch", 1);
6035 	vmbus_chan_intr_drain(chan);
6036 }
6037 
6038 static void
6039 hn_disable_rx(struct hn_softc *sc)
6040 {
6041 
6042 	/*
6043 	 * Disable RX by clearing RX filter forcefully.
6044 	 */
6045 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6046 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6047 
6048 	/*
6049 	 * Give RNDIS enough time to flush all pending data packets.
6050 	 */
6051 	pause("waitrx", (200 * hz) / 1000);
6052 }
6053 
6054 /*
6055  * NOTE:
6056  * RX/TX _must_ have been suspended/disabled, before this function
6057  * is called.
6058  */
6059 static void
6060 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6061 {
6062 	struct vmbus_channel **subch = NULL;
6063 	int nsubch;
6064 
6065 	/*
6066 	 * Drain RX/TX bufrings and interrupts.
6067 	 */
6068 	nsubch = nchan - 1;
6069 	if (nsubch > 0)
6070 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6071 
6072 	if (subch != NULL) {
6073 		int i;
6074 
6075 		for (i = 0; i < nsubch; ++i)
6076 			hn_chan_drain(sc, subch[i]);
6077 	}
6078 	hn_chan_drain(sc, sc->hn_prichan);
6079 
6080 	if (subch != NULL)
6081 		vmbus_subchan_rel(subch, nsubch);
6082 }
6083 
6084 static void
6085 hn_suspend_data(struct hn_softc *sc)
6086 {
6087 	struct hn_tx_ring *txr;
6088 	int i;
6089 
6090 	HN_LOCK_ASSERT(sc);
6091 
6092 	/*
6093 	 * Suspend TX.
6094 	 */
6095 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6096 		txr = &sc->hn_tx_ring[i];
6097 
6098 		mtx_lock(&txr->hn_tx_lock);
6099 		txr->hn_suspended = 1;
6100 		mtx_unlock(&txr->hn_tx_lock);
6101 		/* No one is able send more packets now. */
6102 
6103 		/*
6104 		 * Wait for all pending sends to finish.
6105 		 *
6106 		 * NOTE:
6107 		 * We will _not_ receive all pending send-done, if the
6108 		 * primary channel is revoked.
6109 		 */
6110 		while (hn_tx_ring_pending(txr) &&
6111 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6112 			pause("hnwtx", 1 /* 1 tick */);
6113 	}
6114 
6115 	/*
6116 	 * Disable RX.
6117 	 */
6118 	hn_disable_rx(sc);
6119 
6120 	/*
6121 	 * Drain RX/TX.
6122 	 */
6123 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6124 
6125 	/*
6126 	 * Drain any pending TX tasks.
6127 	 *
6128 	 * NOTE:
6129 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6130 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6131 	 */
6132 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6133 		txr = &sc->hn_tx_ring[i];
6134 
6135 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6136 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6137 	}
6138 }
6139 
6140 static void
6141 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6142 {
6143 
6144 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6145 }
6146 
6147 static void
6148 hn_suspend_mgmt(struct hn_softc *sc)
6149 {
6150 	struct task task;
6151 
6152 	HN_LOCK_ASSERT(sc);
6153 
6154 	/*
6155 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
6156 	 * through hn_mgmt_taskq.
6157 	 */
6158 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6159 	vmbus_chan_run_task(sc->hn_prichan, &task);
6160 
6161 	/*
6162 	 * Make sure that all pending management tasks are completed.
6163 	 */
6164 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6165 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6166 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6167 }
6168 
6169 static void
6170 hn_suspend(struct hn_softc *sc)
6171 {
6172 
6173 	/* Disable polling. */
6174 	hn_polling(sc, 0);
6175 
6176 	/*
6177 	 * If the non-transparent mode VF is activated, the synthetic
6178 	 * device is receiving packets, so the data path of the
6179 	 * synthetic device must be suspended.
6180 	 */
6181 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6182 	    (sc->hn_flags & HN_FLAG_RXVF))
6183 		hn_suspend_data(sc);
6184 	hn_suspend_mgmt(sc);
6185 }
6186 
6187 static void
6188 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6189 {
6190 	int i;
6191 
6192 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6193 	    ("invalid TX ring count %d", tx_ring_cnt));
6194 
6195 	for (i = 0; i < tx_ring_cnt; ++i) {
6196 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6197 
6198 		mtx_lock(&txr->hn_tx_lock);
6199 		txr->hn_suspended = 0;
6200 		mtx_unlock(&txr->hn_tx_lock);
6201 	}
6202 }
6203 
6204 static void
6205 hn_resume_data(struct hn_softc *sc)
6206 {
6207 	int i;
6208 
6209 	HN_LOCK_ASSERT(sc);
6210 
6211 	/*
6212 	 * Re-enable RX.
6213 	 */
6214 	hn_rxfilter_config(sc);
6215 
6216 	/*
6217 	 * Make sure to clear suspend status on "all" TX rings,
6218 	 * since hn_tx_ring_inuse can be changed after
6219 	 * hn_suspend_data().
6220 	 */
6221 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6222 
6223 #ifdef HN_IFSTART_SUPPORT
6224 	if (!hn_use_if_start)
6225 #endif
6226 	{
6227 		/*
6228 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6229 		 * reduced.
6230 		 */
6231 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6232 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6233 	}
6234 
6235 	/*
6236 	 * Kick start TX.
6237 	 */
6238 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6239 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6240 
6241 		/*
6242 		 * Use txeof task, so that any pending oactive can be
6243 		 * cleared properly.
6244 		 */
6245 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6246 	}
6247 }
6248 
6249 static void
6250 hn_resume_mgmt(struct hn_softc *sc)
6251 {
6252 
6253 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6254 
6255 	/*
6256 	 * Kick off network change detection, if it was pending.
6257 	 * If no network change was pending, start link status
6258 	 * checks, which is more lightweight than network change
6259 	 * detection.
6260 	 */
6261 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6262 		hn_change_network(sc);
6263 	else
6264 		hn_update_link_status(sc);
6265 }
6266 
6267 static void
6268 hn_resume(struct hn_softc *sc)
6269 {
6270 
6271 	/*
6272 	 * If the non-transparent mode VF is activated, the synthetic
6273 	 * device have to receive packets, so the data path of the
6274 	 * synthetic device must be resumed.
6275 	 */
6276 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6277 	    (sc->hn_flags & HN_FLAG_RXVF))
6278 		hn_resume_data(sc);
6279 
6280 	/*
6281 	 * Don't resume link status change if VF is attached/activated.
6282 	 * - In the non-transparent VF mode, the synthetic device marks
6283 	 *   link down until the VF is deactivated; i.e. VF is down.
6284 	 * - In transparent VF mode, VF's media status is used until
6285 	 *   the VF is detached.
6286 	 */
6287 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6288 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6289 		hn_resume_mgmt(sc);
6290 
6291 	/*
6292 	 * Re-enable polling if this interface is running and
6293 	 * the polling is requested.
6294 	 */
6295 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6296 		hn_polling(sc, sc->hn_pollhz);
6297 }
6298 
6299 static void
6300 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6301 {
6302 	const struct rndis_status_msg *msg;
6303 	int ofs;
6304 
6305 	if (dlen < sizeof(*msg)) {
6306 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6307 		return;
6308 	}
6309 	msg = data;
6310 
6311 	switch (msg->rm_status) {
6312 	case RNDIS_STATUS_MEDIA_CONNECT:
6313 	case RNDIS_STATUS_MEDIA_DISCONNECT:
6314 		hn_update_link_status(sc);
6315 		break;
6316 
6317 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6318 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6319 		/* Not really useful; ignore. */
6320 		break;
6321 
6322 	case RNDIS_STATUS_NETWORK_CHANGE:
6323 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6324 		if (dlen < ofs + msg->rm_stbuflen ||
6325 		    msg->rm_stbuflen < sizeof(uint32_t)) {
6326 			if_printf(sc->hn_ifp, "network changed\n");
6327 		} else {
6328 			uint32_t change;
6329 
6330 			memcpy(&change, ((const uint8_t *)msg) + ofs,
6331 			    sizeof(change));
6332 			if_printf(sc->hn_ifp, "network changed, change %u\n",
6333 			    change);
6334 		}
6335 		hn_change_network(sc);
6336 		break;
6337 
6338 	default:
6339 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6340 		    msg->rm_status);
6341 		break;
6342 	}
6343 }
6344 
6345 static int
6346 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6347 {
6348 	const struct rndis_pktinfo *pi = info_data;
6349 	uint32_t mask = 0;
6350 
6351 	while (info_dlen != 0) {
6352 		const void *data;
6353 		uint32_t dlen;
6354 
6355 		if (__predict_false(info_dlen < sizeof(*pi)))
6356 			return (EINVAL);
6357 		if (__predict_false(info_dlen < pi->rm_size))
6358 			return (EINVAL);
6359 		info_dlen -= pi->rm_size;
6360 
6361 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6362 			return (EINVAL);
6363 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6364 			return (EINVAL);
6365 		dlen = pi->rm_size - pi->rm_pktinfooffset;
6366 		data = pi->rm_data;
6367 
6368 		switch (pi->rm_type) {
6369 		case NDIS_PKTINFO_TYPE_VLAN:
6370 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6371 				return (EINVAL);
6372 			info->vlan_info = *((const uint32_t *)data);
6373 			mask |= HN_RXINFO_VLAN;
6374 			break;
6375 
6376 		case NDIS_PKTINFO_TYPE_CSUM:
6377 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6378 				return (EINVAL);
6379 			info->csum_info = *((const uint32_t *)data);
6380 			mask |= HN_RXINFO_CSUM;
6381 			break;
6382 
6383 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6384 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6385 				return (EINVAL);
6386 			info->hash_value = *((const uint32_t *)data);
6387 			mask |= HN_RXINFO_HASHVAL;
6388 			break;
6389 
6390 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6391 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6392 				return (EINVAL);
6393 			info->hash_info = *((const uint32_t *)data);
6394 			mask |= HN_RXINFO_HASHINF;
6395 			break;
6396 
6397 		default:
6398 			goto next;
6399 		}
6400 
6401 		if (mask == HN_RXINFO_ALL) {
6402 			/* All found; done */
6403 			break;
6404 		}
6405 next:
6406 		pi = (const struct rndis_pktinfo *)
6407 		    ((const uint8_t *)pi + pi->rm_size);
6408 	}
6409 
6410 	/*
6411 	 * Final fixup.
6412 	 * - If there is no hash value, invalidate the hash info.
6413 	 */
6414 	if ((mask & HN_RXINFO_HASHVAL) == 0)
6415 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6416 	return (0);
6417 }
6418 
6419 static __inline bool
6420 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6421 {
6422 
6423 	if (off < check_off) {
6424 		if (__predict_true(off + len <= check_off))
6425 			return (false);
6426 	} else if (off > check_off) {
6427 		if (__predict_true(check_off + check_len <= off))
6428 			return (false);
6429 	}
6430 	return (true);
6431 }
6432 
6433 static void
6434 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6435 {
6436 	const struct rndis_packet_msg *pkt;
6437 	struct hn_rxinfo info;
6438 	int data_off, pktinfo_off, data_len, pktinfo_len;
6439 
6440 	/*
6441 	 * Check length.
6442 	 */
6443 	if (__predict_false(dlen < sizeof(*pkt))) {
6444 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6445 		return;
6446 	}
6447 	pkt = data;
6448 
6449 	if (__predict_false(dlen < pkt->rm_len)) {
6450 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6451 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6452 		return;
6453 	}
6454 	if (__predict_false(pkt->rm_len <
6455 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6456 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6457 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
6458 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6459 		    pkt->rm_pktinfolen);
6460 		return;
6461 	}
6462 	if (__predict_false(pkt->rm_datalen == 0)) {
6463 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6464 		return;
6465 	}
6466 
6467 	/*
6468 	 * Check offests.
6469 	 */
6470 #define IS_OFFSET_INVALID(ofs)			\
6471 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
6472 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6473 
6474 	/* XXX Hyper-V does not meet data offset alignment requirement */
6475 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6476 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6477 		    "data offset %u\n", pkt->rm_dataoffset);
6478 		return;
6479 	}
6480 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6481 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6482 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6483 		    "oob offset %u\n", pkt->rm_oobdataoffset);
6484 		return;
6485 	}
6486 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6487 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6488 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6489 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6490 		return;
6491 	}
6492 
6493 #undef IS_OFFSET_INVALID
6494 
6495 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6496 	data_len = pkt->rm_datalen;
6497 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6498 	pktinfo_len = pkt->rm_pktinfolen;
6499 
6500 	/*
6501 	 * Check OOB coverage.
6502 	 */
6503 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
6504 		int oob_off, oob_len;
6505 
6506 		if_printf(rxr->hn_ifp, "got oobdata\n");
6507 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6508 		oob_len = pkt->rm_oobdatalen;
6509 
6510 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6511 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6512 			    "oob overflow, msglen %u, oob abs %d len %d\n",
6513 			    pkt->rm_len, oob_off, oob_len);
6514 			return;
6515 		}
6516 
6517 		/*
6518 		 * Check against data.
6519 		 */
6520 		if (hn_rndis_check_overlap(oob_off, oob_len,
6521 		    data_off, data_len)) {
6522 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6523 			    "oob overlaps data, oob abs %d len %d, "
6524 			    "data abs %d len %d\n",
6525 			    oob_off, oob_len, data_off, data_len);
6526 			return;
6527 		}
6528 
6529 		/*
6530 		 * Check against pktinfo.
6531 		 */
6532 		if (pktinfo_len != 0 &&
6533 		    hn_rndis_check_overlap(oob_off, oob_len,
6534 		    pktinfo_off, pktinfo_len)) {
6535 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6536 			    "oob overlaps pktinfo, oob abs %d len %d, "
6537 			    "pktinfo abs %d len %d\n",
6538 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
6539 			return;
6540 		}
6541 	}
6542 
6543 	/*
6544 	 * Check per-packet-info coverage and find useful per-packet-info.
6545 	 */
6546 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6547 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6548 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6549 	if (__predict_true(pktinfo_len != 0)) {
6550 		bool overlap;
6551 		int error;
6552 
6553 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6554 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6555 			    "pktinfo overflow, msglen %u, "
6556 			    "pktinfo abs %d len %d\n",
6557 			    pkt->rm_len, pktinfo_off, pktinfo_len);
6558 			return;
6559 		}
6560 
6561 		/*
6562 		 * Check packet info coverage.
6563 		 */
6564 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
6565 		    data_off, data_len);
6566 		if (__predict_false(overlap)) {
6567 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6568 			    "pktinfo overlap data, pktinfo abs %d len %d, "
6569 			    "data abs %d len %d\n",
6570 			    pktinfo_off, pktinfo_len, data_off, data_len);
6571 			return;
6572 		}
6573 
6574 		/*
6575 		 * Find useful per-packet-info.
6576 		 */
6577 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
6578 		    pktinfo_len, &info);
6579 		if (__predict_false(error)) {
6580 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
6581 			    "pktinfo\n");
6582 			return;
6583 		}
6584 	}
6585 
6586 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
6587 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6588 		    "data overflow, msglen %u, data abs %d len %d\n",
6589 		    pkt->rm_len, data_off, data_len);
6590 		return;
6591 	}
6592 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
6593 }
6594 
6595 static __inline void
6596 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
6597 {
6598 	const struct rndis_msghdr *hdr;
6599 
6600 	if (__predict_false(dlen < sizeof(*hdr))) {
6601 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
6602 		return;
6603 	}
6604 	hdr = data;
6605 
6606 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
6607 		/* Hot data path. */
6608 		hn_rndis_rx_data(rxr, data, dlen);
6609 		/* Done! */
6610 		return;
6611 	}
6612 
6613 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
6614 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
6615 	else
6616 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
6617 }
6618 
6619 static void
6620 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
6621 {
6622 	const struct hn_nvs_hdr *hdr;
6623 
6624 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
6625 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
6626 		return;
6627 	}
6628 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
6629 
6630 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
6631 		/* Useless; ignore */
6632 		return;
6633 	}
6634 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
6635 }
6636 
6637 static void
6638 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
6639     const struct vmbus_chanpkt_hdr *pkt)
6640 {
6641 	struct hn_nvs_sendctx *sndc;
6642 
6643 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
6644 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
6645 	    VMBUS_CHANPKT_DATALEN(pkt));
6646 	/*
6647 	 * NOTE:
6648 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
6649 	 * its callback.
6650 	 */
6651 }
6652 
6653 static void
6654 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6655     const struct vmbus_chanpkt_hdr *pkthdr)
6656 {
6657 	const struct vmbus_chanpkt_rxbuf *pkt;
6658 	const struct hn_nvs_hdr *nvs_hdr;
6659 	int count, i, hlen;
6660 
6661 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
6662 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
6663 		return;
6664 	}
6665 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
6666 
6667 	/* Make sure that this is a RNDIS message. */
6668 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
6669 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
6670 		    nvs_hdr->nvs_type);
6671 		return;
6672 	}
6673 
6674 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
6675 	if (__predict_false(hlen < sizeof(*pkt))) {
6676 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
6677 		return;
6678 	}
6679 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
6680 
6681 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
6682 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
6683 		    pkt->cp_rxbuf_id);
6684 		return;
6685 	}
6686 
6687 	count = pkt->cp_rxbuf_cnt;
6688 	if (__predict_false(hlen <
6689 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
6690 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
6691 		return;
6692 	}
6693 
6694 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
6695 	for (i = 0; i < count; ++i) {
6696 		int ofs, len;
6697 
6698 		ofs = pkt->cp_rxbuf[i].rb_ofs;
6699 		len = pkt->cp_rxbuf[i].rb_len;
6700 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
6701 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
6702 			    "ofs %d, len %d\n", i, ofs, len);
6703 			continue;
6704 		}
6705 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
6706 	}
6707 
6708 	/*
6709 	 * Ack the consumed RXBUF associated w/ this channel packet,
6710 	 * so that this RXBUF can be recycled by the hypervisor.
6711 	 */
6712 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
6713 }
6714 
6715 static void
6716 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6717     uint64_t tid)
6718 {
6719 	struct hn_nvs_rndis_ack ack;
6720 	int retries, error;
6721 
6722 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
6723 	ack.nvs_status = HN_NVS_STATUS_OK;
6724 
6725 	retries = 0;
6726 again:
6727 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6728 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6729 	if (__predict_false(error == EAGAIN)) {
6730 		/*
6731 		 * NOTE:
6732 		 * This should _not_ happen in real world, since the
6733 		 * consumption of the TX bufring from the TX path is
6734 		 * controlled.
6735 		 */
6736 		if (rxr->hn_ack_failed == 0)
6737 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6738 		rxr->hn_ack_failed++;
6739 		retries++;
6740 		if (retries < 10) {
6741 			DELAY(100);
6742 			goto again;
6743 		}
6744 		/* RXBUF leaks! */
6745 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
6746 	}
6747 }
6748 
6749 static void
6750 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6751 {
6752 	struct hn_rx_ring *rxr = xrxr;
6753 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
6754 
6755 	for (;;) {
6756 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6757 		int error, pktlen;
6758 
6759 		pktlen = rxr->hn_pktbuf_len;
6760 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6761 		if (__predict_false(error == ENOBUFS)) {
6762 			void *nbuf;
6763 			int nlen;
6764 
6765 			/*
6766 			 * Expand channel packet buffer.
6767 			 *
6768 			 * XXX
6769 			 * Use M_WAITOK here, since allocation failure
6770 			 * is fatal.
6771 			 */
6772 			nlen = rxr->hn_pktbuf_len * 2;
6773 			while (nlen < pktlen)
6774 				nlen *= 2;
6775 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6776 
6777 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6778 			    rxr->hn_pktbuf_len, nlen);
6779 
6780 			free(rxr->hn_pktbuf, M_DEVBUF);
6781 			rxr->hn_pktbuf = nbuf;
6782 			rxr->hn_pktbuf_len = nlen;
6783 			/* Retry! */
6784 			continue;
6785 		} else if (__predict_false(error == EAGAIN)) {
6786 			/* No more channel packets; done! */
6787 			break;
6788 		}
6789 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6790 
6791 		switch (pkt->cph_type) {
6792 		case VMBUS_CHANPKT_TYPE_COMP:
6793 			hn_nvs_handle_comp(sc, chan, pkt);
6794 			break;
6795 
6796 		case VMBUS_CHANPKT_TYPE_RXBUF:
6797 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
6798 			break;
6799 
6800 		case VMBUS_CHANPKT_TYPE_INBAND:
6801 			hn_nvs_handle_notify(sc, pkt);
6802 			break;
6803 
6804 		default:
6805 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6806 			    pkt->cph_type);
6807 			break;
6808 		}
6809 	}
6810 	hn_chan_rollup(rxr, rxr->hn_txr);
6811 }
6812 
6813 static void
6814 hn_sysinit(void *arg __unused)
6815 {
6816 	int i;
6817 
6818 #ifdef HN_IFSTART_SUPPORT
6819 	/*
6820 	 * Don't use ifnet.if_start if transparent VF mode is requested;
6821 	 * mainly due to the IFF_DRV_OACTIVE flag.
6822 	 */
6823 	if (hn_xpnt_vf && hn_use_if_start) {
6824 		hn_use_if_start = 0;
6825 		printf("hn: tranparent VF mode, if_transmit will be used, "
6826 		    "instead of if_start\n");
6827 	}
6828 #endif
6829 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
6830 		printf("hn: invalid transparent VF attach routing "
6831 		    "wait timeout %d, reset to %d\n",
6832 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
6833 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
6834 	}
6835 
6836 	/*
6837 	 * Initialize VF map.
6838 	 */
6839 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6840 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6841 	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6842 	    M_WAITOK | M_ZERO);
6843 
6844 	/*
6845 	 * Fix the # of TX taskqueues.
6846 	 */
6847 	if (hn_tx_taskq_cnt <= 0)
6848 		hn_tx_taskq_cnt = 1;
6849 	else if (hn_tx_taskq_cnt > mp_ncpus)
6850 		hn_tx_taskq_cnt = mp_ncpus;
6851 
6852 	/*
6853 	 * Fix the TX taskqueue mode.
6854 	 */
6855 	switch (hn_tx_taskq_mode) {
6856 	case HN_TX_TASKQ_M_INDEP:
6857 	case HN_TX_TASKQ_M_GLOBAL:
6858 	case HN_TX_TASKQ_M_EVTTQ:
6859 		break;
6860 	default:
6861 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6862 		break;
6863 	}
6864 
6865 	if (vm_guest != VM_GUEST_HV)
6866 		return;
6867 
6868 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6869 		return;
6870 
6871 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6872 	    M_DEVBUF, M_WAITOK);
6873 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6874 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6875 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6876 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6877 		    "hn tx%d", i);
6878 	}
6879 }
6880 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6881 
6882 static void
6883 hn_sysuninit(void *arg __unused)
6884 {
6885 
6886 	if (hn_tx_taskque != NULL) {
6887 		int i;
6888 
6889 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
6890 			taskqueue_free(hn_tx_taskque[i]);
6891 		free(hn_tx_taskque, M_DEVBUF);
6892 	}
6893 
6894 	if (hn_vfmap != NULL)
6895 		free(hn_vfmap, M_DEVBUF);
6896 	rm_destroy(&hn_vfmap_lock);
6897 }
6898 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
6899