xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 4ca50eab86aec8001978a612a42f9ae8388eceab)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_hn.h"
59 #include "opt_inet6.h"
60 #include "opt_inet.h"
61 #include "opt_rss.h"
62 
63 #include <sys/param.h>
64 #include <sys/bus.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/smp.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
75 #include <sys/sx.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
80 #include <sys/eventhandler.h>
81 
82 #include <machine/atomic.h>
83 #include <machine/in_cksum.h>
84 
85 #include <net/bpf.h>
86 #include <net/ethernet.h>
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_media.h>
90 #include <net/if_types.h>
91 #include <net/if_var.h>
92 #include <net/rndis.h>
93 #ifdef RSS
94 #include <net/rss_config.h>
95 #endif
96 
97 #include <netinet/in_systm.h>
98 #include <netinet/in.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip6.h>
101 #include <netinet/tcp.h>
102 #include <netinet/tcp_lro.h>
103 #include <netinet/udp.h>
104 
105 #include <dev/hyperv/include/hyperv.h>
106 #include <dev/hyperv/include/hyperv_busdma.h>
107 #include <dev/hyperv/include/vmbus.h>
108 #include <dev/hyperv/include/vmbus_xact.h>
109 
110 #include <dev/hyperv/netvsc/ndis.h>
111 #include <dev/hyperv/netvsc/if_hnreg.h>
112 #include <dev/hyperv/netvsc/if_hnvar.h>
113 #include <dev/hyperv/netvsc/hn_nvs.h>
114 #include <dev/hyperv/netvsc/hn_rndis.h>
115 
116 #include "vmbus_if.h"
117 
118 #define HN_IFSTART_SUPPORT
119 
120 #define HN_RING_CNT_DEF_MAX		8
121 
122 /* YYY should get it from the underlying channel */
123 #define HN_TX_DESC_CNT			512
124 
125 #define HN_RNDIS_PKT_LEN					\
126 	(sizeof(struct rndis_packet_msg) +			\
127 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
128 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
129 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
130 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
131 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
132 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
133 
134 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
135 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
136 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
137 /* -1 for RNDIS packet message */
138 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
139 
140 #define HN_DIRECT_TX_SIZE_DEF		128
141 
142 #define HN_EARLY_TXEOF_THRESH		8
143 
144 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
145 
146 #define HN_LROENT_CNT_DEF		128
147 
148 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
149 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
150 /* YYY 2*MTU is a bit rough, but should be good enough. */
151 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
152 
153 #define HN_LRO_ACKCNT_DEF		1
154 
155 #define HN_LOCK_INIT(sc)		\
156 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
157 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
158 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
159 #define HN_LOCK(sc)					\
160 do {							\
161 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
162 		DELAY(1000);				\
163 } while (0)
164 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
165 
166 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
167 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
168 #define HN_CSUM_IP_HWASSIST(sc)		\
169 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
170 #define HN_CSUM_IP6_HWASSIST(sc)	\
171 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
172 
173 #define HN_PKTSIZE_MIN(align)		\
174 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
175 	    HN_RNDIS_PKT_LEN, (align))
176 #define HN_PKTSIZE(m, align)		\
177 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
178 
179 #ifdef RSS
180 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
181 #else
182 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
183 #endif
184 
185 struct hn_txdesc {
186 #ifndef HN_USE_TXDESC_BUFRING
187 	SLIST_ENTRY(hn_txdesc)		link;
188 #endif
189 	STAILQ_ENTRY(hn_txdesc)		agg_link;
190 
191 	/* Aggregated txdescs, in sending order. */
192 	STAILQ_HEAD(, hn_txdesc)	agg_list;
193 
194 	/* The oldest packet, if transmission aggregation happens. */
195 	struct mbuf			*m;
196 	struct hn_tx_ring		*txr;
197 	int				refs;
198 	uint32_t			flags;	/* HN_TXD_FLAG_ */
199 	struct hn_nvs_sendctx		send_ctx;
200 	uint32_t			chim_index;
201 	int				chim_size;
202 
203 	bus_dmamap_t			data_dmap;
204 
205 	bus_addr_t			rndis_pkt_paddr;
206 	struct rndis_packet_msg		*rndis_pkt;
207 	bus_dmamap_t			rndis_pkt_dmap;
208 };
209 
210 #define HN_TXD_FLAG_ONLIST		0x0001
211 #define HN_TXD_FLAG_DMAMAP		0x0002
212 #define HN_TXD_FLAG_ONAGG		0x0004
213 
214 struct hn_rxinfo {
215 	uint32_t			vlan_info;
216 	uint32_t			csum_info;
217 	uint32_t			hash_info;
218 	uint32_t			hash_value;
219 };
220 
221 struct hn_update_vf {
222 	struct hn_rx_ring	*rxr;
223 	struct ifnet		*vf;
224 };
225 
226 #define HN_RXINFO_VLAN			0x0001
227 #define HN_RXINFO_CSUM			0x0002
228 #define HN_RXINFO_HASHINF		0x0004
229 #define HN_RXINFO_HASHVAL		0x0008
230 #define HN_RXINFO_ALL			\
231 	(HN_RXINFO_VLAN |		\
232 	 HN_RXINFO_CSUM |		\
233 	 HN_RXINFO_HASHINF |		\
234 	 HN_RXINFO_HASHVAL)
235 
236 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
237 #define HN_NDIS_RXCSUM_INFO_INVALID	0
238 #define HN_NDIS_HASH_INFO_INVALID	0
239 
240 static int			hn_probe(device_t);
241 static int			hn_attach(device_t);
242 static int			hn_detach(device_t);
243 static int			hn_shutdown(device_t);
244 static void			hn_chan_callback(struct vmbus_channel *,
245 				    void *);
246 
247 static void			hn_init(void *);
248 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
249 #ifdef HN_IFSTART_SUPPORT
250 static void			hn_start(struct ifnet *);
251 #endif
252 static int			hn_transmit(struct ifnet *, struct mbuf *);
253 static void			hn_xmit_qflush(struct ifnet *);
254 static int			hn_ifmedia_upd(struct ifnet *);
255 static void			hn_ifmedia_sts(struct ifnet *,
256 				    struct ifmediareq *);
257 
258 static int			hn_rndis_rxinfo(const void *, int,
259 				    struct hn_rxinfo *);
260 static void			hn_rndis_rx_data(struct hn_rx_ring *,
261 				    const void *, int);
262 static void			hn_rndis_rx_status(struct hn_softc *,
263 				    const void *, int);
264 static void			hn_rndis_init_fixat(struct hn_softc *, int);
265 
266 static void			hn_nvs_handle_notify(struct hn_softc *,
267 				    const struct vmbus_chanpkt_hdr *);
268 static void			hn_nvs_handle_comp(struct hn_softc *,
269 				    struct vmbus_channel *,
270 				    const struct vmbus_chanpkt_hdr *);
271 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
272 				    struct vmbus_channel *,
273 				    const struct vmbus_chanpkt_hdr *);
274 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
275 				    struct vmbus_channel *, uint64_t);
276 
277 #if __FreeBSD_version >= 1100099
278 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
279 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
280 #endif
281 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
283 #if __FreeBSD_version < 1100095
284 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
285 #else
286 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
287 #endif
288 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
289 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
290 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
291 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
292 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
293 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
294 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
295 #ifndef RSS
296 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
297 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
298 #endif
299 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
300 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
301 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
302 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
303 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
304 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
305 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
306 
307 static void			hn_stop(struct hn_softc *, bool);
308 static void			hn_init_locked(struct hn_softc *);
309 static int			hn_chan_attach(struct hn_softc *,
310 				    struct vmbus_channel *);
311 static void			hn_chan_detach(struct hn_softc *,
312 				    struct vmbus_channel *);
313 static int			hn_attach_subchans(struct hn_softc *);
314 static void			hn_detach_allchans(struct hn_softc *);
315 static void			hn_chan_rollup(struct hn_rx_ring *,
316 				    struct hn_tx_ring *);
317 static void			hn_set_ring_inuse(struct hn_softc *, int);
318 static int			hn_synth_attach(struct hn_softc *, int);
319 static void			hn_synth_detach(struct hn_softc *);
320 static int			hn_synth_alloc_subchans(struct hn_softc *,
321 				    int *);
322 static bool			hn_synth_attachable(const struct hn_softc *);
323 static void			hn_suspend(struct hn_softc *);
324 static void			hn_suspend_data(struct hn_softc *);
325 static void			hn_suspend_mgmt(struct hn_softc *);
326 static void			hn_resume(struct hn_softc *);
327 static void			hn_resume_data(struct hn_softc *);
328 static void			hn_resume_mgmt(struct hn_softc *);
329 static void			hn_suspend_mgmt_taskfunc(void *, int);
330 static void			hn_chan_drain(struct hn_softc *,
331 				    struct vmbus_channel *);
332 static void			hn_disable_rx(struct hn_softc *);
333 static void			hn_drain_rxtx(struct hn_softc *, int);
334 static void			hn_polling(struct hn_softc *, u_int);
335 static void			hn_chan_polling(struct vmbus_channel *, u_int);
336 
337 static void			hn_update_link_status(struct hn_softc *);
338 static void			hn_change_network(struct hn_softc *);
339 static void			hn_link_taskfunc(void *, int);
340 static void			hn_netchg_init_taskfunc(void *, int);
341 static void			hn_netchg_status_taskfunc(void *, int);
342 static void			hn_link_status(struct hn_softc *);
343 
344 static int			hn_create_rx_data(struct hn_softc *, int);
345 static void			hn_destroy_rx_data(struct hn_softc *);
346 static int			hn_check_iplen(const struct mbuf *, int);
347 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
348 static int			hn_rxfilter_config(struct hn_softc *);
349 #ifndef RSS
350 static int			hn_rss_reconfig(struct hn_softc *);
351 #endif
352 static void			hn_rss_ind_fixup(struct hn_softc *);
353 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
354 				    int, const struct hn_rxinfo *);
355 
356 static int			hn_tx_ring_create(struct hn_softc *, int);
357 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
358 static int			hn_create_tx_data(struct hn_softc *, int);
359 static void			hn_fixup_tx_data(struct hn_softc *);
360 static void			hn_destroy_tx_data(struct hn_softc *);
361 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
362 static void			hn_txdesc_gc(struct hn_tx_ring *,
363 				    struct hn_txdesc *);
364 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
365 				    struct hn_txdesc *, struct mbuf **);
366 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
367 				    struct hn_txdesc *);
368 static void			hn_set_chim_size(struct hn_softc *, int);
369 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
370 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
371 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
372 static void			hn_resume_tx(struct hn_softc *, int);
373 static void			hn_set_txagg(struct hn_softc *);
374 static void			*hn_try_txagg(struct ifnet *,
375 				    struct hn_tx_ring *, struct hn_txdesc *,
376 				    int);
377 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
378 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
379 				    struct hn_softc *, struct vmbus_channel *,
380 				    const void *, int);
381 static int			hn_txpkt_sglist(struct hn_tx_ring *,
382 				    struct hn_txdesc *);
383 static int			hn_txpkt_chim(struct hn_tx_ring *,
384 				    struct hn_txdesc *);
385 static int			hn_xmit(struct hn_tx_ring *, int);
386 static void			hn_xmit_taskfunc(void *, int);
387 static void			hn_xmit_txeof(struct hn_tx_ring *);
388 static void			hn_xmit_txeof_taskfunc(void *, int);
389 #ifdef HN_IFSTART_SUPPORT
390 static int			hn_start_locked(struct hn_tx_ring *, int);
391 static void			hn_start_taskfunc(void *, int);
392 static void			hn_start_txeof(struct hn_tx_ring *);
393 static void			hn_start_txeof_taskfunc(void *, int);
394 #endif
395 
396 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
397     "Hyper-V network interface");
398 
399 /* Trust tcp segements verification on host side. */
400 static int			hn_trust_hosttcp = 1;
401 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
402     &hn_trust_hosttcp, 0,
403     "Trust tcp segement verification on host side, "
404     "when csum info is missing (global setting)");
405 
406 /* Trust udp datagrams verification on host side. */
407 static int			hn_trust_hostudp = 1;
408 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
409     &hn_trust_hostudp, 0,
410     "Trust udp datagram verification on host side, "
411     "when csum info is missing (global setting)");
412 
413 /* Trust ip packets verification on host side. */
414 static int			hn_trust_hostip = 1;
415 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
416     &hn_trust_hostip, 0,
417     "Trust ip packet verification on host side, "
418     "when csum info is missing (global setting)");
419 
420 /* Limit TSO burst size */
421 static int			hn_tso_maxlen = IP_MAXPACKET;
422 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
423     &hn_tso_maxlen, 0, "TSO burst limit");
424 
425 /* Limit chimney send size */
426 static int			hn_tx_chimney_size = 0;
427 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
428     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
429 
430 /* Limit the size of packet for direct transmission */
431 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
432 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
433     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
434 
435 /* # of LRO entries per RX ring */
436 #if defined(INET) || defined(INET6)
437 #if __FreeBSD_version >= 1100095
438 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
439 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
440     &hn_lro_entry_count, 0, "LRO entry count");
441 #endif
442 #endif
443 
444 static int			hn_tx_taskq_cnt = 1;
445 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
446     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
447 
448 #define HN_TX_TASKQ_M_INDEP	0
449 #define HN_TX_TASKQ_M_GLOBAL	1
450 #define HN_TX_TASKQ_M_EVTTQ	2
451 
452 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
453 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
454     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
455     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
456 
457 #ifndef HN_USE_TXDESC_BUFRING
458 static int			hn_use_txdesc_bufring = 0;
459 #else
460 static int			hn_use_txdesc_bufring = 1;
461 #endif
462 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
463     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
464 
465 #ifdef HN_IFSTART_SUPPORT
466 /* Use ifnet.if_start instead of ifnet.if_transmit */
467 static int			hn_use_if_start = 0;
468 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
469     &hn_use_if_start, 0, "Use if_start TX method");
470 #endif
471 
472 /* # of channels to use */
473 static int			hn_chan_cnt = 0;
474 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
475     &hn_chan_cnt, 0,
476     "# of channels to use; each channel has one RX ring and one TX ring");
477 
478 /* # of transmit rings to use */
479 static int			hn_tx_ring_cnt = 0;
480 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
481     &hn_tx_ring_cnt, 0, "# of TX rings to use");
482 
483 /* Software TX ring deptch */
484 static int			hn_tx_swq_depth = 0;
485 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
486     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
487 
488 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
489 #if __FreeBSD_version >= 1100095
490 static u_int			hn_lro_mbufq_depth = 0;
491 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
492     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
493 #endif
494 
495 /* Packet transmission aggregation size limit */
496 static int			hn_tx_agg_size = -1;
497 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
498     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
499 
500 /* Packet transmission aggregation count limit */
501 static int			hn_tx_agg_pkts = -1;
502 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
503     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
504 
505 static u_int			hn_cpu_index;	/* next CPU for channel */
506 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
507 
508 #ifndef RSS
509 static const uint8_t
510 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
511 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
512 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
513 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
514 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
515 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
516 };
517 #endif	/* !RSS */
518 
519 static device_method_t hn_methods[] = {
520 	/* Device interface */
521 	DEVMETHOD(device_probe,		hn_probe),
522 	DEVMETHOD(device_attach,	hn_attach),
523 	DEVMETHOD(device_detach,	hn_detach),
524 	DEVMETHOD(device_shutdown,	hn_shutdown),
525 	DEVMETHOD_END
526 };
527 
528 static driver_t hn_driver = {
529 	"hn",
530 	hn_methods,
531 	sizeof(struct hn_softc)
532 };
533 
534 static devclass_t hn_devclass;
535 
536 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
537 MODULE_VERSION(hn, 1);
538 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
539 
540 #if __FreeBSD_version >= 1100099
541 static void
542 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
543 {
544 	int i;
545 
546 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
547 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
548 }
549 #endif
550 
551 static int
552 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
553 {
554 
555 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
556 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
557 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
558 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
559 }
560 
561 static int
562 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
563 {
564 	struct hn_nvs_rndis rndis;
565 
566 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
567 	    txd->chim_size > 0, ("invalid rndis chim txd"));
568 
569 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
570 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
571 	rndis.nvs_chim_idx = txd->chim_index;
572 	rndis.nvs_chim_sz = txd->chim_size;
573 
574 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
575 	    &rndis, sizeof(rndis), &txd->send_ctx));
576 }
577 
578 static __inline uint32_t
579 hn_chim_alloc(struct hn_softc *sc)
580 {
581 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
582 	u_long *bmap = sc->hn_chim_bmap;
583 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
584 
585 	for (i = 0; i < bmap_cnt; ++i) {
586 		int idx;
587 
588 		idx = ffsl(~bmap[i]);
589 		if (idx == 0)
590 			continue;
591 
592 		--idx; /* ffsl is 1-based */
593 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
594 		    ("invalid i %d and idx %d", i, idx));
595 
596 		if (atomic_testandset_long(&bmap[i], idx))
597 			continue;
598 
599 		ret = i * LONG_BIT + idx;
600 		break;
601 	}
602 	return (ret);
603 }
604 
605 static __inline void
606 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
607 {
608 	u_long mask;
609 	uint32_t idx;
610 
611 	idx = chim_idx / LONG_BIT;
612 	KASSERT(idx < sc->hn_chim_bmap_cnt,
613 	    ("invalid chimney index 0x%x", chim_idx));
614 
615 	mask = 1UL << (chim_idx % LONG_BIT);
616 	KASSERT(sc->hn_chim_bmap[idx] & mask,
617 	    ("index bitmap 0x%lx, chimney index %u, "
618 	     "bitmap idx %d, bitmask 0x%lx",
619 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
620 
621 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
622 }
623 
624 #if defined(INET6) || defined(INET)
625 /*
626  * NOTE: If this function failed, the m_head would be freed.
627  */
628 static __inline struct mbuf *
629 hn_tso_fixup(struct mbuf *m_head)
630 {
631 	struct ether_vlan_header *evl;
632 	struct tcphdr *th;
633 	int ehlen;
634 
635 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
636 
637 #define PULLUP_HDR(m, len)				\
638 do {							\
639 	if (__predict_false((m)->m_len < (len))) {	\
640 		(m) = m_pullup((m), (len));		\
641 		if ((m) == NULL)			\
642 			return (NULL);			\
643 	}						\
644 } while (0)
645 
646 	PULLUP_HDR(m_head, sizeof(*evl));
647 	evl = mtod(m_head, struct ether_vlan_header *);
648 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
649 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
650 	else
651 		ehlen = ETHER_HDR_LEN;
652 
653 #ifdef INET
654 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
655 		struct ip *ip;
656 		int iphlen;
657 
658 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
659 		ip = mtodo(m_head, ehlen);
660 		iphlen = ip->ip_hl << 2;
661 
662 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
663 		th = mtodo(m_head, ehlen + iphlen);
664 
665 		ip->ip_len = 0;
666 		ip->ip_sum = 0;
667 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
668 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
669 	}
670 #endif
671 #if defined(INET6) && defined(INET)
672 	else
673 #endif
674 #ifdef INET6
675 	{
676 		struct ip6_hdr *ip6;
677 
678 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
679 		ip6 = mtodo(m_head, ehlen);
680 		if (ip6->ip6_nxt != IPPROTO_TCP) {
681 			m_freem(m_head);
682 			return (NULL);
683 		}
684 
685 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
686 		th = mtodo(m_head, ehlen + sizeof(*ip6));
687 
688 		ip6->ip6_plen = 0;
689 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
690 	}
691 #endif
692 	return (m_head);
693 
694 #undef PULLUP_HDR
695 }
696 #endif	/* INET6 || INET */
697 
698 static int
699 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
700 {
701 	int error = 0;
702 
703 	HN_LOCK_ASSERT(sc);
704 
705 	if (sc->hn_rx_filter != filter) {
706 		error = hn_rndis_set_rxfilter(sc, filter);
707 		if (!error)
708 			sc->hn_rx_filter = filter;
709 	}
710 	return (error);
711 }
712 
713 static int
714 hn_rxfilter_config(struct hn_softc *sc)
715 {
716 	struct ifnet *ifp = sc->hn_ifp;
717 	uint32_t filter;
718 
719 	HN_LOCK_ASSERT(sc);
720 
721 	if ((ifp->if_flags & IFF_PROMISC) ||
722 	    (sc->hn_flags & HN_FLAG_VF)) {
723 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
724 	} else {
725 		filter = NDIS_PACKET_TYPE_DIRECTED;
726 		if (ifp->if_flags & IFF_BROADCAST)
727 			filter |= NDIS_PACKET_TYPE_BROADCAST;
728 		/* TODO: support multicast list */
729 		if ((ifp->if_flags & IFF_ALLMULTI) ||
730 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
731 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
732 	}
733 	return (hn_set_rxfilter(sc, filter));
734 }
735 
736 static void
737 hn_set_txagg(struct hn_softc *sc)
738 {
739 	uint32_t size, pkts;
740 	int i;
741 
742 	/*
743 	 * Setup aggregation size.
744 	 */
745 	if (sc->hn_agg_size < 0)
746 		size = UINT32_MAX;
747 	else
748 		size = sc->hn_agg_size;
749 
750 	if (sc->hn_rndis_agg_size < size)
751 		size = sc->hn_rndis_agg_size;
752 
753 	/* NOTE: We only aggregate packets using chimney sending buffers. */
754 	if (size > (uint32_t)sc->hn_chim_szmax)
755 		size = sc->hn_chim_szmax;
756 
757 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
758 		/* Disable */
759 		size = 0;
760 		pkts = 0;
761 		goto done;
762 	}
763 
764 	/* NOTE: Type of the per TX ring setting is 'int'. */
765 	if (size > INT_MAX)
766 		size = INT_MAX;
767 
768 	/*
769 	 * Setup aggregation packet count.
770 	 */
771 	if (sc->hn_agg_pkts < 0)
772 		pkts = UINT32_MAX;
773 	else
774 		pkts = sc->hn_agg_pkts;
775 
776 	if (sc->hn_rndis_agg_pkts < pkts)
777 		pkts = sc->hn_rndis_agg_pkts;
778 
779 	if (pkts <= 1) {
780 		/* Disable */
781 		size = 0;
782 		pkts = 0;
783 		goto done;
784 	}
785 
786 	/* NOTE: Type of the per TX ring setting is 'short'. */
787 	if (pkts > SHRT_MAX)
788 		pkts = SHRT_MAX;
789 
790 done:
791 	/* NOTE: Type of the per TX ring setting is 'short'. */
792 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
793 		/* Disable */
794 		size = 0;
795 		pkts = 0;
796 	}
797 
798 	if (bootverbose) {
799 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
800 		    size, pkts, sc->hn_rndis_agg_align);
801 	}
802 
803 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
804 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
805 
806 		mtx_lock(&txr->hn_tx_lock);
807 		txr->hn_agg_szmax = size;
808 		txr->hn_agg_pktmax = pkts;
809 		txr->hn_agg_align = sc->hn_rndis_agg_align;
810 		mtx_unlock(&txr->hn_tx_lock);
811 	}
812 }
813 
814 static int
815 hn_get_txswq_depth(const struct hn_tx_ring *txr)
816 {
817 
818 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
819 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
820 		return txr->hn_txdesc_cnt;
821 	return hn_tx_swq_depth;
822 }
823 
824 #ifndef RSS
825 static int
826 hn_rss_reconfig(struct hn_softc *sc)
827 {
828 	int error;
829 
830 	HN_LOCK_ASSERT(sc);
831 
832 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
833 		return (ENXIO);
834 
835 	/*
836 	 * Disable RSS first.
837 	 *
838 	 * NOTE:
839 	 * Direct reconfiguration by setting the UNCHG flags does
840 	 * _not_ work properly.
841 	 */
842 	if (bootverbose)
843 		if_printf(sc->hn_ifp, "disable RSS\n");
844 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
845 	if (error) {
846 		if_printf(sc->hn_ifp, "RSS disable failed\n");
847 		return (error);
848 	}
849 
850 	/*
851 	 * Reenable the RSS w/ the updated RSS key or indirect
852 	 * table.
853 	 */
854 	if (bootverbose)
855 		if_printf(sc->hn_ifp, "reconfig RSS\n");
856 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
857 	if (error) {
858 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
859 		return (error);
860 	}
861 	return (0);
862 }
863 #endif	/* !RSS */
864 
865 static void
866 hn_rss_ind_fixup(struct hn_softc *sc)
867 {
868 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
869 	int i, nchan;
870 
871 	nchan = sc->hn_rx_ring_inuse;
872 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
873 
874 	/*
875 	 * Check indirect table to make sure that all channels in it
876 	 * can be used.
877 	 */
878 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
879 		if (rss->rss_ind[i] >= nchan) {
880 			if_printf(sc->hn_ifp,
881 			    "RSS indirect table %d fixup: %u -> %d\n",
882 			    i, rss->rss_ind[i], nchan - 1);
883 			rss->rss_ind[i] = nchan - 1;
884 		}
885 	}
886 }
887 
888 static int
889 hn_ifmedia_upd(struct ifnet *ifp __unused)
890 {
891 
892 	return EOPNOTSUPP;
893 }
894 
895 static void
896 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
897 {
898 	struct hn_softc *sc = ifp->if_softc;
899 
900 	ifmr->ifm_status = IFM_AVALID;
901 	ifmr->ifm_active = IFM_ETHER;
902 
903 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
904 		ifmr->ifm_active |= IFM_NONE;
905 		return;
906 	}
907 	ifmr->ifm_status |= IFM_ACTIVE;
908 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
909 }
910 
911 static void
912 hn_update_vf_task(void *arg, int pending __unused)
913 {
914 	struct hn_update_vf *uv = arg;
915 
916 	uv->rxr->hn_vf = uv->vf;
917 }
918 
919 static void
920 hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
921 {
922 	struct hn_rx_ring *rxr;
923 	struct hn_update_vf uv;
924 	struct task task;
925 	int i;
926 
927 	HN_LOCK_ASSERT(sc);
928 
929 	TASK_INIT(&task, 0, hn_update_vf_task, &uv);
930 
931 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
932 		rxr = &sc->hn_rx_ring[i];
933 
934 		if (i < sc->hn_rx_ring_inuse) {
935 			uv.rxr = rxr;
936 			uv.vf = vf;
937 			vmbus_chan_run_task(rxr->hn_chan, &task);
938 		} else {
939 			rxr->hn_vf = vf;
940 		}
941 	}
942 }
943 
944 static void
945 hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
946 {
947 	struct ifnet *hn_ifp;
948 
949 	HN_LOCK(sc);
950 
951 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
952 		goto out;
953 
954 	hn_ifp = sc->hn_ifp;
955 
956 	if (ifp == hn_ifp)
957 		goto out;
958 
959 	if (ifp->if_alloctype != IFT_ETHER)
960 		goto out;
961 
962 	/* Ignore lagg/vlan interfaces */
963 	if (strcmp(ifp->if_dname, "lagg") == 0 ||
964 	    strcmp(ifp->if_dname, "vlan") == 0)
965 		goto out;
966 
967 	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
968 		goto out;
969 
970 	/* Now we're sure 'ifp' is a real VF device. */
971 	if (vf) {
972 		if (sc->hn_flags & HN_FLAG_VF)
973 			goto out;
974 
975 		sc->hn_flags |= HN_FLAG_VF;
976 		hn_rxfilter_config(sc);
977 	} else {
978 		if (!(sc->hn_flags & HN_FLAG_VF))
979 			goto out;
980 
981 		sc->hn_flags &= ~HN_FLAG_VF;
982 		if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
983 			hn_rxfilter_config(sc);
984 		else
985 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
986 	}
987 
988 	hn_nvs_set_datapath(sc,
989 	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
990 
991 	hn_update_vf(sc, vf ? ifp : NULL);
992 
993 	if (vf) {
994 		hn_suspend_mgmt(sc);
995 		sc->hn_link_flags &=
996 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
997 		if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
998 	} else {
999 		hn_resume_mgmt(sc);
1000 	}
1001 
1002 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1003 	    vf ? "VF_UP" : "VF_DOWN", NULL);
1004 
1005 	if (bootverbose)
1006 		if_printf(hn_ifp, "Data path is switched %s %s\n",
1007 		    vf ? "to" : "from", if_name(ifp));
1008 out:
1009 	HN_UNLOCK(sc);
1010 }
1011 
1012 static void
1013 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1014 {
1015 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1016 		return;
1017 
1018 	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
1019 }
1020 
1021 static void
1022 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1023 {
1024 	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
1025 }
1026 
1027 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
1028 static const struct hyperv_guid g_net_vsc_device_type = {
1029 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
1030 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
1031 };
1032 
1033 static int
1034 hn_probe(device_t dev)
1035 {
1036 
1037 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1038 	    &g_net_vsc_device_type) == 0) {
1039 		device_set_desc(dev, "Hyper-V Network Interface");
1040 		return BUS_PROBE_DEFAULT;
1041 	}
1042 	return ENXIO;
1043 }
1044 
1045 static int
1046 hn_attach(device_t dev)
1047 {
1048 	struct hn_softc *sc = device_get_softc(dev);
1049 	struct sysctl_oid_list *child;
1050 	struct sysctl_ctx_list *ctx;
1051 	uint8_t eaddr[ETHER_ADDR_LEN];
1052 	struct ifnet *ifp = NULL;
1053 	int error, ring_cnt, tx_ring_cnt;
1054 
1055 	sc->hn_dev = dev;
1056 	sc->hn_prichan = vmbus_get_channel(dev);
1057 	HN_LOCK_INIT(sc);
1058 
1059 	/*
1060 	 * Initialize these tunables once.
1061 	 */
1062 	sc->hn_agg_size = hn_tx_agg_size;
1063 	sc->hn_agg_pkts = hn_tx_agg_pkts;
1064 
1065 	/*
1066 	 * Setup taskqueue for transmission.
1067 	 */
1068 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
1069 		int i;
1070 
1071 		sc->hn_tx_taskqs =
1072 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
1073 		    M_DEVBUF, M_WAITOK);
1074 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
1075 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
1076 			    M_WAITOK, taskqueue_thread_enqueue,
1077 			    &sc->hn_tx_taskqs[i]);
1078 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
1079 			    "%s tx%d", device_get_nameunit(dev), i);
1080 		}
1081 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
1082 		sc->hn_tx_taskqs = hn_tx_taskque;
1083 	}
1084 
1085 	/*
1086 	 * Setup taskqueue for mangement tasks, e.g. link status.
1087 	 */
1088 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
1089 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
1090 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
1091 	    device_get_nameunit(dev));
1092 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
1093 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
1094 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
1095 	    hn_netchg_status_taskfunc, sc);
1096 
1097 	/*
1098 	 * Allocate ifnet and setup its name earlier, so that if_printf
1099 	 * can be used by functions, which will be called after
1100 	 * ether_ifattach().
1101 	 */
1102 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
1103 	ifp->if_softc = sc;
1104 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1105 
1106 	/*
1107 	 * Initialize ifmedia earlier so that it can be unconditionally
1108 	 * destroyed, if error happened later on.
1109 	 */
1110 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
1111 
1112 	/*
1113 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
1114 	 * to use (tx_ring_cnt).
1115 	 *
1116 	 * NOTE:
1117 	 * The # of RX rings to use is same as the # of channels to use.
1118 	 */
1119 	ring_cnt = hn_chan_cnt;
1120 	if (ring_cnt <= 0) {
1121 		/* Default */
1122 		ring_cnt = mp_ncpus;
1123 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
1124 			ring_cnt = HN_RING_CNT_DEF_MAX;
1125 	} else if (ring_cnt > mp_ncpus) {
1126 		ring_cnt = mp_ncpus;
1127 	}
1128 #ifdef RSS
1129 	if (ring_cnt > rss_getnumbuckets())
1130 		ring_cnt = rss_getnumbuckets();
1131 #endif
1132 
1133 	tx_ring_cnt = hn_tx_ring_cnt;
1134 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
1135 		tx_ring_cnt = ring_cnt;
1136 #ifdef HN_IFSTART_SUPPORT
1137 	if (hn_use_if_start) {
1138 		/* ifnet.if_start only needs one TX ring. */
1139 		tx_ring_cnt = 1;
1140 	}
1141 #endif
1142 
1143 	/*
1144 	 * Set the leader CPU for channels.
1145 	 */
1146 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1147 
1148 	/*
1149 	 * Create enough TX/RX rings, even if only limited number of
1150 	 * channels can be allocated.
1151 	 */
1152 	error = hn_create_tx_data(sc, tx_ring_cnt);
1153 	if (error)
1154 		goto failed;
1155 	error = hn_create_rx_data(sc, ring_cnt);
1156 	if (error)
1157 		goto failed;
1158 
1159 	/*
1160 	 * Create transaction context for NVS and RNDIS transactions.
1161 	 */
1162 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1163 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1164 	if (sc->hn_xact == NULL) {
1165 		error = ENXIO;
1166 		goto failed;
1167 	}
1168 
1169 	/*
1170 	 * Install orphan handler for the revocation of this device's
1171 	 * primary channel.
1172 	 *
1173 	 * NOTE:
1174 	 * The processing order is critical here:
1175 	 * Install the orphan handler, _before_ testing whether this
1176 	 * device's primary channel has been revoked or not.
1177 	 */
1178 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1179 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1180 		error = ENXIO;
1181 		goto failed;
1182 	}
1183 
1184 	/*
1185 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1186 	 */
1187 	error = hn_synth_attach(sc, ETHERMTU);
1188 	if (error)
1189 		goto failed;
1190 
1191 	error = hn_rndis_get_eaddr(sc, eaddr);
1192 	if (error)
1193 		goto failed;
1194 
1195 #if __FreeBSD_version >= 1100099
1196 	if (sc->hn_rx_ring_inuse > 1) {
1197 		/*
1198 		 * Reduce TCP segment aggregation limit for multiple
1199 		 * RX rings to increase ACK timeliness.
1200 		 */
1201 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1202 	}
1203 #endif
1204 
1205 	/*
1206 	 * Fixup TX stuffs after synthetic parts are attached.
1207 	 */
1208 	hn_fixup_tx_data(sc);
1209 
1210 	ctx = device_get_sysctl_ctx(dev);
1211 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1212 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1213 	    &sc->hn_nvs_ver, 0, "NVS version");
1214 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1215 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1216 	    hn_ndis_version_sysctl, "A", "NDIS version");
1217 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1218 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1219 	    hn_caps_sysctl, "A", "capabilities");
1220 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1221 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1222 	    hn_hwassist_sysctl, "A", "hwassist");
1223 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1224 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1225 	    hn_rxfilter_sysctl, "A", "rxfilter");
1226 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1227 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1228 	    hn_rss_hash_sysctl, "A", "RSS hash");
1229 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1230 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1231 #ifndef RSS
1232 	/*
1233 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
1234 	 */
1235 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1236 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1237 	    hn_rss_key_sysctl, "IU", "RSS key");
1238 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1239 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1240 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1241 #endif
1242 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1243 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1244 	    "RNDIS offered packet transmission aggregation size limit");
1245 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1246 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1247 	    "RNDIS offered packet transmission aggregation count limit");
1248 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1249 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1250 	    "RNDIS packet transmission aggregation alignment");
1251 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1252 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1253 	    hn_txagg_size_sysctl, "I",
1254 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1255 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1256 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1257 	    hn_txagg_pkts_sysctl, "I",
1258 	    "Packet transmission aggregation packets, "
1259 	    "0 -- disable, -1 -- auto");
1260 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1261 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1262 	    hn_polling_sysctl, "I",
1263 	    "Polling frequency: [100,1000000], 0 disable polling");
1264 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
1265 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1266 	    hn_vf_sysctl, "A", "Virtual Function's name");
1267 
1268 	/*
1269 	 * Setup the ifmedia, which has been initialized earlier.
1270 	 */
1271 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1272 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1273 	/* XXX ifmedia_set really should do this for us */
1274 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1275 
1276 	/*
1277 	 * Setup the ifnet for this interface.
1278 	 */
1279 
1280 	ifp->if_baudrate = IF_Gbps(10);
1281 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1282 	ifp->if_ioctl = hn_ioctl;
1283 	ifp->if_init = hn_init;
1284 #ifdef HN_IFSTART_SUPPORT
1285 	if (hn_use_if_start) {
1286 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1287 
1288 		ifp->if_start = hn_start;
1289 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1290 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1291 		IFQ_SET_READY(&ifp->if_snd);
1292 	} else
1293 #endif
1294 	{
1295 		ifp->if_transmit = hn_transmit;
1296 		ifp->if_qflush = hn_xmit_qflush;
1297 	}
1298 
1299 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1300 #ifdef foo
1301 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
1302 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1303 #endif
1304 	if (sc->hn_caps & HN_CAP_VLAN) {
1305 		/* XXX not sure about VLAN_MTU. */
1306 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1307 	}
1308 
1309 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1310 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1311 		ifp->if_capabilities |= IFCAP_TXCSUM;
1312 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1313 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1314 	if (sc->hn_caps & HN_CAP_TSO4) {
1315 		ifp->if_capabilities |= IFCAP_TSO4;
1316 		ifp->if_hwassist |= CSUM_IP_TSO;
1317 	}
1318 	if (sc->hn_caps & HN_CAP_TSO6) {
1319 		ifp->if_capabilities |= IFCAP_TSO6;
1320 		ifp->if_hwassist |= CSUM_IP6_TSO;
1321 	}
1322 
1323 	/* Enable all available capabilities by default. */
1324 	ifp->if_capenable = ifp->if_capabilities;
1325 
1326 	/*
1327 	 * Disable IPv6 TSO and TXCSUM by default, they still can
1328 	 * be enabled through SIOCSIFCAP.
1329 	 */
1330 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1331 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1332 
1333 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1334 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1335 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1336 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1337 	}
1338 
1339 	ether_ifattach(ifp, eaddr);
1340 
1341 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1342 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1343 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1344 	}
1345 
1346 	/* Inform the upper layer about the long frame support. */
1347 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1348 
1349 	/*
1350 	 * Kick off link status check.
1351 	 */
1352 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1353 	hn_update_link_status(sc);
1354 
1355 	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
1356 	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
1357 
1358 	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
1359 	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
1360 
1361 	return (0);
1362 failed:
1363 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1364 		hn_synth_detach(sc);
1365 	hn_detach(dev);
1366 	return (error);
1367 }
1368 
1369 static int
1370 hn_detach(device_t dev)
1371 {
1372 	struct hn_softc *sc = device_get_softc(dev);
1373 	struct ifnet *ifp = sc->hn_ifp;
1374 
1375 	if (sc->hn_ifaddr_evthand != NULL)
1376 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
1377 	if (sc->hn_ifnet_evthand != NULL)
1378 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
1379 
1380 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1381 		/*
1382 		 * In case that the vmbus missed the orphan handler
1383 		 * installation.
1384 		 */
1385 		vmbus_xact_ctx_orphan(sc->hn_xact);
1386 	}
1387 
1388 	if (device_is_attached(dev)) {
1389 		HN_LOCK(sc);
1390 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1391 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1392 				hn_stop(sc, true);
1393 			/*
1394 			 * NOTE:
1395 			 * hn_stop() only suspends data, so managment
1396 			 * stuffs have to be suspended manually here.
1397 			 */
1398 			hn_suspend_mgmt(sc);
1399 			hn_synth_detach(sc);
1400 		}
1401 		HN_UNLOCK(sc);
1402 		ether_ifdetach(ifp);
1403 	}
1404 
1405 	ifmedia_removeall(&sc->hn_media);
1406 	hn_destroy_rx_data(sc);
1407 	hn_destroy_tx_data(sc);
1408 
1409 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1410 		int i;
1411 
1412 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
1413 			taskqueue_free(sc->hn_tx_taskqs[i]);
1414 		free(sc->hn_tx_taskqs, M_DEVBUF);
1415 	}
1416 	taskqueue_free(sc->hn_mgmt_taskq0);
1417 
1418 	if (sc->hn_xact != NULL) {
1419 		/*
1420 		 * Uninstall the orphan handler _before_ the xact is
1421 		 * destructed.
1422 		 */
1423 		vmbus_chan_unset_orphan(sc->hn_prichan);
1424 		vmbus_xact_ctx_destroy(sc->hn_xact);
1425 	}
1426 
1427 	if_free(ifp);
1428 
1429 	HN_LOCK_DESTROY(sc);
1430 	return (0);
1431 }
1432 
1433 static int
1434 hn_shutdown(device_t dev)
1435 {
1436 
1437 	return (0);
1438 }
1439 
1440 static void
1441 hn_link_status(struct hn_softc *sc)
1442 {
1443 	uint32_t link_status;
1444 	int error;
1445 
1446 	error = hn_rndis_get_linkstatus(sc, &link_status);
1447 	if (error) {
1448 		/* XXX what to do? */
1449 		return;
1450 	}
1451 
1452 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1453 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1454 	else
1455 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1456 	if_link_state_change(sc->hn_ifp,
1457 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1458 	    LINK_STATE_UP : LINK_STATE_DOWN);
1459 }
1460 
1461 static void
1462 hn_link_taskfunc(void *xsc, int pending __unused)
1463 {
1464 	struct hn_softc *sc = xsc;
1465 
1466 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1467 		return;
1468 	hn_link_status(sc);
1469 }
1470 
1471 static void
1472 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1473 {
1474 	struct hn_softc *sc = xsc;
1475 
1476 	/* Prevent any link status checks from running. */
1477 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1478 
1479 	/*
1480 	 * Fake up a [link down --> link up] state change; 5 seconds
1481 	 * delay is used, which closely simulates miibus reaction
1482 	 * upon link down event.
1483 	 */
1484 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1485 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1486 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1487 	    &sc->hn_netchg_status, 5 * hz);
1488 }
1489 
1490 static void
1491 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1492 {
1493 	struct hn_softc *sc = xsc;
1494 
1495 	/* Re-allow link status checks. */
1496 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1497 	hn_link_status(sc);
1498 }
1499 
1500 static void
1501 hn_update_link_status(struct hn_softc *sc)
1502 {
1503 
1504 	if (sc->hn_mgmt_taskq != NULL)
1505 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1506 }
1507 
1508 static void
1509 hn_change_network(struct hn_softc *sc)
1510 {
1511 
1512 	if (sc->hn_mgmt_taskq != NULL)
1513 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1514 }
1515 
1516 static __inline int
1517 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1518     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1519 {
1520 	struct mbuf *m = *m_head;
1521 	int error;
1522 
1523 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1524 
1525 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1526 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1527 	if (error == EFBIG) {
1528 		struct mbuf *m_new;
1529 
1530 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1531 		if (m_new == NULL)
1532 			return ENOBUFS;
1533 		else
1534 			*m_head = m = m_new;
1535 		txr->hn_tx_collapsed++;
1536 
1537 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1538 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1539 	}
1540 	if (!error) {
1541 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1542 		    BUS_DMASYNC_PREWRITE);
1543 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1544 	}
1545 	return error;
1546 }
1547 
1548 static __inline int
1549 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1550 {
1551 
1552 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1553 	    ("put an onlist txd %#x", txd->flags));
1554 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1555 	    ("put an onagg txd %#x", txd->flags));
1556 
1557 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1558 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1559 		return 0;
1560 
1561 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1562 		struct hn_txdesc *tmp_txd;
1563 
1564 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1565 			int freed;
1566 
1567 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1568 			    ("resursive aggregation on aggregated txdesc"));
1569 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1570 			    ("not aggregated txdesc"));
1571 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1572 			    ("aggregated txdesc uses dmamap"));
1573 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1574 			    ("aggregated txdesc consumes "
1575 			     "chimney sending buffer"));
1576 			KASSERT(tmp_txd->chim_size == 0,
1577 			    ("aggregated txdesc has non-zero "
1578 			     "chimney sending size"));
1579 
1580 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1581 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1582 			freed = hn_txdesc_put(txr, tmp_txd);
1583 			KASSERT(freed, ("failed to free aggregated txdesc"));
1584 		}
1585 	}
1586 
1587 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1588 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1589 		    ("chim txd uses dmamap"));
1590 		hn_chim_free(txr->hn_sc, txd->chim_index);
1591 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1592 		txd->chim_size = 0;
1593 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1594 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1595 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1596 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1597 		    txd->data_dmap);
1598 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1599 	}
1600 
1601 	if (txd->m != NULL) {
1602 		m_freem(txd->m);
1603 		txd->m = NULL;
1604 	}
1605 
1606 	txd->flags |= HN_TXD_FLAG_ONLIST;
1607 #ifndef HN_USE_TXDESC_BUFRING
1608 	mtx_lock_spin(&txr->hn_txlist_spin);
1609 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1610 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1611 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1612 	txr->hn_txdesc_avail++;
1613 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1614 	mtx_unlock_spin(&txr->hn_txlist_spin);
1615 #else	/* HN_USE_TXDESC_BUFRING */
1616 #ifdef HN_DEBUG
1617 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1618 #endif
1619 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1620 #endif	/* !HN_USE_TXDESC_BUFRING */
1621 
1622 	return 1;
1623 }
1624 
1625 static __inline struct hn_txdesc *
1626 hn_txdesc_get(struct hn_tx_ring *txr)
1627 {
1628 	struct hn_txdesc *txd;
1629 
1630 #ifndef HN_USE_TXDESC_BUFRING
1631 	mtx_lock_spin(&txr->hn_txlist_spin);
1632 	txd = SLIST_FIRST(&txr->hn_txlist);
1633 	if (txd != NULL) {
1634 		KASSERT(txr->hn_txdesc_avail > 0,
1635 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1636 		txr->hn_txdesc_avail--;
1637 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1638 	}
1639 	mtx_unlock_spin(&txr->hn_txlist_spin);
1640 #else
1641 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1642 #endif
1643 
1644 	if (txd != NULL) {
1645 #ifdef HN_USE_TXDESC_BUFRING
1646 #ifdef HN_DEBUG
1647 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1648 #endif
1649 #endif	/* HN_USE_TXDESC_BUFRING */
1650 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1651 		    STAILQ_EMPTY(&txd->agg_list) &&
1652 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1653 		    txd->chim_size == 0 &&
1654 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1655 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1656 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1657 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1658 		txd->refs = 1;
1659 	}
1660 	return txd;
1661 }
1662 
1663 static __inline void
1664 hn_txdesc_hold(struct hn_txdesc *txd)
1665 {
1666 
1667 	/* 0->1 transition will never work */
1668 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1669 	atomic_add_int(&txd->refs, 1);
1670 }
1671 
1672 static __inline void
1673 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1674 {
1675 
1676 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1677 	    ("recursive aggregation on aggregating txdesc"));
1678 
1679 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1680 	    ("already aggregated"));
1681 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1682 	    ("recursive aggregation on to-be-aggregated txdesc"));
1683 
1684 	txd->flags |= HN_TXD_FLAG_ONAGG;
1685 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1686 }
1687 
1688 static bool
1689 hn_tx_ring_pending(struct hn_tx_ring *txr)
1690 {
1691 	bool pending = false;
1692 
1693 #ifndef HN_USE_TXDESC_BUFRING
1694 	mtx_lock_spin(&txr->hn_txlist_spin);
1695 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1696 		pending = true;
1697 	mtx_unlock_spin(&txr->hn_txlist_spin);
1698 #else
1699 	if (!buf_ring_full(txr->hn_txdesc_br))
1700 		pending = true;
1701 #endif
1702 	return (pending);
1703 }
1704 
1705 static __inline void
1706 hn_txeof(struct hn_tx_ring *txr)
1707 {
1708 	txr->hn_has_txeof = 0;
1709 	txr->hn_txeof(txr);
1710 }
1711 
1712 static void
1713 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1714     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1715 {
1716 	struct hn_txdesc *txd = sndc->hn_cbarg;
1717 	struct hn_tx_ring *txr;
1718 
1719 	txr = txd->txr;
1720 	KASSERT(txr->hn_chan == chan,
1721 	    ("channel mismatch, on chan%u, should be chan%u",
1722 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1723 
1724 	txr->hn_has_txeof = 1;
1725 	hn_txdesc_put(txr, txd);
1726 
1727 	++txr->hn_txdone_cnt;
1728 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1729 		txr->hn_txdone_cnt = 0;
1730 		if (txr->hn_oactive)
1731 			hn_txeof(txr);
1732 	}
1733 }
1734 
1735 static void
1736 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1737 {
1738 #if defined(INET) || defined(INET6)
1739 	tcp_lro_flush_all(&rxr->hn_lro);
1740 #endif
1741 
1742 	/*
1743 	 * NOTE:
1744 	 * 'txr' could be NULL, if multiple channels and
1745 	 * ifnet.if_start method are enabled.
1746 	 */
1747 	if (txr == NULL || !txr->hn_has_txeof)
1748 		return;
1749 
1750 	txr->hn_txdone_cnt = 0;
1751 	hn_txeof(txr);
1752 }
1753 
1754 static __inline uint32_t
1755 hn_rndis_pktmsg_offset(uint32_t ofs)
1756 {
1757 
1758 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1759 	    ("invalid RNDIS packet msg offset %u", ofs));
1760 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1761 }
1762 
1763 static __inline void *
1764 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1765     size_t pi_dlen, uint32_t pi_type)
1766 {
1767 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1768 	struct rndis_pktinfo *pi;
1769 
1770 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1771 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1772 
1773 	/*
1774 	 * Per-packet-info does not move; it only grows.
1775 	 *
1776 	 * NOTE:
1777 	 * rm_pktinfooffset in this phase counts from the beginning
1778 	 * of rndis_packet_msg.
1779 	 */
1780 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1781 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1782 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1783 	    pkt->rm_pktinfolen);
1784 	pkt->rm_pktinfolen += pi_size;
1785 
1786 	pi->rm_size = pi_size;
1787 	pi->rm_type = pi_type;
1788 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1789 
1790 	return (pi->rm_data);
1791 }
1792 
1793 static __inline int
1794 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1795 {
1796 	struct hn_txdesc *txd;
1797 	struct mbuf *m;
1798 	int error, pkts;
1799 
1800 	txd = txr->hn_agg_txd;
1801 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1802 
1803 	/*
1804 	 * Since hn_txpkt() will reset this temporary stat, save
1805 	 * it now, so that oerrors can be updated properly, if
1806 	 * hn_txpkt() ever fails.
1807 	 */
1808 	pkts = txr->hn_stat_pkts;
1809 
1810 	/*
1811 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1812 	 * failure, save it for later freeing, if hn_txpkt() ever
1813 	 * fails.
1814 	 */
1815 	m = txd->m;
1816 	error = hn_txpkt(ifp, txr, txd);
1817 	if (__predict_false(error)) {
1818 		/* txd is freed, but m is not. */
1819 		m_freem(m);
1820 
1821 		txr->hn_flush_failed++;
1822 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1823 	}
1824 
1825 	/* Reset all aggregation states. */
1826 	txr->hn_agg_txd = NULL;
1827 	txr->hn_agg_szleft = 0;
1828 	txr->hn_agg_pktleft = 0;
1829 	txr->hn_agg_prevpkt = NULL;
1830 
1831 	return (error);
1832 }
1833 
1834 static void *
1835 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1836     int pktsize)
1837 {
1838 	void *chim;
1839 
1840 	if (txr->hn_agg_txd != NULL) {
1841 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1842 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1843 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1844 			int olen;
1845 
1846 			/*
1847 			 * Update the previous RNDIS packet's total length,
1848 			 * it can be increased due to the mandatory alignment
1849 			 * padding for this RNDIS packet.  And update the
1850 			 * aggregating txdesc's chimney sending buffer size
1851 			 * accordingly.
1852 			 *
1853 			 * XXX
1854 			 * Zero-out the padding, as required by the RNDIS spec.
1855 			 */
1856 			olen = pkt->rm_len;
1857 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1858 			agg_txd->chim_size += pkt->rm_len - olen;
1859 
1860 			/* Link this txdesc to the parent. */
1861 			hn_txdesc_agg(agg_txd, txd);
1862 
1863 			chim = (uint8_t *)pkt + pkt->rm_len;
1864 			/* Save the current packet for later fixup. */
1865 			txr->hn_agg_prevpkt = chim;
1866 
1867 			txr->hn_agg_pktleft--;
1868 			txr->hn_agg_szleft -= pktsize;
1869 			if (txr->hn_agg_szleft <=
1870 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1871 				/*
1872 				 * Probably can't aggregate more packets,
1873 				 * flush this aggregating txdesc proactively.
1874 				 */
1875 				txr->hn_agg_pktleft = 0;
1876 			}
1877 			/* Done! */
1878 			return (chim);
1879 		}
1880 		hn_flush_txagg(ifp, txr);
1881 	}
1882 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1883 
1884 	txr->hn_tx_chimney_tried++;
1885 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1886 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1887 		return (NULL);
1888 	txr->hn_tx_chimney++;
1889 
1890 	chim = txr->hn_sc->hn_chim +
1891 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1892 
1893 	if (txr->hn_agg_pktmax > 1 &&
1894 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1895 		txr->hn_agg_txd = txd;
1896 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1897 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1898 		txr->hn_agg_prevpkt = chim;
1899 	}
1900 	return (chim);
1901 }
1902 
1903 /*
1904  * NOTE:
1905  * If this function fails, then both txd and m_head0 will be freed.
1906  */
1907 static int
1908 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1909     struct mbuf **m_head0)
1910 {
1911 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1912 	int error, nsegs, i;
1913 	struct mbuf *m_head = *m_head0;
1914 	struct rndis_packet_msg *pkt;
1915 	uint32_t *pi_data;
1916 	void *chim = NULL;
1917 	int pkt_hlen, pkt_size;
1918 
1919 	pkt = txd->rndis_pkt;
1920 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1921 	if (pkt_size < txr->hn_chim_size) {
1922 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1923 		if (chim != NULL)
1924 			pkt = chim;
1925 	} else {
1926 		if (txr->hn_agg_txd != NULL)
1927 			hn_flush_txagg(ifp, txr);
1928 	}
1929 
1930 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1931 	pkt->rm_len = m_head->m_pkthdr.len;
1932 	pkt->rm_dataoffset = 0;
1933 	pkt->rm_datalen = m_head->m_pkthdr.len;
1934 	pkt->rm_oobdataoffset = 0;
1935 	pkt->rm_oobdatalen = 0;
1936 	pkt->rm_oobdataelements = 0;
1937 	pkt->rm_pktinfooffset = sizeof(*pkt);
1938 	pkt->rm_pktinfolen = 0;
1939 	pkt->rm_vchandle = 0;
1940 	pkt->rm_reserved = 0;
1941 
1942 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1943 		/*
1944 		 * Set the hash value for this packet, so that the host could
1945 		 * dispatch the TX done event for this packet back to this TX
1946 		 * ring's channel.
1947 		 */
1948 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1949 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1950 		*pi_data = txr->hn_tx_idx;
1951 	}
1952 
1953 	if (m_head->m_flags & M_VLANTAG) {
1954 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1955 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1956 		*pi_data = NDIS_VLAN_INFO_MAKE(
1957 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1958 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1959 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1960 	}
1961 
1962 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1963 #if defined(INET6) || defined(INET)
1964 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1965 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1966 #ifdef INET
1967 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1968 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1969 			    m_head->m_pkthdr.tso_segsz);
1970 		}
1971 #endif
1972 #if defined(INET6) && defined(INET)
1973 		else
1974 #endif
1975 #ifdef INET6
1976 		{
1977 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1978 			    m_head->m_pkthdr.tso_segsz);
1979 		}
1980 #endif
1981 #endif	/* INET6 || INET */
1982 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1983 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1984 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1985 		if (m_head->m_pkthdr.csum_flags &
1986 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1987 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1988 		} else {
1989 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1990 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1991 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1992 		}
1993 
1994 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1995 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1996 		else if (m_head->m_pkthdr.csum_flags &
1997 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1998 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1999 	}
2000 
2001 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2002 	/* Fixup RNDIS packet message total length */
2003 	pkt->rm_len += pkt_hlen;
2004 	/* Convert RNDIS packet message offsets */
2005 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2006 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2007 
2008 	/*
2009 	 * Fast path: Chimney sending.
2010 	 */
2011 	if (chim != NULL) {
2012 		struct hn_txdesc *tgt_txd = txd;
2013 
2014 		if (txr->hn_agg_txd != NULL) {
2015 			tgt_txd = txr->hn_agg_txd;
2016 #ifdef INVARIANTS
2017 			*m_head0 = NULL;
2018 #endif
2019 		}
2020 
2021 		KASSERT(pkt == chim,
2022 		    ("RNDIS pkt not in chimney sending buffer"));
2023 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2024 		    ("chimney sending buffer is not used"));
2025 		tgt_txd->chim_size += pkt->rm_len;
2026 
2027 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
2028 		    ((uint8_t *)chim) + pkt_hlen);
2029 
2030 		txr->hn_gpa_cnt = 0;
2031 		txr->hn_sendpkt = hn_txpkt_chim;
2032 		goto done;
2033 	}
2034 
2035 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2036 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2037 	    ("chimney buffer is used"));
2038 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2039 
2040 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2041 	if (__predict_false(error)) {
2042 		int freed;
2043 
2044 		/*
2045 		 * This mbuf is not linked w/ the txd yet, so free it now.
2046 		 */
2047 		m_freem(m_head);
2048 		*m_head0 = NULL;
2049 
2050 		freed = hn_txdesc_put(txr, txd);
2051 		KASSERT(freed != 0,
2052 		    ("fail to free txd upon txdma error"));
2053 
2054 		txr->hn_txdma_failed++;
2055 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2056 		return error;
2057 	}
2058 	*m_head0 = m_head;
2059 
2060 	/* +1 RNDIS packet message */
2061 	txr->hn_gpa_cnt = nsegs + 1;
2062 
2063 	/* send packet with page buffer */
2064 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2065 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2066 	txr->hn_gpa[0].gpa_len = pkt_hlen;
2067 
2068 	/*
2069 	 * Fill the page buffers with mbuf info after the page
2070 	 * buffer for RNDIS packet message.
2071 	 */
2072 	for (i = 0; i < nsegs; ++i) {
2073 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2074 
2075 		gpa->gpa_page = atop(segs[i].ds_addr);
2076 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2077 		gpa->gpa_len = segs[i].ds_len;
2078 	}
2079 
2080 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2081 	txd->chim_size = 0;
2082 	txr->hn_sendpkt = hn_txpkt_sglist;
2083 done:
2084 	txd->m = m_head;
2085 
2086 	/* Set the completion routine */
2087 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2088 
2089 	/* Update temporary stats for later use. */
2090 	txr->hn_stat_pkts++;
2091 	txr->hn_stat_size += m_head->m_pkthdr.len;
2092 	if (m_head->m_flags & M_MCAST)
2093 		txr->hn_stat_mcasts++;
2094 
2095 	return 0;
2096 }
2097 
2098 /*
2099  * NOTE:
2100  * If this function fails, then txd will be freed, but the mbuf
2101  * associated w/ the txd will _not_ be freed.
2102  */
2103 static int
2104 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2105 {
2106 	int error, send_failed = 0, has_bpf;
2107 
2108 again:
2109 	has_bpf = bpf_peers_present(ifp->if_bpf);
2110 	if (has_bpf) {
2111 		/*
2112 		 * Make sure that this txd and any aggregated txds are not
2113 		 * freed before ETHER_BPF_MTAP.
2114 		 */
2115 		hn_txdesc_hold(txd);
2116 	}
2117 	error = txr->hn_sendpkt(txr, txd);
2118 	if (!error) {
2119 		if (has_bpf) {
2120 			const struct hn_txdesc *tmp_txd;
2121 
2122 			ETHER_BPF_MTAP(ifp, txd->m);
2123 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2124 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
2125 		}
2126 
2127 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2128 #ifdef HN_IFSTART_SUPPORT
2129 		if (!hn_use_if_start)
2130 #endif
2131 		{
2132 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
2133 			    txr->hn_stat_size);
2134 			if (txr->hn_stat_mcasts != 0) {
2135 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2136 				    txr->hn_stat_mcasts);
2137 			}
2138 		}
2139 		txr->hn_pkts += txr->hn_stat_pkts;
2140 		txr->hn_sends++;
2141 	}
2142 	if (has_bpf)
2143 		hn_txdesc_put(txr, txd);
2144 
2145 	if (__predict_false(error)) {
2146 		int freed;
2147 
2148 		/*
2149 		 * This should "really rarely" happen.
2150 		 *
2151 		 * XXX Too many RX to be acked or too many sideband
2152 		 * commands to run?  Ask netvsc_channel_rollup()
2153 		 * to kick start later.
2154 		 */
2155 		txr->hn_has_txeof = 1;
2156 		if (!send_failed) {
2157 			txr->hn_send_failed++;
2158 			send_failed = 1;
2159 			/*
2160 			 * Try sending again after set hn_has_txeof;
2161 			 * in case that we missed the last
2162 			 * netvsc_channel_rollup().
2163 			 */
2164 			goto again;
2165 		}
2166 		if_printf(ifp, "send failed\n");
2167 
2168 		/*
2169 		 * Caller will perform further processing on the
2170 		 * associated mbuf, so don't free it in hn_txdesc_put();
2171 		 * only unload it from the DMA map in hn_txdesc_put(),
2172 		 * if it was loaded.
2173 		 */
2174 		txd->m = NULL;
2175 		freed = hn_txdesc_put(txr, txd);
2176 		KASSERT(freed != 0,
2177 		    ("fail to free txd upon send error"));
2178 
2179 		txr->hn_send_failed++;
2180 	}
2181 
2182 	/* Reset temporary stats, after this sending is done. */
2183 	txr->hn_stat_size = 0;
2184 	txr->hn_stat_pkts = 0;
2185 	txr->hn_stat_mcasts = 0;
2186 
2187 	return (error);
2188 }
2189 
2190 /*
2191  * Append the specified data to the indicated mbuf chain,
2192  * Extend the mbuf chain if the new data does not fit in
2193  * existing space.
2194  *
2195  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2196  * There should be an equivalent in the kernel mbuf code,
2197  * but there does not appear to be one yet.
2198  *
2199  * Differs from m_append() in that additional mbufs are
2200  * allocated with cluster size MJUMPAGESIZE, and filled
2201  * accordingly.
2202  *
2203  * Return 1 if able to complete the job; otherwise 0.
2204  */
2205 static int
2206 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2207 {
2208 	struct mbuf *m, *n;
2209 	int remainder, space;
2210 
2211 	for (m = m0; m->m_next != NULL; m = m->m_next)
2212 		;
2213 	remainder = len;
2214 	space = M_TRAILINGSPACE(m);
2215 	if (space > 0) {
2216 		/*
2217 		 * Copy into available space.
2218 		 */
2219 		if (space > remainder)
2220 			space = remainder;
2221 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2222 		m->m_len += space;
2223 		cp += space;
2224 		remainder -= space;
2225 	}
2226 	while (remainder > 0) {
2227 		/*
2228 		 * Allocate a new mbuf; could check space
2229 		 * and allocate a cluster instead.
2230 		 */
2231 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2232 		if (n == NULL)
2233 			break;
2234 		n->m_len = min(MJUMPAGESIZE, remainder);
2235 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2236 		cp += n->m_len;
2237 		remainder -= n->m_len;
2238 		m->m_next = n;
2239 		m = n;
2240 	}
2241 	if (m0->m_flags & M_PKTHDR)
2242 		m0->m_pkthdr.len += len - remainder;
2243 
2244 	return (remainder == 0);
2245 }
2246 
2247 #if defined(INET) || defined(INET6)
2248 static __inline int
2249 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2250 {
2251 #if __FreeBSD_version >= 1100095
2252 	if (hn_lro_mbufq_depth) {
2253 		tcp_lro_queue_mbuf(lc, m);
2254 		return 0;
2255 	}
2256 #endif
2257 	return tcp_lro_rx(lc, m, 0);
2258 }
2259 #endif
2260 
2261 static int
2262 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2263     const struct hn_rxinfo *info)
2264 {
2265 	struct ifnet *ifp;
2266 	struct mbuf *m_new;
2267 	int size, do_lro = 0, do_csum = 1;
2268 	int hash_type;
2269 
2270 	/* If the VF is active, inject the packet through the VF */
2271 	ifp = rxr->hn_vf ? rxr->hn_vf : rxr->hn_ifp;
2272 
2273 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2274 		/*
2275 		 * NOTE:
2276 		 * See the NOTE of hn_rndis_init_fixat().  This
2277 		 * function can be reached, immediately after the
2278 		 * RNDIS is initialized but before the ifnet is
2279 		 * setup on the hn_attach() path; drop the unexpected
2280 		 * packets.
2281 		 */
2282 		return (0);
2283 	}
2284 
2285 	if (dlen <= MHLEN) {
2286 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2287 		if (m_new == NULL) {
2288 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2289 			return (0);
2290 		}
2291 		memcpy(mtod(m_new, void *), data, dlen);
2292 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2293 		rxr->hn_small_pkts++;
2294 	} else {
2295 		/*
2296 		 * Get an mbuf with a cluster.  For packets 2K or less,
2297 		 * get a standard 2K cluster.  For anything larger, get a
2298 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2299 		 * if looped around to the Hyper-V TX channel, so avoid them.
2300 		 */
2301 		size = MCLBYTES;
2302 		if (dlen > MCLBYTES) {
2303 			/* 4096 */
2304 			size = MJUMPAGESIZE;
2305 		}
2306 
2307 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2308 		if (m_new == NULL) {
2309 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2310 			return (0);
2311 		}
2312 
2313 		hv_m_append(m_new, dlen, data);
2314 	}
2315 	m_new->m_pkthdr.rcvif = ifp;
2316 
2317 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2318 		do_csum = 0;
2319 
2320 	/* receive side checksum offload */
2321 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2322 		/* IP csum offload */
2323 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2324 			m_new->m_pkthdr.csum_flags |=
2325 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2326 			rxr->hn_csum_ip++;
2327 		}
2328 
2329 		/* TCP/UDP csum offload */
2330 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2331 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2332 			m_new->m_pkthdr.csum_flags |=
2333 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2334 			m_new->m_pkthdr.csum_data = 0xffff;
2335 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2336 				rxr->hn_csum_tcp++;
2337 			else
2338 				rxr->hn_csum_udp++;
2339 		}
2340 
2341 		/*
2342 		 * XXX
2343 		 * As of this write (Oct 28th, 2016), host side will turn
2344 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2345 		 * the do_lro setting here is actually _not_ accurate.  We
2346 		 * depend on the RSS hash type check to reset do_lro.
2347 		 */
2348 		if ((info->csum_info &
2349 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2350 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2351 			do_lro = 1;
2352 	} else {
2353 		const struct ether_header *eh;
2354 		uint16_t etype;
2355 		int hoff;
2356 
2357 		hoff = sizeof(*eh);
2358 		if (m_new->m_len < hoff)
2359 			goto skip;
2360 		eh = mtod(m_new, struct ether_header *);
2361 		etype = ntohs(eh->ether_type);
2362 		if (etype == ETHERTYPE_VLAN) {
2363 			const struct ether_vlan_header *evl;
2364 
2365 			hoff = sizeof(*evl);
2366 			if (m_new->m_len < hoff)
2367 				goto skip;
2368 			evl = mtod(m_new, struct ether_vlan_header *);
2369 			etype = ntohs(evl->evl_proto);
2370 		}
2371 
2372 		if (etype == ETHERTYPE_IP) {
2373 			int pr;
2374 
2375 			pr = hn_check_iplen(m_new, hoff);
2376 			if (pr == IPPROTO_TCP) {
2377 				if (do_csum &&
2378 				    (rxr->hn_trust_hcsum &
2379 				     HN_TRUST_HCSUM_TCP)) {
2380 					rxr->hn_csum_trusted++;
2381 					m_new->m_pkthdr.csum_flags |=
2382 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2383 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2384 					m_new->m_pkthdr.csum_data = 0xffff;
2385 				}
2386 				do_lro = 1;
2387 			} else if (pr == IPPROTO_UDP) {
2388 				if (do_csum &&
2389 				    (rxr->hn_trust_hcsum &
2390 				     HN_TRUST_HCSUM_UDP)) {
2391 					rxr->hn_csum_trusted++;
2392 					m_new->m_pkthdr.csum_flags |=
2393 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2394 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2395 					m_new->m_pkthdr.csum_data = 0xffff;
2396 				}
2397 			} else if (pr != IPPROTO_DONE && do_csum &&
2398 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2399 				rxr->hn_csum_trusted++;
2400 				m_new->m_pkthdr.csum_flags |=
2401 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2402 			}
2403 		}
2404 	}
2405 skip:
2406 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2407 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2408 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2409 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2410 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2411 		m_new->m_flags |= M_VLANTAG;
2412 	}
2413 
2414 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2415 		rxr->hn_rss_pkts++;
2416 		m_new->m_pkthdr.flowid = info->hash_value;
2417 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2418 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2419 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2420 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2421 
2422 			/*
2423 			 * NOTE:
2424 			 * do_lro is resetted, if the hash types are not TCP
2425 			 * related.  See the comment in the above csum_flags
2426 			 * setup section.
2427 			 */
2428 			switch (type) {
2429 			case NDIS_HASH_IPV4:
2430 				hash_type = M_HASHTYPE_RSS_IPV4;
2431 				do_lro = 0;
2432 				break;
2433 
2434 			case NDIS_HASH_TCP_IPV4:
2435 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2436 				break;
2437 
2438 			case NDIS_HASH_IPV6:
2439 				hash_type = M_HASHTYPE_RSS_IPV6;
2440 				do_lro = 0;
2441 				break;
2442 
2443 			case NDIS_HASH_IPV6_EX:
2444 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2445 				do_lro = 0;
2446 				break;
2447 
2448 			case NDIS_HASH_TCP_IPV6:
2449 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2450 				break;
2451 
2452 			case NDIS_HASH_TCP_IPV6_EX:
2453 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2454 				break;
2455 			}
2456 		}
2457 	} else {
2458 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2459 		hash_type = M_HASHTYPE_OPAQUE;
2460 	}
2461 	M_HASHTYPE_SET(m_new, hash_type);
2462 
2463 	/*
2464 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2465 	 * messages (not just data messages) will trigger a response.
2466 	 */
2467 
2468 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2469 	rxr->hn_pkts++;
2470 
2471 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2472 #if defined(INET) || defined(INET6)
2473 		struct lro_ctrl *lro = &rxr->hn_lro;
2474 
2475 		if (lro->lro_cnt) {
2476 			rxr->hn_lro_tried++;
2477 			if (hn_lro_rx(lro, m_new) == 0) {
2478 				/* DONE! */
2479 				return 0;
2480 			}
2481 		}
2482 #endif
2483 	}
2484 
2485 	/* We're not holding the lock here, so don't release it */
2486 	(*ifp->if_input)(ifp, m_new);
2487 
2488 	return (0);
2489 }
2490 
2491 static int
2492 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2493 {
2494 	struct hn_softc *sc = ifp->if_softc;
2495 	struct ifreq *ifr = (struct ifreq *)data;
2496 	int mask, error = 0;
2497 
2498 	switch (cmd) {
2499 	case SIOCSIFMTU:
2500 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2501 			error = EINVAL;
2502 			break;
2503 		}
2504 
2505 		HN_LOCK(sc);
2506 
2507 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2508 			HN_UNLOCK(sc);
2509 			break;
2510 		}
2511 
2512 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2513 			/* Can't change MTU */
2514 			HN_UNLOCK(sc);
2515 			error = EOPNOTSUPP;
2516 			break;
2517 		}
2518 
2519 		if (ifp->if_mtu == ifr->ifr_mtu) {
2520 			HN_UNLOCK(sc);
2521 			break;
2522 		}
2523 
2524 		/*
2525 		 * Suspend this interface before the synthetic parts
2526 		 * are ripped.
2527 		 */
2528 		hn_suspend(sc);
2529 
2530 		/*
2531 		 * Detach the synthetics parts, i.e. NVS and RNDIS.
2532 		 */
2533 		hn_synth_detach(sc);
2534 
2535 		/*
2536 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2537 		 * with the new MTU setting.
2538 		 */
2539 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2540 		if (error) {
2541 			HN_UNLOCK(sc);
2542 			break;
2543 		}
2544 
2545 		/*
2546 		 * Commit the requested MTU, after the synthetic parts
2547 		 * have been successfully attached.
2548 		 */
2549 		ifp->if_mtu = ifr->ifr_mtu;
2550 
2551 		/*
2552 		 * Make sure that various parameters based on MTU are
2553 		 * still valid, after the MTU change.
2554 		 */
2555 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2556 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2557 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2558 #if __FreeBSD_version >= 1100099
2559 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2560 		    HN_LRO_LENLIM_MIN(ifp))
2561 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2562 #endif
2563 
2564 		/*
2565 		 * All done!  Resume the interface now.
2566 		 */
2567 		hn_resume(sc);
2568 
2569 		HN_UNLOCK(sc);
2570 		break;
2571 
2572 	case SIOCSIFFLAGS:
2573 		HN_LOCK(sc);
2574 
2575 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2576 			HN_UNLOCK(sc);
2577 			break;
2578 		}
2579 
2580 		if (ifp->if_flags & IFF_UP) {
2581 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2582 				/*
2583 				 * Caller meight hold mutex, e.g.
2584 				 * bpf; use busy-wait for the RNDIS
2585 				 * reply.
2586 				 */
2587 				HN_NO_SLEEPING(sc);
2588 				hn_rxfilter_config(sc);
2589 				HN_SLEEPING_OK(sc);
2590 			} else {
2591 				hn_init_locked(sc);
2592 			}
2593 		} else {
2594 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2595 				hn_stop(sc, false);
2596 		}
2597 		sc->hn_if_flags = ifp->if_flags;
2598 
2599 		HN_UNLOCK(sc);
2600 		break;
2601 
2602 	case SIOCSIFCAP:
2603 		HN_LOCK(sc);
2604 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2605 
2606 		if (mask & IFCAP_TXCSUM) {
2607 			ifp->if_capenable ^= IFCAP_TXCSUM;
2608 			if (ifp->if_capenable & IFCAP_TXCSUM)
2609 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2610 			else
2611 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2612 		}
2613 		if (mask & IFCAP_TXCSUM_IPV6) {
2614 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2615 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2616 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2617 			else
2618 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2619 		}
2620 
2621 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2622 		if (mask & IFCAP_RXCSUM)
2623 			ifp->if_capenable ^= IFCAP_RXCSUM;
2624 #ifdef foo
2625 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2626 		if (mask & IFCAP_RXCSUM_IPV6)
2627 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2628 #endif
2629 
2630 		if (mask & IFCAP_LRO)
2631 			ifp->if_capenable ^= IFCAP_LRO;
2632 
2633 		if (mask & IFCAP_TSO4) {
2634 			ifp->if_capenable ^= IFCAP_TSO4;
2635 			if (ifp->if_capenable & IFCAP_TSO4)
2636 				ifp->if_hwassist |= CSUM_IP_TSO;
2637 			else
2638 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2639 		}
2640 		if (mask & IFCAP_TSO6) {
2641 			ifp->if_capenable ^= IFCAP_TSO6;
2642 			if (ifp->if_capenable & IFCAP_TSO6)
2643 				ifp->if_hwassist |= CSUM_IP6_TSO;
2644 			else
2645 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2646 		}
2647 
2648 		HN_UNLOCK(sc);
2649 		break;
2650 
2651 	case SIOCADDMULTI:
2652 	case SIOCDELMULTI:
2653 		HN_LOCK(sc);
2654 
2655 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2656 			HN_UNLOCK(sc);
2657 			break;
2658 		}
2659 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2660 			/*
2661 			 * Multicast uses mutex; use busy-wait for
2662 			 * the RNDIS reply.
2663 			 */
2664 			HN_NO_SLEEPING(sc);
2665 			hn_rxfilter_config(sc);
2666 			HN_SLEEPING_OK(sc);
2667 		}
2668 
2669 		HN_UNLOCK(sc);
2670 		break;
2671 
2672 	case SIOCSIFMEDIA:
2673 	case SIOCGIFMEDIA:
2674 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2675 		break;
2676 
2677 	default:
2678 		error = ether_ioctl(ifp, cmd, data);
2679 		break;
2680 	}
2681 	return (error);
2682 }
2683 
2684 static void
2685 hn_stop(struct hn_softc *sc, bool detaching)
2686 {
2687 	struct ifnet *ifp = sc->hn_ifp;
2688 	int i;
2689 
2690 	HN_LOCK_ASSERT(sc);
2691 
2692 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2693 	    ("synthetic parts were not attached"));
2694 
2695 	/* Disable polling. */
2696 	hn_polling(sc, 0);
2697 
2698 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2699 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2700 	hn_suspend_data(sc);
2701 
2702 	/* Clear OACTIVE bit. */
2703 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2704 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2705 		sc->hn_tx_ring[i].hn_oactive = 0;
2706 
2707 	/*
2708 	 * If the VF is active, make sure the filter is not 0, even if
2709 	 * the synthetic NIC is down.
2710 	 */
2711 	if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2712 		hn_rxfilter_config(sc);
2713 }
2714 
2715 static void
2716 hn_init_locked(struct hn_softc *sc)
2717 {
2718 	struct ifnet *ifp = sc->hn_ifp;
2719 	int i;
2720 
2721 	HN_LOCK_ASSERT(sc);
2722 
2723 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2724 		return;
2725 
2726 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2727 		return;
2728 
2729 	/* Configure RX filter */
2730 	hn_rxfilter_config(sc);
2731 
2732 	/* Clear OACTIVE bit. */
2733 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2734 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2735 		sc->hn_tx_ring[i].hn_oactive = 0;
2736 
2737 	/* Clear TX 'suspended' bit. */
2738 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2739 
2740 	/* Everything is ready; unleash! */
2741 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2742 
2743 	/* Re-enable polling if requested. */
2744 	if (sc->hn_pollhz > 0)
2745 		hn_polling(sc, sc->hn_pollhz);
2746 }
2747 
2748 static void
2749 hn_init(void *xsc)
2750 {
2751 	struct hn_softc *sc = xsc;
2752 
2753 	HN_LOCK(sc);
2754 	hn_init_locked(sc);
2755 	HN_UNLOCK(sc);
2756 }
2757 
2758 #if __FreeBSD_version >= 1100099
2759 
2760 static int
2761 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2762 {
2763 	struct hn_softc *sc = arg1;
2764 	unsigned int lenlim;
2765 	int error;
2766 
2767 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2768 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2769 	if (error || req->newptr == NULL)
2770 		return error;
2771 
2772 	HN_LOCK(sc);
2773 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2774 	    lenlim > TCP_LRO_LENGTH_MAX) {
2775 		HN_UNLOCK(sc);
2776 		return EINVAL;
2777 	}
2778 	hn_set_lro_lenlim(sc, lenlim);
2779 	HN_UNLOCK(sc);
2780 
2781 	return 0;
2782 }
2783 
2784 static int
2785 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2786 {
2787 	struct hn_softc *sc = arg1;
2788 	int ackcnt, error, i;
2789 
2790 	/*
2791 	 * lro_ackcnt_lim is append count limit,
2792 	 * +1 to turn it into aggregation limit.
2793 	 */
2794 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2795 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2796 	if (error || req->newptr == NULL)
2797 		return error;
2798 
2799 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2800 		return EINVAL;
2801 
2802 	/*
2803 	 * Convert aggregation limit back to append
2804 	 * count limit.
2805 	 */
2806 	--ackcnt;
2807 	HN_LOCK(sc);
2808 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2809 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2810 	HN_UNLOCK(sc);
2811 	return 0;
2812 }
2813 
2814 #endif
2815 
2816 static int
2817 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2818 {
2819 	struct hn_softc *sc = arg1;
2820 	int hcsum = arg2;
2821 	int on, error, i;
2822 
2823 	on = 0;
2824 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2825 		on = 1;
2826 
2827 	error = sysctl_handle_int(oidp, &on, 0, req);
2828 	if (error || req->newptr == NULL)
2829 		return error;
2830 
2831 	HN_LOCK(sc);
2832 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2833 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2834 
2835 		if (on)
2836 			rxr->hn_trust_hcsum |= hcsum;
2837 		else
2838 			rxr->hn_trust_hcsum &= ~hcsum;
2839 	}
2840 	HN_UNLOCK(sc);
2841 	return 0;
2842 }
2843 
2844 static int
2845 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2846 {
2847 	struct hn_softc *sc = arg1;
2848 	int chim_size, error;
2849 
2850 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2851 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2852 	if (error || req->newptr == NULL)
2853 		return error;
2854 
2855 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2856 		return EINVAL;
2857 
2858 	HN_LOCK(sc);
2859 	hn_set_chim_size(sc, chim_size);
2860 	HN_UNLOCK(sc);
2861 	return 0;
2862 }
2863 
2864 #if __FreeBSD_version < 1100095
2865 static int
2866 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2867 {
2868 	struct hn_softc *sc = arg1;
2869 	int ofs = arg2, i, error;
2870 	struct hn_rx_ring *rxr;
2871 	uint64_t stat;
2872 
2873 	stat = 0;
2874 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2875 		rxr = &sc->hn_rx_ring[i];
2876 		stat += *((int *)((uint8_t *)rxr + ofs));
2877 	}
2878 
2879 	error = sysctl_handle_64(oidp, &stat, 0, req);
2880 	if (error || req->newptr == NULL)
2881 		return error;
2882 
2883 	/* Zero out this stat. */
2884 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2885 		rxr = &sc->hn_rx_ring[i];
2886 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2887 	}
2888 	return 0;
2889 }
2890 #else
2891 static int
2892 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2893 {
2894 	struct hn_softc *sc = arg1;
2895 	int ofs = arg2, i, error;
2896 	struct hn_rx_ring *rxr;
2897 	uint64_t stat;
2898 
2899 	stat = 0;
2900 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2901 		rxr = &sc->hn_rx_ring[i];
2902 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2903 	}
2904 
2905 	error = sysctl_handle_64(oidp, &stat, 0, req);
2906 	if (error || req->newptr == NULL)
2907 		return error;
2908 
2909 	/* Zero out this stat. */
2910 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2911 		rxr = &sc->hn_rx_ring[i];
2912 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2913 	}
2914 	return 0;
2915 }
2916 
2917 #endif
2918 
2919 static int
2920 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2921 {
2922 	struct hn_softc *sc = arg1;
2923 	int ofs = arg2, i, error;
2924 	struct hn_rx_ring *rxr;
2925 	u_long stat;
2926 
2927 	stat = 0;
2928 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2929 		rxr = &sc->hn_rx_ring[i];
2930 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2931 	}
2932 
2933 	error = sysctl_handle_long(oidp, &stat, 0, req);
2934 	if (error || req->newptr == NULL)
2935 		return error;
2936 
2937 	/* Zero out this stat. */
2938 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2939 		rxr = &sc->hn_rx_ring[i];
2940 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2941 	}
2942 	return 0;
2943 }
2944 
2945 static int
2946 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2947 {
2948 	struct hn_softc *sc = arg1;
2949 	int ofs = arg2, i, error;
2950 	struct hn_tx_ring *txr;
2951 	u_long stat;
2952 
2953 	stat = 0;
2954 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2955 		txr = &sc->hn_tx_ring[i];
2956 		stat += *((u_long *)((uint8_t *)txr + ofs));
2957 	}
2958 
2959 	error = sysctl_handle_long(oidp, &stat, 0, req);
2960 	if (error || req->newptr == NULL)
2961 		return error;
2962 
2963 	/* Zero out this stat. */
2964 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2965 		txr = &sc->hn_tx_ring[i];
2966 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2967 	}
2968 	return 0;
2969 }
2970 
2971 static int
2972 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2973 {
2974 	struct hn_softc *sc = arg1;
2975 	int ofs = arg2, i, error, conf;
2976 	struct hn_tx_ring *txr;
2977 
2978 	txr = &sc->hn_tx_ring[0];
2979 	conf = *((int *)((uint8_t *)txr + ofs));
2980 
2981 	error = sysctl_handle_int(oidp, &conf, 0, req);
2982 	if (error || req->newptr == NULL)
2983 		return error;
2984 
2985 	HN_LOCK(sc);
2986 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2987 		txr = &sc->hn_tx_ring[i];
2988 		*((int *)((uint8_t *)txr + ofs)) = conf;
2989 	}
2990 	HN_UNLOCK(sc);
2991 
2992 	return 0;
2993 }
2994 
2995 static int
2996 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2997 {
2998 	struct hn_softc *sc = arg1;
2999 	int error, size;
3000 
3001 	size = sc->hn_agg_size;
3002 	error = sysctl_handle_int(oidp, &size, 0, req);
3003 	if (error || req->newptr == NULL)
3004 		return (error);
3005 
3006 	HN_LOCK(sc);
3007 	sc->hn_agg_size = size;
3008 	hn_set_txagg(sc);
3009 	HN_UNLOCK(sc);
3010 
3011 	return (0);
3012 }
3013 
3014 static int
3015 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3016 {
3017 	struct hn_softc *sc = arg1;
3018 	int error, pkts;
3019 
3020 	pkts = sc->hn_agg_pkts;
3021 	error = sysctl_handle_int(oidp, &pkts, 0, req);
3022 	if (error || req->newptr == NULL)
3023 		return (error);
3024 
3025 	HN_LOCK(sc);
3026 	sc->hn_agg_pkts = pkts;
3027 	hn_set_txagg(sc);
3028 	HN_UNLOCK(sc);
3029 
3030 	return (0);
3031 }
3032 
3033 static int
3034 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3035 {
3036 	struct hn_softc *sc = arg1;
3037 	int pkts;
3038 
3039 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3040 	return (sysctl_handle_int(oidp, &pkts, 0, req));
3041 }
3042 
3043 static int
3044 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3045 {
3046 	struct hn_softc *sc = arg1;
3047 	int align;
3048 
3049 	align = sc->hn_tx_ring[0].hn_agg_align;
3050 	return (sysctl_handle_int(oidp, &align, 0, req));
3051 }
3052 
3053 static void
3054 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3055 {
3056 	if (pollhz == 0)
3057 		vmbus_chan_poll_disable(chan);
3058 	else
3059 		vmbus_chan_poll_enable(chan, pollhz);
3060 }
3061 
3062 static void
3063 hn_polling(struct hn_softc *sc, u_int pollhz)
3064 {
3065 	int nsubch = sc->hn_rx_ring_inuse - 1;
3066 
3067 	HN_LOCK_ASSERT(sc);
3068 
3069 	if (nsubch > 0) {
3070 		struct vmbus_channel **subch;
3071 		int i;
3072 
3073 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3074 		for (i = 0; i < nsubch; ++i)
3075 			hn_chan_polling(subch[i], pollhz);
3076 		vmbus_subchan_rel(subch, nsubch);
3077 	}
3078 	hn_chan_polling(sc->hn_prichan, pollhz);
3079 }
3080 
3081 static int
3082 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3083 {
3084 	struct hn_softc *sc = arg1;
3085 	int pollhz, error;
3086 
3087 	pollhz = sc->hn_pollhz;
3088 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
3089 	if (error || req->newptr == NULL)
3090 		return (error);
3091 
3092 	if (pollhz != 0 &&
3093 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3094 		return (EINVAL);
3095 
3096 	HN_LOCK(sc);
3097 	if (sc->hn_pollhz != pollhz) {
3098 		sc->hn_pollhz = pollhz;
3099 		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3100 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3101 			hn_polling(sc, sc->hn_pollhz);
3102 	}
3103 	HN_UNLOCK(sc);
3104 
3105 	return (0);
3106 }
3107 
3108 static int
3109 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3110 {
3111 	struct hn_softc *sc = arg1;
3112 	char verstr[16];
3113 
3114 	snprintf(verstr, sizeof(verstr), "%u.%u",
3115 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3116 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3117 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3118 }
3119 
3120 static int
3121 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3122 {
3123 	struct hn_softc *sc = arg1;
3124 	char caps_str[128];
3125 	uint32_t caps;
3126 
3127 	HN_LOCK(sc);
3128 	caps = sc->hn_caps;
3129 	HN_UNLOCK(sc);
3130 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3131 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3132 }
3133 
3134 static int
3135 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3136 {
3137 	struct hn_softc *sc = arg1;
3138 	char assist_str[128];
3139 	uint32_t hwassist;
3140 
3141 	HN_LOCK(sc);
3142 	hwassist = sc->hn_ifp->if_hwassist;
3143 	HN_UNLOCK(sc);
3144 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3145 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3146 }
3147 
3148 static int
3149 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3150 {
3151 	struct hn_softc *sc = arg1;
3152 	char filter_str[128];
3153 	uint32_t filter;
3154 
3155 	HN_LOCK(sc);
3156 	filter = sc->hn_rx_filter;
3157 	HN_UNLOCK(sc);
3158 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
3159 	    NDIS_PACKET_TYPES);
3160 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3161 }
3162 
3163 #ifndef RSS
3164 
3165 static int
3166 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3167 {
3168 	struct hn_softc *sc = arg1;
3169 	int error;
3170 
3171 	HN_LOCK(sc);
3172 
3173 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3174 	if (error || req->newptr == NULL)
3175 		goto back;
3176 
3177 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3178 	if (error)
3179 		goto back;
3180 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3181 
3182 	if (sc->hn_rx_ring_inuse > 1) {
3183 		error = hn_rss_reconfig(sc);
3184 	} else {
3185 		/* Not RSS capable, at least for now; just save the RSS key. */
3186 		error = 0;
3187 	}
3188 back:
3189 	HN_UNLOCK(sc);
3190 	return (error);
3191 }
3192 
3193 static int
3194 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3195 {
3196 	struct hn_softc *sc = arg1;
3197 	int error;
3198 
3199 	HN_LOCK(sc);
3200 
3201 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3202 	if (error || req->newptr == NULL)
3203 		goto back;
3204 
3205 	/*
3206 	 * Don't allow RSS indirect table change, if this interface is not
3207 	 * RSS capable currently.
3208 	 */
3209 	if (sc->hn_rx_ring_inuse == 1) {
3210 		error = EOPNOTSUPP;
3211 		goto back;
3212 	}
3213 
3214 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3215 	if (error)
3216 		goto back;
3217 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3218 
3219 	hn_rss_ind_fixup(sc);
3220 	error = hn_rss_reconfig(sc);
3221 back:
3222 	HN_UNLOCK(sc);
3223 	return (error);
3224 }
3225 
3226 #endif	/* !RSS */
3227 
3228 static int
3229 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3230 {
3231 	struct hn_softc *sc = arg1;
3232 	char hash_str[128];
3233 	uint32_t hash;
3234 
3235 	HN_LOCK(sc);
3236 	hash = sc->hn_rss_hash;
3237 	HN_UNLOCK(sc);
3238 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3239 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3240 }
3241 
3242 static int
3243 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3244 {
3245 	struct hn_softc *sc = arg1;
3246 	char vf_name[128];
3247 	struct ifnet *vf;
3248 
3249 	HN_LOCK(sc);
3250 	vf_name[0] = '\0';
3251 	vf = sc->hn_rx_ring[0].hn_vf;
3252 	if (vf != NULL)
3253 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3254 	HN_UNLOCK(sc);
3255 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3256 }
3257 
3258 static int
3259 hn_check_iplen(const struct mbuf *m, int hoff)
3260 {
3261 	const struct ip *ip;
3262 	int len, iphlen, iplen;
3263 	const struct tcphdr *th;
3264 	int thoff;				/* TCP data offset */
3265 
3266 	len = hoff + sizeof(struct ip);
3267 
3268 	/* The packet must be at least the size of an IP header. */
3269 	if (m->m_pkthdr.len < len)
3270 		return IPPROTO_DONE;
3271 
3272 	/* The fixed IP header must reside completely in the first mbuf. */
3273 	if (m->m_len < len)
3274 		return IPPROTO_DONE;
3275 
3276 	ip = mtodo(m, hoff);
3277 
3278 	/* Bound check the packet's stated IP header length. */
3279 	iphlen = ip->ip_hl << 2;
3280 	if (iphlen < sizeof(struct ip))		/* minimum header length */
3281 		return IPPROTO_DONE;
3282 
3283 	/* The full IP header must reside completely in the one mbuf. */
3284 	if (m->m_len < hoff + iphlen)
3285 		return IPPROTO_DONE;
3286 
3287 	iplen = ntohs(ip->ip_len);
3288 
3289 	/*
3290 	 * Check that the amount of data in the buffers is as
3291 	 * at least much as the IP header would have us expect.
3292 	 */
3293 	if (m->m_pkthdr.len < hoff + iplen)
3294 		return IPPROTO_DONE;
3295 
3296 	/*
3297 	 * Ignore IP fragments.
3298 	 */
3299 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3300 		return IPPROTO_DONE;
3301 
3302 	/*
3303 	 * The TCP/IP or UDP/IP header must be entirely contained within
3304 	 * the first fragment of a packet.
3305 	 */
3306 	switch (ip->ip_p) {
3307 	case IPPROTO_TCP:
3308 		if (iplen < iphlen + sizeof(struct tcphdr))
3309 			return IPPROTO_DONE;
3310 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3311 			return IPPROTO_DONE;
3312 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3313 		thoff = th->th_off << 2;
3314 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3315 			return IPPROTO_DONE;
3316 		if (m->m_len < hoff + iphlen + thoff)
3317 			return IPPROTO_DONE;
3318 		break;
3319 	case IPPROTO_UDP:
3320 		if (iplen < iphlen + sizeof(struct udphdr))
3321 			return IPPROTO_DONE;
3322 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3323 			return IPPROTO_DONE;
3324 		break;
3325 	default:
3326 		if (iplen < iphlen)
3327 			return IPPROTO_DONE;
3328 		break;
3329 	}
3330 	return ip->ip_p;
3331 }
3332 
3333 static int
3334 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3335 {
3336 	struct sysctl_oid_list *child;
3337 	struct sysctl_ctx_list *ctx;
3338 	device_t dev = sc->hn_dev;
3339 #if defined(INET) || defined(INET6)
3340 #if __FreeBSD_version >= 1100095
3341 	int lroent_cnt;
3342 #endif
3343 #endif
3344 	int i;
3345 
3346 	/*
3347 	 * Create RXBUF for reception.
3348 	 *
3349 	 * NOTE:
3350 	 * - It is shared by all channels.
3351 	 * - A large enough buffer is allocated, certain version of NVSes
3352 	 *   may further limit the usable space.
3353 	 */
3354 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3355 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3356 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3357 	if (sc->hn_rxbuf == NULL) {
3358 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3359 		return (ENOMEM);
3360 	}
3361 
3362 	sc->hn_rx_ring_cnt = ring_cnt;
3363 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3364 
3365 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3366 	    M_DEVBUF, M_WAITOK | M_ZERO);
3367 
3368 #if defined(INET) || defined(INET6)
3369 #if __FreeBSD_version >= 1100095
3370 	lroent_cnt = hn_lro_entry_count;
3371 	if (lroent_cnt < TCP_LRO_ENTRIES)
3372 		lroent_cnt = TCP_LRO_ENTRIES;
3373 	if (bootverbose)
3374 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3375 #endif
3376 #endif	/* INET || INET6 */
3377 
3378 	ctx = device_get_sysctl_ctx(dev);
3379 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3380 
3381 	/* Create dev.hn.UNIT.rx sysctl tree */
3382 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3383 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3384 
3385 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3386 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3387 
3388 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3389 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3390 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3391 		if (rxr->hn_br == NULL) {
3392 			device_printf(dev, "allocate bufring failed\n");
3393 			return (ENOMEM);
3394 		}
3395 
3396 		if (hn_trust_hosttcp)
3397 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3398 		if (hn_trust_hostudp)
3399 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3400 		if (hn_trust_hostip)
3401 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3402 		rxr->hn_ifp = sc->hn_ifp;
3403 		if (i < sc->hn_tx_ring_cnt)
3404 			rxr->hn_txr = &sc->hn_tx_ring[i];
3405 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3406 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3407 		rxr->hn_rx_idx = i;
3408 		rxr->hn_rxbuf = sc->hn_rxbuf;
3409 
3410 		/*
3411 		 * Initialize LRO.
3412 		 */
3413 #if defined(INET) || defined(INET6)
3414 #if __FreeBSD_version >= 1100095
3415 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3416 		    hn_lro_mbufq_depth);
3417 #else
3418 		tcp_lro_init(&rxr->hn_lro);
3419 		rxr->hn_lro.ifp = sc->hn_ifp;
3420 #endif
3421 #if __FreeBSD_version >= 1100099
3422 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3423 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3424 #endif
3425 #endif	/* INET || INET6 */
3426 
3427 		if (sc->hn_rx_sysctl_tree != NULL) {
3428 			char name[16];
3429 
3430 			/*
3431 			 * Create per RX ring sysctl tree:
3432 			 * dev.hn.UNIT.rx.RINGID
3433 			 */
3434 			snprintf(name, sizeof(name), "%d", i);
3435 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3436 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3437 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3438 
3439 			if (rxr->hn_rx_sysctl_tree != NULL) {
3440 				SYSCTL_ADD_ULONG(ctx,
3441 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3442 				    OID_AUTO, "packets", CTLFLAG_RW,
3443 				    &rxr->hn_pkts, "# of packets received");
3444 				SYSCTL_ADD_ULONG(ctx,
3445 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3446 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3447 				    &rxr->hn_rss_pkts,
3448 				    "# of packets w/ RSS info received");
3449 				SYSCTL_ADD_INT(ctx,
3450 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3451 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3452 				    &rxr->hn_pktbuf_len, 0,
3453 				    "Temporary channel packet buffer length");
3454 			}
3455 		}
3456 	}
3457 
3458 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3459 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3460 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3461 #if __FreeBSD_version < 1100095
3462 	    hn_rx_stat_int_sysctl,
3463 #else
3464 	    hn_rx_stat_u64_sysctl,
3465 #endif
3466 	    "LU", "LRO queued");
3467 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3468 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3469 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3470 #if __FreeBSD_version < 1100095
3471 	    hn_rx_stat_int_sysctl,
3472 #else
3473 	    hn_rx_stat_u64_sysctl,
3474 #endif
3475 	    "LU", "LRO flushed");
3476 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3477 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3478 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3479 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3480 #if __FreeBSD_version >= 1100099
3481 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3482 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3483 	    hn_lro_lenlim_sysctl, "IU",
3484 	    "Max # of data bytes to be aggregated by LRO");
3485 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3486 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3487 	    hn_lro_ackcnt_sysctl, "I",
3488 	    "Max # of ACKs to be aggregated by LRO");
3489 #endif
3490 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3491 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3492 	    hn_trust_hcsum_sysctl, "I",
3493 	    "Trust tcp segement verification on host side, "
3494 	    "when csum info is missing");
3495 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3496 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3497 	    hn_trust_hcsum_sysctl, "I",
3498 	    "Trust udp datagram verification on host side, "
3499 	    "when csum info is missing");
3500 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3501 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3502 	    hn_trust_hcsum_sysctl, "I",
3503 	    "Trust ip packet verification on host side, "
3504 	    "when csum info is missing");
3505 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3506 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3507 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3508 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3509 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3510 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3511 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3512 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3513 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3514 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3515 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3516 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3517 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3518 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3519 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3520 	    hn_rx_stat_ulong_sysctl, "LU",
3521 	    "# of packets that we trust host's csum verification");
3522 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3523 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3524 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3525 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3526 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3527 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3528 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3529 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3530 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3531 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3532 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3533 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3534 
3535 	return (0);
3536 }
3537 
3538 static void
3539 hn_destroy_rx_data(struct hn_softc *sc)
3540 {
3541 	int i;
3542 
3543 	if (sc->hn_rxbuf != NULL) {
3544 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3545 			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3546 		else
3547 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
3548 		sc->hn_rxbuf = NULL;
3549 	}
3550 
3551 	if (sc->hn_rx_ring_cnt == 0)
3552 		return;
3553 
3554 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3555 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3556 
3557 		if (rxr->hn_br == NULL)
3558 			continue;
3559 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3560 			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3561 		} else {
3562 			device_printf(sc->hn_dev,
3563 			    "%dth channel bufring is referenced", i);
3564 		}
3565 		rxr->hn_br = NULL;
3566 
3567 #if defined(INET) || defined(INET6)
3568 		tcp_lro_free(&rxr->hn_lro);
3569 #endif
3570 		free(rxr->hn_pktbuf, M_DEVBUF);
3571 	}
3572 	free(sc->hn_rx_ring, M_DEVBUF);
3573 	sc->hn_rx_ring = NULL;
3574 
3575 	sc->hn_rx_ring_cnt = 0;
3576 	sc->hn_rx_ring_inuse = 0;
3577 }
3578 
3579 static int
3580 hn_tx_ring_create(struct hn_softc *sc, int id)
3581 {
3582 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3583 	device_t dev = sc->hn_dev;
3584 	bus_dma_tag_t parent_dtag;
3585 	int error, i;
3586 
3587 	txr->hn_sc = sc;
3588 	txr->hn_tx_idx = id;
3589 
3590 #ifndef HN_USE_TXDESC_BUFRING
3591 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3592 #endif
3593 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3594 
3595 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3596 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3597 	    M_DEVBUF, M_WAITOK | M_ZERO);
3598 #ifndef HN_USE_TXDESC_BUFRING
3599 	SLIST_INIT(&txr->hn_txlist);
3600 #else
3601 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3602 	    M_WAITOK, &txr->hn_tx_lock);
3603 #endif
3604 
3605 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3606 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3607 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3608 	} else {
3609 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3610 	}
3611 
3612 #ifdef HN_IFSTART_SUPPORT
3613 	if (hn_use_if_start) {
3614 		txr->hn_txeof = hn_start_txeof;
3615 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3616 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3617 	} else
3618 #endif
3619 	{
3620 		int br_depth;
3621 
3622 		txr->hn_txeof = hn_xmit_txeof;
3623 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3624 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3625 
3626 		br_depth = hn_get_txswq_depth(txr);
3627 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3628 		    M_WAITOK, &txr->hn_tx_lock);
3629 	}
3630 
3631 	txr->hn_direct_tx_size = hn_direct_tx_size;
3632 
3633 	/*
3634 	 * Always schedule transmission instead of trying to do direct
3635 	 * transmission.  This one gives the best performance so far.
3636 	 */
3637 	txr->hn_sched_tx = 1;
3638 
3639 	parent_dtag = bus_get_dma_tag(dev);
3640 
3641 	/* DMA tag for RNDIS packet messages. */
3642 	error = bus_dma_tag_create(parent_dtag, /* parent */
3643 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3644 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3645 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3646 	    BUS_SPACE_MAXADDR,		/* highaddr */
3647 	    NULL, NULL,			/* filter, filterarg */
3648 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3649 	    1,				/* nsegments */
3650 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3651 	    0,				/* flags */
3652 	    NULL,			/* lockfunc */
3653 	    NULL,			/* lockfuncarg */
3654 	    &txr->hn_tx_rndis_dtag);
3655 	if (error) {
3656 		device_printf(dev, "failed to create rndis dmatag\n");
3657 		return error;
3658 	}
3659 
3660 	/* DMA tag for data. */
3661 	error = bus_dma_tag_create(parent_dtag, /* parent */
3662 	    1,				/* alignment */
3663 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3664 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3665 	    BUS_SPACE_MAXADDR,		/* highaddr */
3666 	    NULL, NULL,			/* filter, filterarg */
3667 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3668 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3669 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3670 	    0,				/* flags */
3671 	    NULL,			/* lockfunc */
3672 	    NULL,			/* lockfuncarg */
3673 	    &txr->hn_tx_data_dtag);
3674 	if (error) {
3675 		device_printf(dev, "failed to create data dmatag\n");
3676 		return error;
3677 	}
3678 
3679 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3680 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3681 
3682 		txd->txr = txr;
3683 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3684 		STAILQ_INIT(&txd->agg_list);
3685 
3686 		/*
3687 		 * Allocate and load RNDIS packet message.
3688 		 */
3689         	error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3690 		    (void **)&txd->rndis_pkt,
3691 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3692 		    &txd->rndis_pkt_dmap);
3693 		if (error) {
3694 			device_printf(dev,
3695 			    "failed to allocate rndis_packet_msg, %d\n", i);
3696 			return error;
3697 		}
3698 
3699 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3700 		    txd->rndis_pkt_dmap,
3701 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3702 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3703 		    BUS_DMA_NOWAIT);
3704 		if (error) {
3705 			device_printf(dev,
3706 			    "failed to load rndis_packet_msg, %d\n", i);
3707 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3708 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3709 			return error;
3710 		}
3711 
3712 		/* DMA map for TX data. */
3713 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3714 		    &txd->data_dmap);
3715 		if (error) {
3716 			device_printf(dev,
3717 			    "failed to allocate tx data dmamap\n");
3718 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3719 			    txd->rndis_pkt_dmap);
3720 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3721 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3722 			return error;
3723 		}
3724 
3725 		/* All set, put it to list */
3726 		txd->flags |= HN_TXD_FLAG_ONLIST;
3727 #ifndef HN_USE_TXDESC_BUFRING
3728 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3729 #else
3730 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3731 #endif
3732 	}
3733 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3734 
3735 	if (sc->hn_tx_sysctl_tree != NULL) {
3736 		struct sysctl_oid_list *child;
3737 		struct sysctl_ctx_list *ctx;
3738 		char name[16];
3739 
3740 		/*
3741 		 * Create per TX ring sysctl tree:
3742 		 * dev.hn.UNIT.tx.RINGID
3743 		 */
3744 		ctx = device_get_sysctl_ctx(dev);
3745 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3746 
3747 		snprintf(name, sizeof(name), "%d", id);
3748 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3749 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3750 
3751 		if (txr->hn_tx_sysctl_tree != NULL) {
3752 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3753 
3754 #ifdef HN_DEBUG
3755 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3756 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3757 			    "# of available TX descs");
3758 #endif
3759 #ifdef HN_IFSTART_SUPPORT
3760 			if (!hn_use_if_start)
3761 #endif
3762 			{
3763 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3764 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3765 				    "over active");
3766 			}
3767 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3768 			    CTLFLAG_RW, &txr->hn_pkts,
3769 			    "# of packets transmitted");
3770 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3771 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3772 		}
3773 	}
3774 
3775 	return 0;
3776 }
3777 
3778 static void
3779 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3780 {
3781 	struct hn_tx_ring *txr = txd->txr;
3782 
3783 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3784 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3785 
3786 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3787 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3788 	    txd->rndis_pkt_dmap);
3789 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3790 }
3791 
3792 static void
3793 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3794 {
3795 
3796 	KASSERT(txd->refs == 0 || txd->refs == 1,
3797 	    ("invalid txd refs %d", txd->refs));
3798 
3799 	/* Aggregated txds will be freed by their aggregating txd. */
3800 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3801 		int freed;
3802 
3803 		freed = hn_txdesc_put(txr, txd);
3804 		KASSERT(freed, ("can't free txdesc"));
3805 	}
3806 }
3807 
3808 static void
3809 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3810 {
3811 	int i;
3812 
3813 	if (txr->hn_txdesc == NULL)
3814 		return;
3815 
3816 	/*
3817 	 * NOTE:
3818 	 * Because the freeing of aggregated txds will be deferred
3819 	 * to the aggregating txd, two passes are used here:
3820 	 * - The first pass GCes any pending txds.  This GC is necessary,
3821 	 *   since if the channels are revoked, hypervisor will not
3822 	 *   deliver send-done for all pending txds.
3823 	 * - The second pass frees the busdma stuffs, i.e. after all txds
3824 	 *   were freed.
3825 	 */
3826 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3827 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3828 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3829 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3830 
3831 	if (txr->hn_tx_data_dtag != NULL)
3832 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3833 	if (txr->hn_tx_rndis_dtag != NULL)
3834 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3835 
3836 #ifdef HN_USE_TXDESC_BUFRING
3837 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3838 #endif
3839 
3840 	free(txr->hn_txdesc, M_DEVBUF);
3841 	txr->hn_txdesc = NULL;
3842 
3843 	if (txr->hn_mbuf_br != NULL)
3844 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3845 
3846 #ifndef HN_USE_TXDESC_BUFRING
3847 	mtx_destroy(&txr->hn_txlist_spin);
3848 #endif
3849 	mtx_destroy(&txr->hn_tx_lock);
3850 }
3851 
3852 static int
3853 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3854 {
3855 	struct sysctl_oid_list *child;
3856 	struct sysctl_ctx_list *ctx;
3857 	int i;
3858 
3859 	/*
3860 	 * Create TXBUF for chimney sending.
3861 	 *
3862 	 * NOTE: It is shared by all channels.
3863 	 */
3864 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3865 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3866 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3867 	if (sc->hn_chim == NULL) {
3868 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3869 		return (ENOMEM);
3870 	}
3871 
3872 	sc->hn_tx_ring_cnt = ring_cnt;
3873 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3874 
3875 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3876 	    M_DEVBUF, M_WAITOK | M_ZERO);
3877 
3878 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3879 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3880 
3881 	/* Create dev.hn.UNIT.tx sysctl tree */
3882 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3883 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3884 
3885 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3886 		int error;
3887 
3888 		error = hn_tx_ring_create(sc, i);
3889 		if (error)
3890 			return error;
3891 	}
3892 
3893 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3894 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3895 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3896 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3897 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3898 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3899 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3900 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3901 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3902 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3903 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3904 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3905 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3906 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3907 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3908 	    hn_tx_stat_ulong_sysctl, "LU",
3909 	    "# of packet transmission aggregation flush failure");
3910 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3911 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3912 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3913 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3914 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3915 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3916 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3917 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3918 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3919 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3920 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3921 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3922 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3923 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3924 	    "# of total TX descs");
3925 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3926 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3927 	    "Chimney send packet size upper boundary");
3928 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3929 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3930 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3931 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3932 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3933 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3934 	    hn_tx_conf_int_sysctl, "I",
3935 	    "Size of the packet for direct transmission");
3936 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3937 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3938 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3939 	    hn_tx_conf_int_sysctl, "I",
3940 	    "Always schedule transmission "
3941 	    "instead of doing direct transmission");
3942 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3943 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3944 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3945 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3946 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3947 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3948 	    "Applied packet transmission aggregation size");
3949 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3950 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3951 	    hn_txagg_pktmax_sysctl, "I",
3952 	    "Applied packet transmission aggregation packets");
3953 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3954 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3955 	    hn_txagg_align_sysctl, "I",
3956 	    "Applied packet transmission aggregation alignment");
3957 
3958 	return 0;
3959 }
3960 
3961 static void
3962 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3963 {
3964 	int i;
3965 
3966 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3967 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3968 }
3969 
3970 static void
3971 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3972 {
3973 	struct ifnet *ifp = sc->hn_ifp;
3974 	int tso_minlen;
3975 
3976 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3977 		return;
3978 
3979 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3980 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3981 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3982 
3983 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3984 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3985 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3986 
3987 	if (tso_maxlen < tso_minlen)
3988 		tso_maxlen = tso_minlen;
3989 	else if (tso_maxlen > IP_MAXPACKET)
3990 		tso_maxlen = IP_MAXPACKET;
3991 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3992 		tso_maxlen = sc->hn_ndis_tso_szmax;
3993 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3994 	if (bootverbose)
3995 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3996 }
3997 
3998 static void
3999 hn_fixup_tx_data(struct hn_softc *sc)
4000 {
4001 	uint64_t csum_assist;
4002 	int i;
4003 
4004 	hn_set_chim_size(sc, sc->hn_chim_szmax);
4005 	if (hn_tx_chimney_size > 0 &&
4006 	    hn_tx_chimney_size < sc->hn_chim_szmax)
4007 		hn_set_chim_size(sc, hn_tx_chimney_size);
4008 
4009 	csum_assist = 0;
4010 	if (sc->hn_caps & HN_CAP_IPCS)
4011 		csum_assist |= CSUM_IP;
4012 	if (sc->hn_caps & HN_CAP_TCP4CS)
4013 		csum_assist |= CSUM_IP_TCP;
4014 	if (sc->hn_caps & HN_CAP_UDP4CS)
4015 		csum_assist |= CSUM_IP_UDP;
4016 	if (sc->hn_caps & HN_CAP_TCP6CS)
4017 		csum_assist |= CSUM_IP6_TCP;
4018 	if (sc->hn_caps & HN_CAP_UDP6CS)
4019 		csum_assist |= CSUM_IP6_UDP;
4020 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4021 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4022 
4023 	if (sc->hn_caps & HN_CAP_HASHVAL) {
4024 		/*
4025 		 * Support HASHVAL pktinfo on TX path.
4026 		 */
4027 		if (bootverbose)
4028 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4029 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4030 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4031 	}
4032 }
4033 
4034 static void
4035 hn_destroy_tx_data(struct hn_softc *sc)
4036 {
4037 	int i;
4038 
4039 	if (sc->hn_chim != NULL) {
4040 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4041 			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4042 		} else {
4043 			device_printf(sc->hn_dev,
4044 			    "chimney sending buffer is referenced");
4045 		}
4046 		sc->hn_chim = NULL;
4047 	}
4048 
4049 	if (sc->hn_tx_ring_cnt == 0)
4050 		return;
4051 
4052 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4053 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4054 
4055 	free(sc->hn_tx_ring, M_DEVBUF);
4056 	sc->hn_tx_ring = NULL;
4057 
4058 	sc->hn_tx_ring_cnt = 0;
4059 	sc->hn_tx_ring_inuse = 0;
4060 }
4061 
4062 #ifdef HN_IFSTART_SUPPORT
4063 
4064 static void
4065 hn_start_taskfunc(void *xtxr, int pending __unused)
4066 {
4067 	struct hn_tx_ring *txr = xtxr;
4068 
4069 	mtx_lock(&txr->hn_tx_lock);
4070 	hn_start_locked(txr, 0);
4071 	mtx_unlock(&txr->hn_tx_lock);
4072 }
4073 
4074 static int
4075 hn_start_locked(struct hn_tx_ring *txr, int len)
4076 {
4077 	struct hn_softc *sc = txr->hn_sc;
4078 	struct ifnet *ifp = sc->hn_ifp;
4079 	int sched = 0;
4080 
4081 	KASSERT(hn_use_if_start,
4082 	    ("hn_start_locked is called, when if_start is disabled"));
4083 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4084 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4085 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4086 
4087 	if (__predict_false(txr->hn_suspended))
4088 		return (0);
4089 
4090 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4091 	    IFF_DRV_RUNNING)
4092 		return (0);
4093 
4094 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4095 		struct hn_txdesc *txd;
4096 		struct mbuf *m_head;
4097 		int error;
4098 
4099 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4100 		if (m_head == NULL)
4101 			break;
4102 
4103 		if (len > 0 && m_head->m_pkthdr.len > len) {
4104 			/*
4105 			 * This sending could be time consuming; let callers
4106 			 * dispatch this packet sending (and sending of any
4107 			 * following up packets) to tx taskqueue.
4108 			 */
4109 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4110 			sched = 1;
4111 			break;
4112 		}
4113 
4114 #if defined(INET6) || defined(INET)
4115 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4116 			m_head = hn_tso_fixup(m_head);
4117 			if (__predict_false(m_head == NULL)) {
4118 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4119 				continue;
4120 			}
4121 		}
4122 #endif
4123 
4124 		txd = hn_txdesc_get(txr);
4125 		if (txd == NULL) {
4126 			txr->hn_no_txdescs++;
4127 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4128 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4129 			break;
4130 		}
4131 
4132 		error = hn_encap(ifp, txr, txd, &m_head);
4133 		if (error) {
4134 			/* Both txd and m_head are freed */
4135 			KASSERT(txr->hn_agg_txd == NULL,
4136 			    ("encap failed w/ pending aggregating txdesc"));
4137 			continue;
4138 		}
4139 
4140 		if (txr->hn_agg_pktleft == 0) {
4141 			if (txr->hn_agg_txd != NULL) {
4142 				KASSERT(m_head == NULL,
4143 				    ("pending mbuf for aggregating txdesc"));
4144 				error = hn_flush_txagg(ifp, txr);
4145 				if (__predict_false(error)) {
4146 					atomic_set_int(&ifp->if_drv_flags,
4147 					    IFF_DRV_OACTIVE);
4148 					break;
4149 				}
4150 			} else {
4151 				KASSERT(m_head != NULL, ("mbuf was freed"));
4152 				error = hn_txpkt(ifp, txr, txd);
4153 				if (__predict_false(error)) {
4154 					/* txd is freed, but m_head is not */
4155 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4156 					atomic_set_int(&ifp->if_drv_flags,
4157 					    IFF_DRV_OACTIVE);
4158 					break;
4159 				}
4160 			}
4161 		}
4162 #ifdef INVARIANTS
4163 		else {
4164 			KASSERT(txr->hn_agg_txd != NULL,
4165 			    ("no aggregating txdesc"));
4166 			KASSERT(m_head == NULL,
4167 			    ("pending mbuf for aggregating txdesc"));
4168 		}
4169 #endif
4170 	}
4171 
4172 	/* Flush pending aggerated transmission. */
4173 	if (txr->hn_agg_txd != NULL)
4174 		hn_flush_txagg(ifp, txr);
4175 	return (sched);
4176 }
4177 
4178 static void
4179 hn_start(struct ifnet *ifp)
4180 {
4181 	struct hn_softc *sc = ifp->if_softc;
4182 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4183 
4184 	if (txr->hn_sched_tx)
4185 		goto do_sched;
4186 
4187 	if (mtx_trylock(&txr->hn_tx_lock)) {
4188 		int sched;
4189 
4190 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4191 		mtx_unlock(&txr->hn_tx_lock);
4192 		if (!sched)
4193 			return;
4194 	}
4195 do_sched:
4196 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4197 }
4198 
4199 static void
4200 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4201 {
4202 	struct hn_tx_ring *txr = xtxr;
4203 
4204 	mtx_lock(&txr->hn_tx_lock);
4205 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4206 	hn_start_locked(txr, 0);
4207 	mtx_unlock(&txr->hn_tx_lock);
4208 }
4209 
4210 static void
4211 hn_start_txeof(struct hn_tx_ring *txr)
4212 {
4213 	struct hn_softc *sc = txr->hn_sc;
4214 	struct ifnet *ifp = sc->hn_ifp;
4215 
4216 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4217 
4218 	if (txr->hn_sched_tx)
4219 		goto do_sched;
4220 
4221 	if (mtx_trylock(&txr->hn_tx_lock)) {
4222 		int sched;
4223 
4224 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4225 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4226 		mtx_unlock(&txr->hn_tx_lock);
4227 		if (sched) {
4228 			taskqueue_enqueue(txr->hn_tx_taskq,
4229 			    &txr->hn_tx_task);
4230 		}
4231 	} else {
4232 do_sched:
4233 		/*
4234 		 * Release the OACTIVE earlier, with the hope, that
4235 		 * others could catch up.  The task will clear the
4236 		 * flag again with the hn_tx_lock to avoid possible
4237 		 * races.
4238 		 */
4239 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4240 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4241 	}
4242 }
4243 
4244 #endif	/* HN_IFSTART_SUPPORT */
4245 
4246 static int
4247 hn_xmit(struct hn_tx_ring *txr, int len)
4248 {
4249 	struct hn_softc *sc = txr->hn_sc;
4250 	struct ifnet *ifp = sc->hn_ifp;
4251 	struct mbuf *m_head;
4252 	int sched = 0;
4253 
4254 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4255 #ifdef HN_IFSTART_SUPPORT
4256 	KASSERT(hn_use_if_start == 0,
4257 	    ("hn_xmit is called, when if_start is enabled"));
4258 #endif
4259 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4260 
4261 	if (__predict_false(txr->hn_suspended))
4262 		return (0);
4263 
4264 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4265 		return (0);
4266 
4267 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4268 		struct hn_txdesc *txd;
4269 		int error;
4270 
4271 		if (len > 0 && m_head->m_pkthdr.len > len) {
4272 			/*
4273 			 * This sending could be time consuming; let callers
4274 			 * dispatch this packet sending (and sending of any
4275 			 * following up packets) to tx taskqueue.
4276 			 */
4277 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4278 			sched = 1;
4279 			break;
4280 		}
4281 
4282 		txd = hn_txdesc_get(txr);
4283 		if (txd == NULL) {
4284 			txr->hn_no_txdescs++;
4285 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4286 			txr->hn_oactive = 1;
4287 			break;
4288 		}
4289 
4290 		error = hn_encap(ifp, txr, txd, &m_head);
4291 		if (error) {
4292 			/* Both txd and m_head are freed; discard */
4293 			KASSERT(txr->hn_agg_txd == NULL,
4294 			    ("encap failed w/ pending aggregating txdesc"));
4295 			drbr_advance(ifp, txr->hn_mbuf_br);
4296 			continue;
4297 		}
4298 
4299 		if (txr->hn_agg_pktleft == 0) {
4300 			if (txr->hn_agg_txd != NULL) {
4301 				KASSERT(m_head == NULL,
4302 				    ("pending mbuf for aggregating txdesc"));
4303 				error = hn_flush_txagg(ifp, txr);
4304 				if (__predict_false(error)) {
4305 					txr->hn_oactive = 1;
4306 					break;
4307 				}
4308 			} else {
4309 				KASSERT(m_head != NULL, ("mbuf was freed"));
4310 				error = hn_txpkt(ifp, txr, txd);
4311 				if (__predict_false(error)) {
4312 					/* txd is freed, but m_head is not */
4313 					drbr_putback(ifp, txr->hn_mbuf_br,
4314 					    m_head);
4315 					txr->hn_oactive = 1;
4316 					break;
4317 				}
4318 			}
4319 		}
4320 #ifdef INVARIANTS
4321 		else {
4322 			KASSERT(txr->hn_agg_txd != NULL,
4323 			    ("no aggregating txdesc"));
4324 			KASSERT(m_head == NULL,
4325 			    ("pending mbuf for aggregating txdesc"));
4326 		}
4327 #endif
4328 
4329 		/* Sent */
4330 		drbr_advance(ifp, txr->hn_mbuf_br);
4331 	}
4332 
4333 	/* Flush pending aggerated transmission. */
4334 	if (txr->hn_agg_txd != NULL)
4335 		hn_flush_txagg(ifp, txr);
4336 	return (sched);
4337 }
4338 
4339 static int
4340 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4341 {
4342 	struct hn_softc *sc = ifp->if_softc;
4343 	struct hn_tx_ring *txr;
4344 	int error, idx = 0;
4345 
4346 #if defined(INET6) || defined(INET)
4347 	/*
4348 	 * Perform TSO packet header fixup now, since the TSO
4349 	 * packet header should be cache-hot.
4350 	 */
4351 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4352 		m = hn_tso_fixup(m);
4353 		if (__predict_false(m == NULL)) {
4354 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4355 			return EIO;
4356 		}
4357 	}
4358 #endif
4359 
4360 	/*
4361 	 * Select the TX ring based on flowid
4362 	 */
4363 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4364 #ifdef RSS
4365 		uint32_t bid;
4366 
4367 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4368 		    &bid) == 0)
4369 			idx = bid % sc->hn_tx_ring_inuse;
4370 		else
4371 #endif
4372 			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4373 	}
4374 	txr = &sc->hn_tx_ring[idx];
4375 
4376 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4377 	if (error) {
4378 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4379 		return error;
4380 	}
4381 
4382 	if (txr->hn_oactive)
4383 		return 0;
4384 
4385 	if (txr->hn_sched_tx)
4386 		goto do_sched;
4387 
4388 	if (mtx_trylock(&txr->hn_tx_lock)) {
4389 		int sched;
4390 
4391 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4392 		mtx_unlock(&txr->hn_tx_lock);
4393 		if (!sched)
4394 			return 0;
4395 	}
4396 do_sched:
4397 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4398 	return 0;
4399 }
4400 
4401 static void
4402 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4403 {
4404 	struct mbuf *m;
4405 
4406 	mtx_lock(&txr->hn_tx_lock);
4407 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4408 		m_freem(m);
4409 	mtx_unlock(&txr->hn_tx_lock);
4410 }
4411 
4412 static void
4413 hn_xmit_qflush(struct ifnet *ifp)
4414 {
4415 	struct hn_softc *sc = ifp->if_softc;
4416 	int i;
4417 
4418 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4419 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4420 	if_qflush(ifp);
4421 }
4422 
4423 static void
4424 hn_xmit_txeof(struct hn_tx_ring *txr)
4425 {
4426 
4427 	if (txr->hn_sched_tx)
4428 		goto do_sched;
4429 
4430 	if (mtx_trylock(&txr->hn_tx_lock)) {
4431 		int sched;
4432 
4433 		txr->hn_oactive = 0;
4434 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4435 		mtx_unlock(&txr->hn_tx_lock);
4436 		if (sched) {
4437 			taskqueue_enqueue(txr->hn_tx_taskq,
4438 			    &txr->hn_tx_task);
4439 		}
4440 	} else {
4441 do_sched:
4442 		/*
4443 		 * Release the oactive earlier, with the hope, that
4444 		 * others could catch up.  The task will clear the
4445 		 * oactive again with the hn_tx_lock to avoid possible
4446 		 * races.
4447 		 */
4448 		txr->hn_oactive = 0;
4449 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4450 	}
4451 }
4452 
4453 static void
4454 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4455 {
4456 	struct hn_tx_ring *txr = xtxr;
4457 
4458 	mtx_lock(&txr->hn_tx_lock);
4459 	hn_xmit(txr, 0);
4460 	mtx_unlock(&txr->hn_tx_lock);
4461 }
4462 
4463 static void
4464 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4465 {
4466 	struct hn_tx_ring *txr = xtxr;
4467 
4468 	mtx_lock(&txr->hn_tx_lock);
4469 	txr->hn_oactive = 0;
4470 	hn_xmit(txr, 0);
4471 	mtx_unlock(&txr->hn_tx_lock);
4472 }
4473 
4474 static int
4475 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4476 {
4477 	struct vmbus_chan_br cbr;
4478 	struct hn_rx_ring *rxr;
4479 	struct hn_tx_ring *txr = NULL;
4480 	int idx, error;
4481 
4482 	idx = vmbus_chan_subidx(chan);
4483 
4484 	/*
4485 	 * Link this channel to RX/TX ring.
4486 	 */
4487 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4488 	    ("invalid channel index %d, should > 0 && < %d",
4489 	     idx, sc->hn_rx_ring_inuse));
4490 	rxr = &sc->hn_rx_ring[idx];
4491 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4492 	    ("RX ring %d already attached", idx));
4493 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4494 	rxr->hn_chan = chan;
4495 
4496 	if (bootverbose) {
4497 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4498 		    idx, vmbus_chan_id(chan));
4499 	}
4500 
4501 	if (idx < sc->hn_tx_ring_inuse) {
4502 		txr = &sc->hn_tx_ring[idx];
4503 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4504 		    ("TX ring %d already attached", idx));
4505 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4506 
4507 		txr->hn_chan = chan;
4508 		if (bootverbose) {
4509 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4510 			    idx, vmbus_chan_id(chan));
4511 		}
4512 	}
4513 
4514 	/* Bind this channel to a proper CPU. */
4515 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4516 
4517 	/*
4518 	 * Open this channel
4519 	 */
4520 	cbr.cbr = rxr->hn_br;
4521 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4522 	cbr.cbr_txsz = HN_TXBR_SIZE;
4523 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4524 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4525 	if (error) {
4526 		if (error == EISCONN) {
4527 			if_printf(sc->hn_ifp, "bufring is connected after "
4528 			    "chan%u open failure\n", vmbus_chan_id(chan));
4529 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4530 		} else {
4531 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4532 			    vmbus_chan_id(chan), error);
4533 		}
4534 	}
4535 	return (error);
4536 }
4537 
4538 static void
4539 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4540 {
4541 	struct hn_rx_ring *rxr;
4542 	int idx, error;
4543 
4544 	idx = vmbus_chan_subidx(chan);
4545 
4546 	/*
4547 	 * Link this channel to RX/TX ring.
4548 	 */
4549 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4550 	    ("invalid channel index %d, should > 0 && < %d",
4551 	     idx, sc->hn_rx_ring_inuse));
4552 	rxr = &sc->hn_rx_ring[idx];
4553 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4554 	    ("RX ring %d is not attached", idx));
4555 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4556 
4557 	if (idx < sc->hn_tx_ring_inuse) {
4558 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4559 
4560 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4561 		    ("TX ring %d is not attached attached", idx));
4562 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4563 	}
4564 
4565 	/*
4566 	 * Close this channel.
4567 	 *
4568 	 * NOTE:
4569 	 * Channel closing does _not_ destroy the target channel.
4570 	 */
4571 	error = vmbus_chan_close_direct(chan);
4572 	if (error == EISCONN) {
4573 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
4574 		    "after being closed\n", vmbus_chan_id(chan));
4575 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4576 	} else if (error) {
4577 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4578 		    vmbus_chan_id(chan), error);
4579 	}
4580 }
4581 
4582 static int
4583 hn_attach_subchans(struct hn_softc *sc)
4584 {
4585 	struct vmbus_channel **subchans;
4586 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4587 	int i, error = 0;
4588 
4589 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
4590 
4591 	/* Attach the sub-channels. */
4592 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4593 	for (i = 0; i < subchan_cnt; ++i) {
4594 		int error1;
4595 
4596 		error1 = hn_chan_attach(sc, subchans[i]);
4597 		if (error1) {
4598 			error = error1;
4599 			/* Move on; all channels will be detached later. */
4600 		}
4601 	}
4602 	vmbus_subchan_rel(subchans, subchan_cnt);
4603 
4604 	if (error) {
4605 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4606 	} else {
4607 		if (bootverbose) {
4608 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4609 			    subchan_cnt);
4610 		}
4611 	}
4612 	return (error);
4613 }
4614 
4615 static void
4616 hn_detach_allchans(struct hn_softc *sc)
4617 {
4618 	struct vmbus_channel **subchans;
4619 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4620 	int i;
4621 
4622 	if (subchan_cnt == 0)
4623 		goto back;
4624 
4625 	/* Detach the sub-channels. */
4626 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4627 	for (i = 0; i < subchan_cnt; ++i)
4628 		hn_chan_detach(sc, subchans[i]);
4629 	vmbus_subchan_rel(subchans, subchan_cnt);
4630 
4631 back:
4632 	/*
4633 	 * Detach the primary channel, _after_ all sub-channels
4634 	 * are detached.
4635 	 */
4636 	hn_chan_detach(sc, sc->hn_prichan);
4637 
4638 	/* Wait for sub-channels to be destroyed, if any. */
4639 	vmbus_subchan_drain(sc->hn_prichan);
4640 
4641 #ifdef INVARIANTS
4642 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4643 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4644 		    HN_RX_FLAG_ATTACHED) == 0,
4645 		    ("%dth RX ring is still attached", i));
4646 	}
4647 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4648 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4649 		    HN_TX_FLAG_ATTACHED) == 0,
4650 		    ("%dth TX ring is still attached", i));
4651 	}
4652 #endif
4653 }
4654 
4655 static int
4656 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4657 {
4658 	struct vmbus_channel **subchans;
4659 	int nchan, rxr_cnt, error;
4660 
4661 	nchan = *nsubch + 1;
4662 	if (nchan == 1) {
4663 		/*
4664 		 * Multiple RX/TX rings are not requested.
4665 		 */
4666 		*nsubch = 0;
4667 		return (0);
4668 	}
4669 
4670 	/*
4671 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4672 	 * table entries.
4673 	 */
4674 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4675 	if (error) {
4676 		/* No RSS; this is benign. */
4677 		*nsubch = 0;
4678 		return (0);
4679 	}
4680 	if (bootverbose) {
4681 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4682 		    rxr_cnt, nchan);
4683 	}
4684 
4685 	if (nchan > rxr_cnt)
4686 		nchan = rxr_cnt;
4687 	if (nchan == 1) {
4688 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4689 		*nsubch = 0;
4690 		return (0);
4691 	}
4692 
4693 	/*
4694 	 * Allocate sub-channels from NVS.
4695 	 */
4696 	*nsubch = nchan - 1;
4697 	error = hn_nvs_alloc_subchans(sc, nsubch);
4698 	if (error || *nsubch == 0) {
4699 		/* Failed to allocate sub-channels. */
4700 		*nsubch = 0;
4701 		return (0);
4702 	}
4703 
4704 	/*
4705 	 * Wait for all sub-channels to become ready before moving on.
4706 	 */
4707 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4708 	vmbus_subchan_rel(subchans, *nsubch);
4709 	return (0);
4710 }
4711 
4712 static bool
4713 hn_synth_attachable(const struct hn_softc *sc)
4714 {
4715 	int i;
4716 
4717 	if (sc->hn_flags & HN_FLAG_ERRORS)
4718 		return (false);
4719 
4720 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4721 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4722 
4723 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4724 			return (false);
4725 	}
4726 	return (true);
4727 }
4728 
4729 /*
4730  * Make sure that the RX filter is zero after the successful
4731  * RNDIS initialization.
4732  *
4733  * NOTE:
4734  * Under certain conditions on certain versions of Hyper-V,
4735  * the RNDIS rxfilter is _not_ zero on the hypervisor side
4736  * after the successful RNDIS initialization, which breaks
4737  * the assumption of any following code (well, it breaks the
4738  * RNDIS API contract actually).  Clear the RNDIS rxfilter
4739  * explicitly, drain packets sneaking through, and drain the
4740  * interrupt taskqueues scheduled due to the stealth packets.
4741  */
4742 static void
4743 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
4744 {
4745 
4746 	hn_disable_rx(sc);
4747 	hn_drain_rxtx(sc, nchan);
4748 }
4749 
4750 static int
4751 hn_synth_attach(struct hn_softc *sc, int mtu)
4752 {
4753 #define ATTACHED_NVS		0x0002
4754 #define ATTACHED_RNDIS		0x0004
4755 
4756 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4757 	int error, nsubch, nchan = 1, i, rndis_inited;
4758 	uint32_t old_caps, attached = 0;
4759 
4760 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4761 	    ("synthetic parts were attached"));
4762 
4763 	if (!hn_synth_attachable(sc))
4764 		return (ENXIO);
4765 
4766 	/* Save capabilities for later verification. */
4767 	old_caps = sc->hn_caps;
4768 	sc->hn_caps = 0;
4769 
4770 	/* Clear RSS stuffs. */
4771 	sc->hn_rss_ind_size = 0;
4772 	sc->hn_rss_hash = 0;
4773 
4774 	/*
4775 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4776 	 */
4777 	error = hn_chan_attach(sc, sc->hn_prichan);
4778 	if (error)
4779 		goto failed;
4780 
4781 	/*
4782 	 * Attach NVS.
4783 	 */
4784 	error = hn_nvs_attach(sc, mtu);
4785 	if (error)
4786 		goto failed;
4787 	attached |= ATTACHED_NVS;
4788 
4789 	/*
4790 	 * Attach RNDIS _after_ NVS is attached.
4791 	 */
4792 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
4793 	if (rndis_inited)
4794 		attached |= ATTACHED_RNDIS;
4795 	if (error)
4796 		goto failed;
4797 
4798 	/*
4799 	 * Make sure capabilities are not changed.
4800 	 */
4801 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4802 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4803 		    old_caps, sc->hn_caps);
4804 		error = ENXIO;
4805 		goto failed;
4806 	}
4807 
4808 	/*
4809 	 * Allocate sub-channels for multi-TX/RX rings.
4810 	 *
4811 	 * NOTE:
4812 	 * The # of RX rings that can be used is equivalent to the # of
4813 	 * channels to be requested.
4814 	 */
4815 	nsubch = sc->hn_rx_ring_cnt - 1;
4816 	error = hn_synth_alloc_subchans(sc, &nsubch);
4817 	if (error)
4818 		goto failed;
4819 	/* NOTE: _Full_ synthetic parts detach is required now. */
4820 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4821 
4822 	/*
4823 	 * Set the # of TX/RX rings that could be used according to
4824 	 * the # of channels that NVS offered.
4825 	 */
4826 	nchan = nsubch + 1;
4827 	hn_set_ring_inuse(sc, nchan);
4828 	if (nchan == 1) {
4829 		/* Only the primary channel can be used; done */
4830 		goto back;
4831 	}
4832 
4833 	/*
4834 	 * Attach the sub-channels.
4835 	 *
4836 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
4837 	 */
4838 	error = hn_attach_subchans(sc);
4839 	if (error)
4840 		goto failed;
4841 
4842 	/*
4843 	 * Configure RSS key and indirect table _after_ all sub-channels
4844 	 * are attached.
4845 	 */
4846 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4847 		/*
4848 		 * RSS key is not set yet; set it to the default RSS key.
4849 		 */
4850 		if (bootverbose)
4851 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4852 #ifdef RSS
4853 		rss_getkey(rss->rss_key);
4854 #else
4855 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4856 #endif
4857 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4858 	}
4859 
4860 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4861 		/*
4862 		 * RSS indirect table is not set yet; set it up in round-
4863 		 * robin fashion.
4864 		 */
4865 		if (bootverbose) {
4866 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4867 			    "table\n");
4868 		}
4869 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4870 			uint32_t subidx;
4871 
4872 #ifdef RSS
4873 			subidx = rss_get_indirection_to_bucket(i);
4874 #else
4875 			subidx = i;
4876 #endif
4877 			rss->rss_ind[i] = subidx % nchan;
4878 		}
4879 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4880 	} else {
4881 		/*
4882 		 * # of usable channels may be changed, so we have to
4883 		 * make sure that all entries in RSS indirect table
4884 		 * are valid.
4885 		 *
4886 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
4887 		 */
4888 		hn_rss_ind_fixup(sc);
4889 	}
4890 
4891 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4892 	if (error)
4893 		goto failed;
4894 back:
4895 	/*
4896 	 * Fixup transmission aggregation setup.
4897 	 */
4898 	hn_set_txagg(sc);
4899 	hn_rndis_init_fixat(sc, nchan);
4900 	return (0);
4901 
4902 failed:
4903 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4904 		hn_rndis_init_fixat(sc, nchan);
4905 		hn_synth_detach(sc);
4906 	} else {
4907 		if (attached & ATTACHED_RNDIS) {
4908 			hn_rndis_init_fixat(sc, nchan);
4909 			hn_rndis_detach(sc);
4910 		}
4911 		if (attached & ATTACHED_NVS)
4912 			hn_nvs_detach(sc);
4913 		hn_chan_detach(sc, sc->hn_prichan);
4914 		/* Restore old capabilities. */
4915 		sc->hn_caps = old_caps;
4916 	}
4917 	return (error);
4918 
4919 #undef ATTACHED_RNDIS
4920 #undef ATTACHED_NVS
4921 }
4922 
4923 /*
4924  * NOTE:
4925  * The interface must have been suspended though hn_suspend(), before
4926  * this function get called.
4927  */
4928 static void
4929 hn_synth_detach(struct hn_softc *sc)
4930 {
4931 
4932 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4933 	    ("synthetic parts were not attached"));
4934 
4935 	/* Detach the RNDIS first. */
4936 	hn_rndis_detach(sc);
4937 
4938 	/* Detach NVS. */
4939 	hn_nvs_detach(sc);
4940 
4941 	/* Detach all of the channels. */
4942 	hn_detach_allchans(sc);
4943 
4944 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4945 }
4946 
4947 static void
4948 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4949 {
4950 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4951 	    ("invalid ring count %d", ring_cnt));
4952 
4953 	if (sc->hn_tx_ring_cnt > ring_cnt)
4954 		sc->hn_tx_ring_inuse = ring_cnt;
4955 	else
4956 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4957 	sc->hn_rx_ring_inuse = ring_cnt;
4958 
4959 #ifdef RSS
4960 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4961 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4962 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4963 		    rss_getnumbuckets());
4964 	}
4965 #endif
4966 
4967 	if (bootverbose) {
4968 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4969 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4970 	}
4971 }
4972 
4973 static void
4974 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4975 {
4976 
4977 	/*
4978 	 * NOTE:
4979 	 * The TX bufring will not be drained by the hypervisor,
4980 	 * if the primary channel is revoked.
4981 	 */
4982 	while (!vmbus_chan_rx_empty(chan) ||
4983 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4984 	     !vmbus_chan_tx_empty(chan)))
4985 		pause("waitch", 1);
4986 	vmbus_chan_intr_drain(chan);
4987 }
4988 
4989 static void
4990 hn_disable_rx(struct hn_softc *sc)
4991 {
4992 
4993 	/*
4994 	 * Disable RX by clearing RX filter forcefully.
4995 	 */
4996 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4997 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
4998 
4999 	/*
5000 	 * Give RNDIS enough time to flush all pending data packets.
5001 	 */
5002 	pause("waitrx", (200 * hz) / 1000);
5003 }
5004 
5005 /*
5006  * NOTE:
5007  * RX/TX _must_ have been suspended/disabled, before this function
5008  * is called.
5009  */
5010 static void
5011 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5012 {
5013 	struct vmbus_channel **subch = NULL;
5014 	int nsubch;
5015 
5016 	/*
5017 	 * Drain RX/TX bufrings and interrupts.
5018 	 */
5019 	nsubch = nchan - 1;
5020 	if (nsubch > 0)
5021 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5022 
5023 	if (subch != NULL) {
5024 		int i;
5025 
5026 		for (i = 0; i < nsubch; ++i)
5027 			hn_chan_drain(sc, subch[i]);
5028 	}
5029 	hn_chan_drain(sc, sc->hn_prichan);
5030 
5031 	if (subch != NULL)
5032 		vmbus_subchan_rel(subch, nsubch);
5033 }
5034 
5035 static void
5036 hn_suspend_data(struct hn_softc *sc)
5037 {
5038 	struct hn_tx_ring *txr;
5039 	int i;
5040 
5041 	HN_LOCK_ASSERT(sc);
5042 
5043 	/*
5044 	 * Suspend TX.
5045 	 */
5046 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5047 		txr = &sc->hn_tx_ring[i];
5048 
5049 		mtx_lock(&txr->hn_tx_lock);
5050 		txr->hn_suspended = 1;
5051 		mtx_unlock(&txr->hn_tx_lock);
5052 		/* No one is able send more packets now. */
5053 
5054 		/*
5055 		 * Wait for all pending sends to finish.
5056 		 *
5057 		 * NOTE:
5058 		 * We will _not_ receive all pending send-done, if the
5059 		 * primary channel is revoked.
5060 		 */
5061 		while (hn_tx_ring_pending(txr) &&
5062 		    !vmbus_chan_is_revoked(sc->hn_prichan))
5063 			pause("hnwtx", 1 /* 1 tick */);
5064 	}
5065 
5066 	/*
5067 	 * Disable RX.
5068 	 */
5069 	hn_disable_rx(sc);
5070 
5071 	/*
5072 	 * Drain RX/TX.
5073 	 */
5074 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5075 
5076 	/*
5077 	 * Drain any pending TX tasks.
5078 	 *
5079 	 * NOTE:
5080 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
5081 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
5082 	 */
5083 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5084 		txr = &sc->hn_tx_ring[i];
5085 
5086 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5087 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5088 	}
5089 }
5090 
5091 static void
5092 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5093 {
5094 
5095 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5096 }
5097 
5098 static void
5099 hn_suspend_mgmt(struct hn_softc *sc)
5100 {
5101 	struct task task;
5102 
5103 	HN_LOCK_ASSERT(sc);
5104 
5105 	/*
5106 	 * Make sure that hn_mgmt_taskq0 can nolonger be accessed
5107 	 * through hn_mgmt_taskq.
5108 	 */
5109 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5110 	vmbus_chan_run_task(sc->hn_prichan, &task);
5111 
5112 	/*
5113 	 * Make sure that all pending management tasks are completed.
5114 	 */
5115 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5116 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5117 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
5118 }
5119 
5120 static void
5121 hn_suspend(struct hn_softc *sc)
5122 {
5123 
5124 	/* Disable polling. */
5125 	hn_polling(sc, 0);
5126 
5127 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5128 	    (sc->hn_flags & HN_FLAG_VF))
5129 		hn_suspend_data(sc);
5130 	hn_suspend_mgmt(sc);
5131 }
5132 
5133 static void
5134 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5135 {
5136 	int i;
5137 
5138 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5139 	    ("invalid TX ring count %d", tx_ring_cnt));
5140 
5141 	for (i = 0; i < tx_ring_cnt; ++i) {
5142 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5143 
5144 		mtx_lock(&txr->hn_tx_lock);
5145 		txr->hn_suspended = 0;
5146 		mtx_unlock(&txr->hn_tx_lock);
5147 	}
5148 }
5149 
5150 static void
5151 hn_resume_data(struct hn_softc *sc)
5152 {
5153 	int i;
5154 
5155 	HN_LOCK_ASSERT(sc);
5156 
5157 	/*
5158 	 * Re-enable RX.
5159 	 */
5160 	hn_rxfilter_config(sc);
5161 
5162 	/*
5163 	 * Make sure to clear suspend status on "all" TX rings,
5164 	 * since hn_tx_ring_inuse can be changed after
5165 	 * hn_suspend_data().
5166 	 */
5167 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5168 
5169 #ifdef HN_IFSTART_SUPPORT
5170 	if (!hn_use_if_start)
5171 #endif
5172 	{
5173 		/*
5174 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
5175 		 * reduced.
5176 		 */
5177 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5178 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5179 	}
5180 
5181 	/*
5182 	 * Kick start TX.
5183 	 */
5184 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5185 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5186 
5187 		/*
5188 		 * Use txeof task, so that any pending oactive can be
5189 		 * cleared properly.
5190 		 */
5191 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5192 	}
5193 }
5194 
5195 static void
5196 hn_resume_mgmt(struct hn_softc *sc)
5197 {
5198 
5199 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5200 
5201 	/*
5202 	 * Kick off network change detection, if it was pending.
5203 	 * If no network change was pending, start link status
5204 	 * checks, which is more lightweight than network change
5205 	 * detection.
5206 	 */
5207 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5208 		hn_change_network(sc);
5209 	else
5210 		hn_update_link_status(sc);
5211 }
5212 
5213 static void
5214 hn_resume(struct hn_softc *sc)
5215 {
5216 
5217 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5218 	    (sc->hn_flags & HN_FLAG_VF))
5219 		hn_resume_data(sc);
5220 
5221 	/*
5222 	 * When the VF is activated, the synthetic interface is changed
5223 	 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5224 	 * don't call hn_resume_mgmt() until the VF is deactivated in
5225 	 * hn_set_vf().
5226 	 */
5227 	if (!(sc->hn_flags & HN_FLAG_VF))
5228 		hn_resume_mgmt(sc);
5229 
5230 	/*
5231 	 * Re-enable polling if this interface is running and
5232 	 * the polling is requested.
5233 	 */
5234 	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5235 		hn_polling(sc, sc->hn_pollhz);
5236 }
5237 
5238 static void
5239 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5240 {
5241 	const struct rndis_status_msg *msg;
5242 	int ofs;
5243 
5244 	if (dlen < sizeof(*msg)) {
5245 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5246 		return;
5247 	}
5248 	msg = data;
5249 
5250 	switch (msg->rm_status) {
5251 	case RNDIS_STATUS_MEDIA_CONNECT:
5252 	case RNDIS_STATUS_MEDIA_DISCONNECT:
5253 		hn_update_link_status(sc);
5254 		break;
5255 
5256 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5257 		/* Not really useful; ignore. */
5258 		break;
5259 
5260 	case RNDIS_STATUS_NETWORK_CHANGE:
5261 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5262 		if (dlen < ofs + msg->rm_stbuflen ||
5263 		    msg->rm_stbuflen < sizeof(uint32_t)) {
5264 			if_printf(sc->hn_ifp, "network changed\n");
5265 		} else {
5266 			uint32_t change;
5267 
5268 			memcpy(&change, ((const uint8_t *)msg) + ofs,
5269 			    sizeof(change));
5270 			if_printf(sc->hn_ifp, "network changed, change %u\n",
5271 			    change);
5272 		}
5273 		hn_change_network(sc);
5274 		break;
5275 
5276 	default:
5277 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5278 		    msg->rm_status);
5279 		break;
5280 	}
5281 }
5282 
5283 static int
5284 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5285 {
5286 	const struct rndis_pktinfo *pi = info_data;
5287 	uint32_t mask = 0;
5288 
5289 	while (info_dlen != 0) {
5290 		const void *data;
5291 		uint32_t dlen;
5292 
5293 		if (__predict_false(info_dlen < sizeof(*pi)))
5294 			return (EINVAL);
5295 		if (__predict_false(info_dlen < pi->rm_size))
5296 			return (EINVAL);
5297 		info_dlen -= pi->rm_size;
5298 
5299 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5300 			return (EINVAL);
5301 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5302 			return (EINVAL);
5303 		dlen = pi->rm_size - pi->rm_pktinfooffset;
5304 		data = pi->rm_data;
5305 
5306 		switch (pi->rm_type) {
5307 		case NDIS_PKTINFO_TYPE_VLAN:
5308 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5309 				return (EINVAL);
5310 			info->vlan_info = *((const uint32_t *)data);
5311 			mask |= HN_RXINFO_VLAN;
5312 			break;
5313 
5314 		case NDIS_PKTINFO_TYPE_CSUM:
5315 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5316 				return (EINVAL);
5317 			info->csum_info = *((const uint32_t *)data);
5318 			mask |= HN_RXINFO_CSUM;
5319 			break;
5320 
5321 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5322 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5323 				return (EINVAL);
5324 			info->hash_value = *((const uint32_t *)data);
5325 			mask |= HN_RXINFO_HASHVAL;
5326 			break;
5327 
5328 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
5329 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5330 				return (EINVAL);
5331 			info->hash_info = *((const uint32_t *)data);
5332 			mask |= HN_RXINFO_HASHINF;
5333 			break;
5334 
5335 		default:
5336 			goto next;
5337 		}
5338 
5339 		if (mask == HN_RXINFO_ALL) {
5340 			/* All found; done */
5341 			break;
5342 		}
5343 next:
5344 		pi = (const struct rndis_pktinfo *)
5345 		    ((const uint8_t *)pi + pi->rm_size);
5346 	}
5347 
5348 	/*
5349 	 * Final fixup.
5350 	 * - If there is no hash value, invalidate the hash info.
5351 	 */
5352 	if ((mask & HN_RXINFO_HASHVAL) == 0)
5353 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5354 	return (0);
5355 }
5356 
5357 static __inline bool
5358 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5359 {
5360 
5361 	if (off < check_off) {
5362 		if (__predict_true(off + len <= check_off))
5363 			return (false);
5364 	} else if (off > check_off) {
5365 		if (__predict_true(check_off + check_len <= off))
5366 			return (false);
5367 	}
5368 	return (true);
5369 }
5370 
5371 static void
5372 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5373 {
5374 	const struct rndis_packet_msg *pkt;
5375 	struct hn_rxinfo info;
5376 	int data_off, pktinfo_off, data_len, pktinfo_len;
5377 
5378 	/*
5379 	 * Check length.
5380 	 */
5381 	if (__predict_false(dlen < sizeof(*pkt))) {
5382 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5383 		return;
5384 	}
5385 	pkt = data;
5386 
5387 	if (__predict_false(dlen < pkt->rm_len)) {
5388 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5389 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5390 		return;
5391 	}
5392 	if (__predict_false(pkt->rm_len <
5393 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5394 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5395 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
5396 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5397 		    pkt->rm_pktinfolen);
5398 		return;
5399 	}
5400 	if (__predict_false(pkt->rm_datalen == 0)) {
5401 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5402 		return;
5403 	}
5404 
5405 	/*
5406 	 * Check offests.
5407 	 */
5408 #define IS_OFFSET_INVALID(ofs)			\
5409 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
5410 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5411 
5412 	/* XXX Hyper-V does not meet data offset alignment requirement */
5413 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5414 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5415 		    "data offset %u\n", pkt->rm_dataoffset);
5416 		return;
5417 	}
5418 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5419 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5420 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5421 		    "oob offset %u\n", pkt->rm_oobdataoffset);
5422 		return;
5423 	}
5424 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5425 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5426 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5427 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5428 		return;
5429 	}
5430 
5431 #undef IS_OFFSET_INVALID
5432 
5433 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5434 	data_len = pkt->rm_datalen;
5435 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5436 	pktinfo_len = pkt->rm_pktinfolen;
5437 
5438 	/*
5439 	 * Check OOB coverage.
5440 	 */
5441 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
5442 		int oob_off, oob_len;
5443 
5444 		if_printf(rxr->hn_ifp, "got oobdata\n");
5445 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5446 		oob_len = pkt->rm_oobdatalen;
5447 
5448 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5449 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5450 			    "oob overflow, msglen %u, oob abs %d len %d\n",
5451 			    pkt->rm_len, oob_off, oob_len);
5452 			return;
5453 		}
5454 
5455 		/*
5456 		 * Check against data.
5457 		 */
5458 		if (hn_rndis_check_overlap(oob_off, oob_len,
5459 		    data_off, data_len)) {
5460 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5461 			    "oob overlaps data, oob abs %d len %d, "
5462 			    "data abs %d len %d\n",
5463 			    oob_off, oob_len, data_off, data_len);
5464 			return;
5465 		}
5466 
5467 		/*
5468 		 * Check against pktinfo.
5469 		 */
5470 		if (pktinfo_len != 0 &&
5471 		    hn_rndis_check_overlap(oob_off, oob_len,
5472 		    pktinfo_off, pktinfo_len)) {
5473 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5474 			    "oob overlaps pktinfo, oob abs %d len %d, "
5475 			    "pktinfo abs %d len %d\n",
5476 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
5477 			return;
5478 		}
5479 	}
5480 
5481 	/*
5482 	 * Check per-packet-info coverage and find useful per-packet-info.
5483 	 */
5484 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5485 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5486 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5487 	if (__predict_true(pktinfo_len != 0)) {
5488 		bool overlap;
5489 		int error;
5490 
5491 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5492 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5493 			    "pktinfo overflow, msglen %u, "
5494 			    "pktinfo abs %d len %d\n",
5495 			    pkt->rm_len, pktinfo_off, pktinfo_len);
5496 			return;
5497 		}
5498 
5499 		/*
5500 		 * Check packet info coverage.
5501 		 */
5502 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5503 		    data_off, data_len);
5504 		if (__predict_false(overlap)) {
5505 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5506 			    "pktinfo overlap data, pktinfo abs %d len %d, "
5507 			    "data abs %d len %d\n",
5508 			    pktinfo_off, pktinfo_len, data_off, data_len);
5509 			return;
5510 		}
5511 
5512 		/*
5513 		 * Find useful per-packet-info.
5514 		 */
5515 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5516 		    pktinfo_len, &info);
5517 		if (__predict_false(error)) {
5518 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5519 			    "pktinfo\n");
5520 			return;
5521 		}
5522 	}
5523 
5524 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
5525 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5526 		    "data overflow, msglen %u, data abs %d len %d\n",
5527 		    pkt->rm_len, data_off, data_len);
5528 		return;
5529 	}
5530 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5531 }
5532 
5533 static __inline void
5534 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5535 {
5536 	const struct rndis_msghdr *hdr;
5537 
5538 	if (__predict_false(dlen < sizeof(*hdr))) {
5539 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5540 		return;
5541 	}
5542 	hdr = data;
5543 
5544 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5545 		/* Hot data path. */
5546 		hn_rndis_rx_data(rxr, data, dlen);
5547 		/* Done! */
5548 		return;
5549 	}
5550 
5551 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5552 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5553 	else
5554 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5555 }
5556 
5557 static void
5558 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5559 {
5560 	const struct hn_nvs_hdr *hdr;
5561 
5562 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5563 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5564 		return;
5565 	}
5566 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5567 
5568 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5569 		/* Useless; ignore */
5570 		return;
5571 	}
5572 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5573 }
5574 
5575 static void
5576 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5577     const struct vmbus_chanpkt_hdr *pkt)
5578 {
5579 	struct hn_nvs_sendctx *sndc;
5580 
5581 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5582 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5583 	    VMBUS_CHANPKT_DATALEN(pkt));
5584 	/*
5585 	 * NOTE:
5586 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5587 	 * its callback.
5588 	 */
5589 }
5590 
5591 static void
5592 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5593     const struct vmbus_chanpkt_hdr *pkthdr)
5594 {
5595 	const struct vmbus_chanpkt_rxbuf *pkt;
5596 	const struct hn_nvs_hdr *nvs_hdr;
5597 	int count, i, hlen;
5598 
5599 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5600 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5601 		return;
5602 	}
5603 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5604 
5605 	/* Make sure that this is a RNDIS message. */
5606 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5607 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5608 		    nvs_hdr->nvs_type);
5609 		return;
5610 	}
5611 
5612 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5613 	if (__predict_false(hlen < sizeof(*pkt))) {
5614 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5615 		return;
5616 	}
5617 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5618 
5619 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5620 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5621 		    pkt->cp_rxbuf_id);
5622 		return;
5623 	}
5624 
5625 	count = pkt->cp_rxbuf_cnt;
5626 	if (__predict_false(hlen <
5627 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5628 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5629 		return;
5630 	}
5631 
5632 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5633 	for (i = 0; i < count; ++i) {
5634 		int ofs, len;
5635 
5636 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5637 		len = pkt->cp_rxbuf[i].rb_len;
5638 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5639 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5640 			    "ofs %d, len %d\n", i, ofs, len);
5641 			continue;
5642 		}
5643 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5644 	}
5645 
5646 	/*
5647 	 * Ack the consumed RXBUF associated w/ this channel packet,
5648 	 * so that this RXBUF can be recycled by the hypervisor.
5649 	 */
5650 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5651 }
5652 
5653 static void
5654 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5655     uint64_t tid)
5656 {
5657 	struct hn_nvs_rndis_ack ack;
5658 	int retries, error;
5659 
5660 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5661 	ack.nvs_status = HN_NVS_STATUS_OK;
5662 
5663 	retries = 0;
5664 again:
5665 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5666 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5667 	if (__predict_false(error == EAGAIN)) {
5668 		/*
5669 		 * NOTE:
5670 		 * This should _not_ happen in real world, since the
5671 		 * consumption of the TX bufring from the TX path is
5672 		 * controlled.
5673 		 */
5674 		if (rxr->hn_ack_failed == 0)
5675 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5676 		rxr->hn_ack_failed++;
5677 		retries++;
5678 		if (retries < 10) {
5679 			DELAY(100);
5680 			goto again;
5681 		}
5682 		/* RXBUF leaks! */
5683 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5684 	}
5685 }
5686 
5687 static void
5688 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5689 {
5690 	struct hn_rx_ring *rxr = xrxr;
5691 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5692 
5693 	for (;;) {
5694 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5695 		int error, pktlen;
5696 
5697 		pktlen = rxr->hn_pktbuf_len;
5698 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5699 		if (__predict_false(error == ENOBUFS)) {
5700 			void *nbuf;
5701 			int nlen;
5702 
5703 			/*
5704 			 * Expand channel packet buffer.
5705 			 *
5706 			 * XXX
5707 			 * Use M_WAITOK here, since allocation failure
5708 			 * is fatal.
5709 			 */
5710 			nlen = rxr->hn_pktbuf_len * 2;
5711 			while (nlen < pktlen)
5712 				nlen *= 2;
5713 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5714 
5715 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5716 			    rxr->hn_pktbuf_len, nlen);
5717 
5718 			free(rxr->hn_pktbuf, M_DEVBUF);
5719 			rxr->hn_pktbuf = nbuf;
5720 			rxr->hn_pktbuf_len = nlen;
5721 			/* Retry! */
5722 			continue;
5723 		} else if (__predict_false(error == EAGAIN)) {
5724 			/* No more channel packets; done! */
5725 			break;
5726 		}
5727 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5728 
5729 		switch (pkt->cph_type) {
5730 		case VMBUS_CHANPKT_TYPE_COMP:
5731 			hn_nvs_handle_comp(sc, chan, pkt);
5732 			break;
5733 
5734 		case VMBUS_CHANPKT_TYPE_RXBUF:
5735 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5736 			break;
5737 
5738 		case VMBUS_CHANPKT_TYPE_INBAND:
5739 			hn_nvs_handle_notify(sc, pkt);
5740 			break;
5741 
5742 		default:
5743 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5744 			    pkt->cph_type);
5745 			break;
5746 		}
5747 	}
5748 	hn_chan_rollup(rxr, rxr->hn_txr);
5749 }
5750 
5751 static void
5752 hn_tx_taskq_create(void *arg __unused)
5753 {
5754 	int i;
5755 
5756 	/*
5757 	 * Fix the # of TX taskqueues.
5758 	 */
5759 	if (hn_tx_taskq_cnt <= 0)
5760 		hn_tx_taskq_cnt = 1;
5761 	else if (hn_tx_taskq_cnt > mp_ncpus)
5762 		hn_tx_taskq_cnt = mp_ncpus;
5763 
5764 	/*
5765 	 * Fix the TX taskqueue mode.
5766 	 */
5767 	switch (hn_tx_taskq_mode) {
5768 	case HN_TX_TASKQ_M_INDEP:
5769 	case HN_TX_TASKQ_M_GLOBAL:
5770 	case HN_TX_TASKQ_M_EVTTQ:
5771 		break;
5772 	default:
5773 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
5774 		break;
5775 	}
5776 
5777 	if (vm_guest != VM_GUEST_HV)
5778 		return;
5779 
5780 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
5781 		return;
5782 
5783 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
5784 	    M_DEVBUF, M_WAITOK);
5785 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
5786 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
5787 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
5788 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
5789 		    "hn tx%d", i);
5790 	}
5791 }
5792 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5793     hn_tx_taskq_create, NULL);
5794 
5795 static void
5796 hn_tx_taskq_destroy(void *arg __unused)
5797 {
5798 
5799 	if (hn_tx_taskque != NULL) {
5800 		int i;
5801 
5802 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
5803 			taskqueue_free(hn_tx_taskque[i]);
5804 		free(hn_tx_taskque, M_DEVBUF);
5805 	}
5806 }
5807 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5808     hn_tx_taskq_destroy, NULL);
5809