xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision f391d6bc1d0464f62f1b8264666c897a680156b1)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_inet6.h"
59 #include "opt_inet.h"
60 
61 #include <sys/param.h>
62 #include <sys/bus.h>
63 #include <sys/kernel.h>
64 #include <sys/limits.h>
65 #include <sys/malloc.h>
66 #include <sys/mbuf.h>
67 #include <sys/module.h>
68 #include <sys/queue.h>
69 #include <sys/lock.h>
70 #include <sys/smp.h>
71 #include <sys/socket.h>
72 #include <sys/sockio.h>
73 #include <sys/sx.h>
74 #include <sys/sysctl.h>
75 #include <sys/systm.h>
76 #include <sys/taskqueue.h>
77 #include <sys/buf_ring.h>
78 
79 #include <machine/atomic.h>
80 #include <machine/in_cksum.h>
81 
82 #include <net/bpf.h>
83 #include <net/ethernet.h>
84 #include <net/if.h>
85 #include <net/if_media.h>
86 #include <net/if_types.h>
87 #include <net/if_var.h>
88 #include <net/rndis.h>
89 
90 #include <netinet/in_systm.h>
91 #include <netinet/in.h>
92 #include <netinet/ip.h>
93 #include <netinet/ip6.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_lro.h>
96 #include <netinet/udp.h>
97 
98 #include <dev/hyperv/include/hyperv.h>
99 #include <dev/hyperv/include/hyperv_busdma.h>
100 #include <dev/hyperv/include/vmbus.h>
101 #include <dev/hyperv/include/vmbus_xact.h>
102 
103 #include <dev/hyperv/netvsc/ndis.h>
104 #include <dev/hyperv/netvsc/if_hnreg.h>
105 #include <dev/hyperv/netvsc/if_hnvar.h>
106 #include <dev/hyperv/netvsc/hn_nvs.h>
107 #include <dev/hyperv/netvsc/hn_rndis.h>
108 
109 #include "vmbus_if.h"
110 
111 #define HN_IFSTART_SUPPORT
112 
113 #define HN_RING_CNT_DEF_MAX		8
114 
115 /* YYY should get it from the underlying channel */
116 #define HN_TX_DESC_CNT			512
117 
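/*
 * Worst-case RNDIS packet message header length: the base message plus
 * per-packet-info for the hash value, VLAN, LSOv2 and TX checksum.
 */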
118 #define HN_RNDIS_PKT_LEN					\
119 	(sizeof(struct rndis_packet_msg) +			\
120 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
121 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
122 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
123 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
124 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
125 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
126 
127 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
128 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
129 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
130 /* -1 for RNDIS packet message */
131 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
132 
133 #define HN_DIRECT_TX_SIZE_DEF		128
134 
135 #define HN_EARLY_TXEOF_THRESH		8
136 
137 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
138 
139 #define HN_LROENT_CNT_DEF		128
140 
141 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
142 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
143 /* YYY 2*MTU is a bit rough, but should be good enough. */
144 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
145 
146 #define HN_LRO_ACKCNT_DEF		1
147 
148 #define HN_LOCK_INIT(sc)		\
149 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
150 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
151 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
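/*
 * Acquire the softc lock by polling: retry sx_try_xlock() every 1ms
 * (via DELAY) until the exclusive lock is obtained.
 */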
152 #define HN_LOCK(sc)					\
153 do {							\
154 	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
155 		DELAY(1000);				\
156 } while (0)
157 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
158 
159 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
160 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
161 #define HN_CSUM_IP_HWASSIST(sc)		\
162 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
163 #define HN_CSUM_IP6_HWASSIST(sc)	\
164 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
165 
166 #define HN_PKTSIZE_MIN(align)		\
167 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
168 	    HN_RNDIS_PKT_LEN, (align))
169 #define HN_PKTSIZE(m, align)		\
170 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
171 
172 struct hn_txdesc {
173 #ifndef HN_USE_TXDESC_BUFRING
174 	SLIST_ENTRY(hn_txdesc)		link;
175 #endif
176 	STAILQ_ENTRY(hn_txdesc)		agg_link;
177 
178 	/* Aggregated txdescs, in sending order. */
179 	STAILQ_HEAD(, hn_txdesc)	agg_list;
180 
181 	/* The oldest packet, if transmission aggregation happens. */
182 	struct mbuf			*m;
183 	struct hn_tx_ring		*txr;
184 	int				refs;
185 	uint32_t			flags;	/* HN_TXD_FLAG_ */
186 	struct hn_nvs_sendctx		send_ctx;
187 	uint32_t			chim_index;
188 	int				chim_size;
189 
190 	bus_dmamap_t			data_dmap;
191 
192 	bus_addr_t			rndis_pkt_paddr;
193 	struct rndis_packet_msg		*rndis_pkt;
194 	bus_dmamap_t			rndis_pkt_dmap;
195 };
196 
197 #define HN_TXD_FLAG_ONLIST		0x0001
198 #define HN_TXD_FLAG_DMAMAP		0x0002
199 #define HN_TXD_FLAG_ONAGG		0x0004
200 
201 struct hn_rxinfo {
202 	uint32_t			vlan_info;
203 	uint32_t			csum_info;
204 	uint32_t			hash_info;
205 	uint32_t			hash_value;
206 };
207 
208 #define HN_RXINFO_VLAN			0x0001
209 #define HN_RXINFO_CSUM			0x0002
210 #define HN_RXINFO_HASHINF		0x0004
211 #define HN_RXINFO_HASHVAL		0x0008
212 #define HN_RXINFO_ALL			\
213 	(HN_RXINFO_VLAN |		\
214 	 HN_RXINFO_CSUM |		\
215 	 HN_RXINFO_HASHINF |		\
216 	 HN_RXINFO_HASHVAL)
217 
218 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
219 #define HN_NDIS_RXCSUM_INFO_INVALID	0
220 #define HN_NDIS_HASH_INFO_INVALID	0
221 
222 static int			hn_probe(device_t);
223 static int			hn_attach(device_t);
224 static int			hn_detach(device_t);
225 static int			hn_shutdown(device_t);
226 static void			hn_chan_callback(struct vmbus_channel *,
227 				    void *);
228 
229 static void			hn_init(void *);
230 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
231 #ifdef HN_IFSTART_SUPPORT
232 static void			hn_start(struct ifnet *);
233 #endif
234 static int			hn_transmit(struct ifnet *, struct mbuf *);
235 static void			hn_xmit_qflush(struct ifnet *);
236 static int			hn_ifmedia_upd(struct ifnet *);
237 static void			hn_ifmedia_sts(struct ifnet *,
238 				    struct ifmediareq *);
239 
240 static int			hn_rndis_rxinfo(const void *, int,
241 				    struct hn_rxinfo *);
242 static void			hn_rndis_rx_data(struct hn_rx_ring *,
243 				    const void *, int);
244 static void			hn_rndis_rx_status(struct hn_softc *,
245 				    const void *, int);
246 
247 static void			hn_nvs_handle_notify(struct hn_softc *,
248 				    const struct vmbus_chanpkt_hdr *);
249 static void			hn_nvs_handle_comp(struct hn_softc *,
250 				    struct vmbus_channel *,
251 				    const struct vmbus_chanpkt_hdr *);
252 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
253 				    struct vmbus_channel *,
254 				    const struct vmbus_chanpkt_hdr *);
255 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
256 				    struct vmbus_channel *, uint64_t);
257 
258 #if __FreeBSD_version >= 1100099
259 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
260 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
261 #endif
262 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
263 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
264 #if __FreeBSD_version < 1100095
265 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
266 #else
267 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
268 #endif
269 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
270 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
271 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
272 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
273 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
274 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
275 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
276 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
277 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
278 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
279 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
280 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
281 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
282 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
283 
284 static void			hn_stop(struct hn_softc *);
285 static void			hn_init_locked(struct hn_softc *);
286 static int			hn_chan_attach(struct hn_softc *,
287 				    struct vmbus_channel *);
288 static void			hn_chan_detach(struct hn_softc *,
289 				    struct vmbus_channel *);
290 static int			hn_attach_subchans(struct hn_softc *);
291 static void			hn_detach_allchans(struct hn_softc *);
292 static void			hn_chan_rollup(struct hn_rx_ring *,
293 				    struct hn_tx_ring *);
294 static void			hn_set_ring_inuse(struct hn_softc *, int);
295 static int			hn_synth_attach(struct hn_softc *, int);
296 static void			hn_synth_detach(struct hn_softc *);
297 static int			hn_synth_alloc_subchans(struct hn_softc *,
298 				    int *);
299 static void			hn_suspend(struct hn_softc *);
300 static void			hn_suspend_data(struct hn_softc *);
301 static void			hn_suspend_mgmt(struct hn_softc *);
302 static void			hn_resume(struct hn_softc *);
303 static void			hn_resume_data(struct hn_softc *);
304 static void			hn_resume_mgmt(struct hn_softc *);
305 static void			hn_suspend_mgmt_taskfunc(void *, int);
306 static void			hn_chan_drain(struct vmbus_channel *);
307 
308 static void			hn_update_link_status(struct hn_softc *);
309 static void			hn_change_network(struct hn_softc *);
310 static void			hn_link_taskfunc(void *, int);
311 static void			hn_netchg_init_taskfunc(void *, int);
312 static void			hn_netchg_status_taskfunc(void *, int);
313 static void			hn_link_status(struct hn_softc *);
314 
315 static int			hn_create_rx_data(struct hn_softc *, int);
316 static void			hn_destroy_rx_data(struct hn_softc *);
317 static int			hn_check_iplen(const struct mbuf *, int);
318 static int			hn_set_rxfilter(struct hn_softc *);
319 static int			hn_rss_reconfig(struct hn_softc *);
320 static void			hn_rss_ind_fixup(struct hn_softc *, int);
321 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
322 				    int, const struct hn_rxinfo *);
323 
324 static int			hn_tx_ring_create(struct hn_softc *, int);
325 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
326 static int			hn_create_tx_data(struct hn_softc *, int);
327 static void			hn_fixup_tx_data(struct hn_softc *);
328 static void			hn_destroy_tx_data(struct hn_softc *);
329 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
330 static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
331 				    struct hn_txdesc *, struct mbuf **);
332 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
333 				    struct hn_txdesc *);
334 static void			hn_set_chim_size(struct hn_softc *, int);
335 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
336 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
337 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
338 static void			hn_resume_tx(struct hn_softc *, int);
339 static void			hn_set_txagg(struct hn_softc *);
340 static void			*hn_try_txagg(struct ifnet *,
341 				    struct hn_tx_ring *, struct hn_txdesc *,
342 				    int);
343 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
344 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
345 				    struct hn_softc *, struct vmbus_channel *,
346 				    const void *, int);
347 static int			hn_txpkt_sglist(struct hn_tx_ring *,
348 				    struct hn_txdesc *);
349 static int			hn_txpkt_chim(struct hn_tx_ring *,
350 				    struct hn_txdesc *);
351 static int			hn_xmit(struct hn_tx_ring *, int);
352 static void			hn_xmit_taskfunc(void *, int);
353 static void			hn_xmit_txeof(struct hn_tx_ring *);
354 static void			hn_xmit_txeof_taskfunc(void *, int);
355 #ifdef HN_IFSTART_SUPPORT
356 static int			hn_start_locked(struct hn_tx_ring *, int);
357 static void			hn_start_taskfunc(void *, int);
358 static void			hn_start_txeof(struct hn_tx_ring *);
359 static void			hn_start_txeof_taskfunc(void *, int);
360 #endif
361 
362 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
363     "Hyper-V network interface");
364 
365 /* Trust tcp segment verification on host side. */
366 static int			hn_trust_hosttcp = 1;
367 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
368     &hn_trust_hosttcp, 0,
369     "Trust tcp segement verification on host side, "
370     "when csum info is missing (global setting)");
371 
372 /* Trust udp datagram verification on host side. */
373 static int			hn_trust_hostudp = 1;
374 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
375     &hn_trust_hostudp, 0,
376     "Trust udp datagram verification on host side, "
377     "when csum info is missing (global setting)");
378 
379 /* Trust ip packet verification on host side. */
380 static int			hn_trust_hostip = 1;
381 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
382     &hn_trust_hostip, 0,
383     "Trust ip packet verification on host side, "
384     "when csum info is missing (global setting)");
385 
386 /* Limit TSO burst size */
387 static int			hn_tso_maxlen = IP_MAXPACKET;
388 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
389     &hn_tso_maxlen, 0, "TSO burst limit");
390 
391 /* Limit chimney send size */
392 static int			hn_tx_chimney_size = 0;
393 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
394     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
395 
396 /* Limit the size of packet for direct transmission */
397 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
398 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
399     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
400 
401 /* # of LRO entries per RX ring */
402 #if defined(INET) || defined(INET6)
403 #if __FreeBSD_version >= 1100095
404 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
405 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
406     &hn_lro_entry_count, 0, "LRO entry count");
407 #endif
408 #endif
409 
410 /* Use shared TX taskqueue */
411 static int			hn_share_tx_taskq = 0;
412 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
413     &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
414 
415 #ifndef HN_USE_TXDESC_BUFRING
416 static int			hn_use_txdesc_bufring = 0;
417 #else
418 static int			hn_use_txdesc_bufring = 1;
419 #endif
420 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
421     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
422 
423 /* Bind TX taskqueue to the target CPU */
424 static int			hn_bind_tx_taskq = -1;
425 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
426     &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
427 
428 #ifdef HN_IFSTART_SUPPORT
429 /* Use ifnet.if_start instead of ifnet.if_transmit */
430 static int			hn_use_if_start = 0;
431 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
432     &hn_use_if_start, 0, "Use if_start TX method");
433 #endif
434 
435 /* # of channels to use */
436 static int			hn_chan_cnt = 0;
437 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
438     &hn_chan_cnt, 0,
439     "# of channels to use; each channel has one RX ring and one TX ring");
440 
441 /* # of transmit rings to use */
442 static int			hn_tx_ring_cnt = 0;
443 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
444     &hn_tx_ring_cnt, 0, "# of TX rings to use");
445 
446 /* Software TX ring depth */
447 static int			hn_tx_swq_depth = 0;
448 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
449     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
450 
451 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
452 #if __FreeBSD_version >= 1100095
453 static u_int			hn_lro_mbufq_depth = 0;
454 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
455     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
456 #endif
457 
458 /* Packet transmission aggregation size limit */
459 static int			hn_tx_agg_size = -1;
460 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
461     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
462 
463 /* Packet transmission aggregation count limit */
464 static int			hn_tx_agg_pkts = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
466     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
467 
468 static u_int			hn_cpu_index;	/* next CPU for channel */
469 static struct taskqueue		*hn_tx_taskq;	/* shared TX taskqueue */
470 
471 static const uint8_t
472 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
473 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
474 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
475 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
476 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
477 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
478 };
479 
480 static device_method_t hn_methods[] = {
481 	/* Device interface */
482 	DEVMETHOD(device_probe,		hn_probe),
483 	DEVMETHOD(device_attach,	hn_attach),
484 	DEVMETHOD(device_detach,	hn_detach),
485 	DEVMETHOD(device_shutdown,	hn_shutdown),
486 	DEVMETHOD_END
487 };
488 
489 static driver_t hn_driver = {
490 	"hn",
491 	hn_methods,
492 	sizeof(struct hn_softc)
493 };
494 
495 static devclass_t hn_devclass;
496 
497 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
498 MODULE_VERSION(hn, 1);
499 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
500 
501 #if __FreeBSD_version >= 1100099
502 static void
503 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
504 {
505 	int i;
506 
507 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
508 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
509 }
510 #endif
511 
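/*
 * Send an RNDIS data packet described by the TX ring's GPA
 * (scatter/gather) list; the chimney sending buffer is not used.
 */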
512 static int
513 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
514 {
515 
516 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
517 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
518 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
519 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
520 }
521 
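/*
 * Send an RNDIS data packet that has already been copied into the
 * chimney sending buffer slot recorded in the txdesc.
 */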
522 static int
523 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
524 {
525 	struct hn_nvs_rndis rndis;
526 
527 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
528 	    txd->chim_size > 0, ("invalid rndis chim txd"));
529 
530 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
531 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
532 	rndis.nvs_chim_idx = txd->chim_index;
533 	rndis.nvs_chim_sz = txd->chim_size;
534 
535 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
536 	    &rndis, sizeof(rndis), &txd->send_ctx));
537 }
538 
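/*
 * Allocate a chimney sending buffer slot by scanning the bitmap for a
 * clear bit and atomically claiming it.  Returns HN_NVS_CHIM_IDX_INVALID
 * if no slot is available.
 */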
539 static __inline uint32_t
540 hn_chim_alloc(struct hn_softc *sc)
541 {
542 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
543 	u_long *bmap = sc->hn_chim_bmap;
544 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
545 
546 	for (i = 0; i < bmap_cnt; ++i) {
547 		int idx;
548 
549 		idx = ffsl(~bmap[i]);
550 		if (idx == 0)
551 			continue;
552 
553 		--idx; /* ffsl is 1-based */
554 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
555 		    ("invalid i %d and idx %d", i, idx));
556 
557 		if (atomic_testandset_long(&bmap[i], idx))
558 			continue;
559 
560 		ret = i * LONG_BIT + idx;
561 		break;
562 	}
563 	return (ret);
564 }
565 
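/*
 * Return a chimney sending buffer slot by clearing its bit in the bitmap.
 */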
566 static __inline void
567 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
568 {
569 	u_long mask;
570 	uint32_t idx;
571 
572 	idx = chim_idx / LONG_BIT;
573 	KASSERT(idx < sc->hn_chim_bmap_cnt,
574 	    ("invalid chimney index 0x%x", chim_idx));
575 
576 	mask = 1UL << (chim_idx % LONG_BIT);
577 	KASSERT(sc->hn_chim_bmap[idx] & mask,
578 	    ("index bitmap 0x%lx, chimney index %u, "
579 	     "bitmap idx %d, bitmask 0x%lx",
580 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
581 
582 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
583 }
584 
585 #if defined(INET6) || defined(INET)
586 /*
587  * NOTE: If this function fails, m_head will be freed.
588  */
589 static __inline struct mbuf *
590 hn_tso_fixup(struct mbuf *m_head)
591 {
592 	struct ether_vlan_header *evl;
593 	struct tcphdr *th;
594 	int ehlen;
595 
596 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
597 
598 #define PULLUP_HDR(m, len)				\
599 do {							\
600 	if (__predict_false((m)->m_len < (len))) {	\
601 		(m) = m_pullup((m), (len));		\
602 		if ((m) == NULL)			\
603 			return (NULL);			\
604 	}						\
605 } while (0)
606 
607 	PULLUP_HDR(m_head, sizeof(*evl));
608 	evl = mtod(m_head, struct ether_vlan_header *);
609 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
610 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
611 	else
612 		ehlen = ETHER_HDR_LEN;
613 
614 #ifdef INET
615 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
616 		struct ip *ip;
617 		int iphlen;
618 
619 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
620 		ip = mtodo(m_head, ehlen);
621 		iphlen = ip->ip_hl << 2;
622 
623 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
624 		th = mtodo(m_head, ehlen + iphlen);
625 
626 		ip->ip_len = 0;
627 		ip->ip_sum = 0;
628 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
629 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
630 	}
631 #endif
632 #if defined(INET6) && defined(INET)
633 	else
634 #endif
635 #ifdef INET6
636 	{
637 		struct ip6_hdr *ip6;
638 
639 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
640 		ip6 = mtodo(m_head, ehlen);
641 		if (ip6->ip6_nxt != IPPROTO_TCP) {
642 			m_freem(m_head);
643 			return (NULL);
644 		}
645 
646 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
647 		th = mtodo(m_head, ehlen + sizeof(*ip6));
648 
649 		ip6->ip6_plen = 0;
650 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
651 	}
652 #endif
653 	return (m_head);
654 
655 #undef PULLUP_HDR
656 }
657 #endif	/* INET6 || INET */
658 
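/*
 * Program the RNDIS RX filter from the interface flags; the request is
 * only issued when the computed filter differs from the current one.
 */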
659 static int
660 hn_set_rxfilter(struct hn_softc *sc)
661 {
662 	struct ifnet *ifp = sc->hn_ifp;
663 	uint32_t filter;
664 	int error = 0;
665 
666 	HN_LOCK_ASSERT(sc);
667 
668 	if (ifp->if_flags & IFF_PROMISC) {
669 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
670 	} else {
671 		filter = NDIS_PACKET_TYPE_DIRECTED;
672 		if (ifp->if_flags & IFF_BROADCAST)
673 			filter |= NDIS_PACKET_TYPE_BROADCAST;
674 		/* TODO: support multicast list */
675 		if ((ifp->if_flags & IFF_ALLMULTI) ||
676 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
677 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
678 	}
679 
680 	if (sc->hn_rx_filter != filter) {
681 		error = hn_rndis_set_rxfilter(sc, filter);
682 		if (!error)
683 			sc->hn_rx_filter = filter;
684 	}
685 	return (error);
686 }
687 
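/*
 * Propagate the transmit aggregation limits (size, packet count and
 * alignment) to all TX rings, clamping them to the RNDIS offered limits
 * and to the chimney sending buffer size; aggregation is disabled if the
 * resulting limits are too small to be useful.
 */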
688 static void
689 hn_set_txagg(struct hn_softc *sc)
690 {
691 	uint32_t size, pkts;
692 	int i;
693 
694 	/*
695 	 * Setup aggregation size.
696 	 */
697 	if (sc->hn_agg_size < 0)
698 		size = UINT32_MAX;
699 	else
700 		size = sc->hn_agg_size;
701 
702 	if (sc->hn_rndis_agg_size < size)
703 		size = sc->hn_rndis_agg_size;
704 
705 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
706 		/* Disable */
707 		size = 0;
708 		pkts = 0;
709 		goto done;
710 	}
711 
712 	/* NOTE: Type of the per TX ring setting is 'int'. */
713 	if (size > INT_MAX)
714 		size = INT_MAX;
715 
716 	/* NOTE: We only aggregate packets using chimney sending buffers. */
717 	if (size > (uint32_t)sc->hn_chim_szmax)
718 		size = sc->hn_chim_szmax;
719 
720 	/*
721 	 * Setup aggregation packet count.
722 	 */
723 	if (sc->hn_agg_pkts < 0)
724 		pkts = UINT32_MAX;
725 	else
726 		pkts = sc->hn_agg_pkts;
727 
728 	if (sc->hn_rndis_agg_pkts < pkts)
729 		pkts = sc->hn_rndis_agg_pkts;
730 
731 	if (pkts <= 1) {
732 		/* Disable */
733 		size = 0;
734 		pkts = 0;
735 		goto done;
736 	}
737 
738 	/* NOTE: Type of the per TX ring setting is 'short'. */
739 	if (pkts > SHRT_MAX)
740 		pkts = SHRT_MAX;
741 
742 done:
743 	/* NOTE: Type of the per TX ring setting is 'short'. */
744 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
745 		/* Disable */
746 		size = 0;
747 		pkts = 0;
748 	}
749 
750 	if (bootverbose) {
751 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
752 		    size, pkts, sc->hn_rndis_agg_align);
753 	}
754 
755 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
756 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
757 
758 		mtx_lock(&txr->hn_tx_lock);
759 		txr->hn_agg_szmax = size;
760 		txr->hn_agg_pktmax = pkts;
761 		txr->hn_agg_align = sc->hn_rndis_agg_align;
762 		mtx_unlock(&txr->hn_tx_lock);
763 	}
764 }
765 
766 static int
767 hn_get_txswq_depth(const struct hn_tx_ring *txr)
768 {
769 
770 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not set up yet"));
771 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
772 		return txr->hn_txdesc_cnt;
773 	return hn_tx_swq_depth;
774 }
775 
776 static int
777 hn_rss_reconfig(struct hn_softc *sc)
778 {
779 	int error;
780 
781 	HN_LOCK_ASSERT(sc);
782 
783 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
784 		return (ENXIO);
785 
786 	/*
787 	 * Disable RSS first.
788 	 *
789 	 * NOTE:
790 	 * Direct reconfiguration by setting the UNCHG flags does
791 	 * _not_ work properly.
792 	 */
793 	if (bootverbose)
794 		if_printf(sc->hn_ifp, "disable RSS\n");
795 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
796 	if (error) {
797 		if_printf(sc->hn_ifp, "RSS disable failed\n");
798 		return (error);
799 	}
800 
801 	/*
802 	 * Reenable the RSS w/ the updated RSS key or indirect
803 	 * table.
804 	 */
805 	if (bootverbose)
806 		if_printf(sc->hn_ifp, "reconfig RSS\n");
807 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
808 	if (error) {
809 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
810 		return (error);
811 	}
812 	return (0);
813 }
814 
815 static void
816 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
817 {
818 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
819 	int i;
820 
821 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
822 
823 	/*
824 	 * Check the indirect table to make sure that all channels in it
825 	 * can be used.
826 	 */
827 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
828 		if (rss->rss_ind[i] >= nchan) {
829 			if_printf(sc->hn_ifp,
830 			    "RSS indirect table %d fixup: %u -> %d\n",
831 			    i, rss->rss_ind[i], nchan - 1);
832 			rss->rss_ind[i] = nchan - 1;
833 		}
834 	}
835 }
836 
837 static int
838 hn_ifmedia_upd(struct ifnet *ifp __unused)
839 {
840 
841 	return EOPNOTSUPP;
842 }
843 
844 static void
845 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
846 {
847 	struct hn_softc *sc = ifp->if_softc;
848 
849 	ifmr->ifm_status = IFM_AVALID;
850 	ifmr->ifm_active = IFM_ETHER;
851 
852 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
853 		ifmr->ifm_active |= IFM_NONE;
854 		return;
855 	}
856 	ifmr->ifm_status |= IFM_ACTIVE;
857 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
858 }
859 
860 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
861 static const struct hyperv_guid g_net_vsc_device_type = {
862 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
863 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
864 };
865 
866 static int
867 hn_probe(device_t dev)
868 {
869 
870 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
871 	    &g_net_vsc_device_type) == 0) {
872 		device_set_desc(dev, "Hyper-V Network Interface");
873 		return BUS_PROBE_DEFAULT;
874 	}
875 	return ENXIO;
876 }
877 
878 static int
879 hn_attach(device_t dev)
880 {
881 	struct hn_softc *sc = device_get_softc(dev);
882 	struct sysctl_oid_list *child;
883 	struct sysctl_ctx_list *ctx;
884 	uint8_t eaddr[ETHER_ADDR_LEN];
885 	struct ifnet *ifp = NULL;
886 	int error, ring_cnt, tx_ring_cnt;
887 
888 	sc->hn_dev = dev;
889 	sc->hn_prichan = vmbus_get_channel(dev);
890 	HN_LOCK_INIT(sc);
891 
892 	/*
893 	 * Initialize these tunables once.
894 	 */
895 	sc->hn_agg_size = hn_tx_agg_size;
896 	sc->hn_agg_pkts = hn_tx_agg_pkts;
897 
898 	/*
899 	 * Setup taskqueue for transmission.
900 	 */
901 	if (hn_tx_taskq == NULL) {
902 		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
903 		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
904 		if (hn_bind_tx_taskq >= 0) {
905 			int cpu = hn_bind_tx_taskq;
906 			cpuset_t cpu_set;
907 
908 			if (cpu > mp_ncpus - 1)
909 				cpu = mp_ncpus - 1;
910 			CPU_SETOF(cpu, &cpu_set);
911 			taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1,
912 			    PI_NET, &cpu_set, "%s tx",
913 			    device_get_nameunit(dev));
914 		} else {
915 			taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET,
916 			    "%s tx", device_get_nameunit(dev));
917 		}
918 	} else {
919 		sc->hn_tx_taskq = hn_tx_taskq;
920 	}
921 
922 	/*
923 	 * Setup taskqueue for management tasks, e.g. link status.
924 	 */
925 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
926 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
927 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
928 	    device_get_nameunit(dev));
929 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
930 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
931 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
932 	    hn_netchg_status_taskfunc, sc);
933 
934 	/*
935 	 * Allocate ifnet and setup its name earlier, so that if_printf
936 	 * can be used by functions which will be called after
937 	 * ether_ifattach().
938 	 */
939 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
940 	ifp->if_softc = sc;
941 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
942 
943 	/*
944 	 * Initialize ifmedia earlier so that it can be unconditionally
945 	 * destroyed, if an error happens later on.
946 	 */
947 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
948 
949 	/*
950 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
951 	 * to use (tx_ring_cnt).
952 	 *
953 	 * NOTE:
954 	 * The # of RX rings to use is the same as the # of channels to use.
955 	 */
956 	ring_cnt = hn_chan_cnt;
957 	if (ring_cnt <= 0) {
958 		/* Default */
959 		ring_cnt = mp_ncpus;
960 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
961 			ring_cnt = HN_RING_CNT_DEF_MAX;
962 	} else if (ring_cnt > mp_ncpus) {
963 		ring_cnt = mp_ncpus;
964 	}
965 
966 	tx_ring_cnt = hn_tx_ring_cnt;
967 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
968 		tx_ring_cnt = ring_cnt;
969 #ifdef HN_IFSTART_SUPPORT
970 	if (hn_use_if_start) {
971 		/* ifnet.if_start only needs one TX ring. */
972 		tx_ring_cnt = 1;
973 	}
974 #endif
975 
976 	/*
977 	 * Set the leader CPU for channels.
978 	 */
979 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
980 
981 	/*
982 	 * Create enough TX/RX rings, even if only a limited number of
983 	 * channels can be allocated.
984 	 */
985 	error = hn_create_tx_data(sc, tx_ring_cnt);
986 	if (error)
987 		goto failed;
988 	error = hn_create_rx_data(sc, ring_cnt);
989 	if (error)
990 		goto failed;
991 
992 	/*
993 	 * Create transaction context for NVS and RNDIS transactions.
994 	 */
995 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
996 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
997 	if (sc->hn_xact == NULL)
998 		goto failed;
999 
1000 	/*
1001 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
1002 	 */
1003 	error = hn_synth_attach(sc, ETHERMTU);
1004 	if (error)
1005 		goto failed;
1006 
1007 	error = hn_rndis_get_eaddr(sc, eaddr);
1008 	if (error)
1009 		goto failed;
1010 
1011 #if __FreeBSD_version >= 1100099
1012 	if (sc->hn_rx_ring_inuse > 1) {
1013 		/*
1014 		 * Reduce TCP segment aggregation limit for multiple
1015 		 * RX rings to increase ACK timeliness.
1016 		 */
1017 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1018 	}
1019 #endif
1020 
1021 	/*
1022 	 * Fixup TX settings after the synthetic parts are attached.
1023 	 */
1024 	hn_fixup_tx_data(sc);
1025 
1026 	ctx = device_get_sysctl_ctx(dev);
1027 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1028 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1029 	    &sc->hn_nvs_ver, 0, "NVS version");
1030 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1031 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1032 	    hn_ndis_version_sysctl, "A", "NDIS version");
1033 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1034 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1035 	    hn_caps_sysctl, "A", "capabilities");
1036 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1037 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1038 	    hn_hwassist_sysctl, "A", "hwassist");
1039 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1040 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1041 	    hn_rxfilter_sysctl, "A", "rxfilter");
1042 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1043 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1044 	    hn_rss_hash_sysctl, "A", "RSS hash");
1045 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1046 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1047 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1048 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1049 	    hn_rss_key_sysctl, "IU", "RSS key");
1050 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1051 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1052 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
1053 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1054 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1055 	    "RNDIS offered packet transmission aggregation size limit");
1056 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1057 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1058 	    "RNDIS offered packet transmission aggregation count limit");
1059 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1060 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1061 	    "RNDIS packet transmission aggregation alignment");
1062 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1063 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1064 	    hn_txagg_size_sysctl, "I",
1065 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1066 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1067 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1068 	    hn_txagg_pkts_sysctl, "I",
1069 	    "Packet transmission aggregation packets, "
1070 	    "0 -- disable, -1 -- auto");
1071 
1072 	/*
1073 	 * Setup the ifmedia, which has been initialized earlier.
1074 	 */
1075 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1076 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1077 	/* XXX ifmedia_set really should do this for us */
1078 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1079 
1080 	/*
1081 	 * Setup the ifnet for this interface.
1082 	 */
1083 
1084 	ifp->if_baudrate = IF_Gbps(10);
1085 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1086 	ifp->if_ioctl = hn_ioctl;
1087 	ifp->if_init = hn_init;
1088 #ifdef HN_IFSTART_SUPPORT
1089 	if (hn_use_if_start) {
1090 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1091 
1092 		ifp->if_start = hn_start;
1093 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1094 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1095 		IFQ_SET_READY(&ifp->if_snd);
1096 	} else
1097 #endif
1098 	{
1099 		ifp->if_transmit = hn_transmit;
1100 		ifp->if_qflush = hn_xmit_qflush;
1101 	}
1102 
1103 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1104 #ifdef foo
1105 	/* We cannot distinguish IPv6 packets from IPv4 packets on the RX path. */
1106 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1107 #endif
1108 	if (sc->hn_caps & HN_CAP_VLAN) {
1109 		/* XXX not sure about VLAN_MTU. */
1110 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1111 	}
1112 
1113 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1114 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1115 		ifp->if_capabilities |= IFCAP_TXCSUM;
1116 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1117 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1118 	if (sc->hn_caps & HN_CAP_TSO4) {
1119 		ifp->if_capabilities |= IFCAP_TSO4;
1120 		ifp->if_hwassist |= CSUM_IP_TSO;
1121 	}
1122 	if (sc->hn_caps & HN_CAP_TSO6) {
1123 		ifp->if_capabilities |= IFCAP_TSO6;
1124 		ifp->if_hwassist |= CSUM_IP6_TSO;
1125 	}
1126 
1127 	/* Enable all available capabilities by default. */
1128 	ifp->if_capenable = ifp->if_capabilities;
1129 
1130 	/*
1131 	 * Disable IPv6 TSO and TXCSUM by default; they can still
1132 	 * be enabled through SIOCSIFCAP.
1133 	 */
1134 	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1135 	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1136 
1137 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1138 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1139 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1140 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1141 	}
1142 
1143 	ether_ifattach(ifp, eaddr);
1144 
1145 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1146 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1147 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1148 	}
1149 
1150 	/* Inform the upper layer about long frame support. */
1151 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1152 
1153 	/*
1154 	 * Kick off link status check.
1155 	 */
1156 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1157 	hn_update_link_status(sc);
1158 
1159 	return (0);
1160 failed:
1161 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1162 		hn_synth_detach(sc);
1163 	hn_detach(dev);
1164 	return (error);
1165 }
1166 
1167 static int
1168 hn_detach(device_t dev)
1169 {
1170 	struct hn_softc *sc = device_get_softc(dev);
1171 	struct ifnet *ifp = sc->hn_ifp;
1172 
1173 	if (device_is_attached(dev)) {
1174 		HN_LOCK(sc);
1175 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1176 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1177 				hn_stop(sc);
1178 			/*
1179 			 * NOTE:
1180 			 * hn_stop() only suspends data, so management
1181 			 * tasks have to be suspended manually here.
1182 			 */
1183 			hn_suspend_mgmt(sc);
1184 			hn_synth_detach(sc);
1185 		}
1186 		HN_UNLOCK(sc);
1187 		ether_ifdetach(ifp);
1188 	}
1189 
1190 	ifmedia_removeall(&sc->hn_media);
1191 	hn_destroy_rx_data(sc);
1192 	hn_destroy_tx_data(sc);
1193 
1194 	if (sc->hn_tx_taskq != hn_tx_taskq)
1195 		taskqueue_free(sc->hn_tx_taskq);
1196 	taskqueue_free(sc->hn_mgmt_taskq0);
1197 
1198 	if (sc->hn_xact != NULL)
1199 		vmbus_xact_ctx_destroy(sc->hn_xact);
1200 
1201 	if_free(ifp);
1202 
1203 	HN_LOCK_DESTROY(sc);
1204 	return (0);
1205 }
1206 
1207 static int
1208 hn_shutdown(device_t dev)
1209 {
1210 
1211 	return (0);
1212 }
1213 
1214 static void
1215 hn_link_status(struct hn_softc *sc)
1216 {
1217 	uint32_t link_status;
1218 	int error;
1219 
1220 	error = hn_rndis_get_linkstatus(sc, &link_status);
1221 	if (error) {
1222 		/* XXX what to do? */
1223 		return;
1224 	}
1225 
1226 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1227 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1228 	else
1229 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1230 	if_link_state_change(sc->hn_ifp,
1231 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1232 	    LINK_STATE_UP : LINK_STATE_DOWN);
1233 }
1234 
1235 static void
1236 hn_link_taskfunc(void *xsc, int pending __unused)
1237 {
1238 	struct hn_softc *sc = xsc;
1239 
1240 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1241 		return;
1242 	hn_link_status(sc);
1243 }
1244 
1245 static void
1246 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1247 {
1248 	struct hn_softc *sc = xsc;
1249 
1250 	/* Prevent any link status checks from running. */
1251 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1252 
1253 	/*
1254 	 * Fake up a [link down --> link up] state change; a 5 second
1255 	 * delay is used, which closely simulates the miibus reaction
1256 	 * to a link down event.
1257 	 */
1258 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1259 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1260 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1261 	    &sc->hn_netchg_status, 5 * hz);
1262 }
1263 
1264 static void
1265 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1266 {
1267 	struct hn_softc *sc = xsc;
1268 
1269 	/* Re-allow link status checks. */
1270 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1271 	hn_link_status(sc);
1272 }
1273 
1274 static void
1275 hn_update_link_status(struct hn_softc *sc)
1276 {
1277 
1278 	if (sc->hn_mgmt_taskq != NULL)
1279 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1280 }
1281 
1282 static void
1283 hn_change_network(struct hn_softc *sc)
1284 {
1285 
1286 	if (sc->hn_mgmt_taskq != NULL)
1287 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1288 }
1289 
1290 static __inline int
1291 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1292     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1293 {
1294 	struct mbuf *m = *m_head;
1295 	int error;
1296 
1297 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1298 
1299 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1300 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1301 	if (error == EFBIG) {
1302 		struct mbuf *m_new;
1303 
1304 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1305 		if (m_new == NULL)
1306 			return ENOBUFS;
1307 		else
1308 			*m_head = m = m_new;
1309 		txr->hn_tx_collapsed++;
1310 
1311 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1312 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1313 	}
1314 	if (!error) {
1315 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1316 		    BUS_DMASYNC_PREWRITE);
1317 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1318 	}
1319 	return error;
1320 }
1321 
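/*
 * Drop one reference on the txdesc.  When the last reference goes away,
 * free any aggregated txdescs, release the chimney buffer slot or DMA
 * map, free the mbuf, and put the txdesc back onto the free list.
 * Returns 1 if the txdesc was freed, 0 otherwise.
 */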
1322 static __inline int
1323 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1324 {
1325 
1326 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1327 	    ("put an onlist txd %#x", txd->flags));
1328 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1329 	    ("put an onagg txd %#x", txd->flags));
1330 
1331 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1332 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1333 		return 0;
1334 
1335 	if (!STAILQ_EMPTY(&txd->agg_list)) {
1336 		struct hn_txdesc *tmp_txd;
1337 
1338 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1339 			int freed;
1340 
1341 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1342 			    ("resursive aggregation on aggregated txdesc"));
1343 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1344 			    ("not aggregated txdesc"));
1345 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1346 			    ("aggregated txdesc uses dmamap"));
1347 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1348 			    ("aggregated txdesc consumes "
1349 			     "chimney sending buffer"));
1350 			KASSERT(tmp_txd->chim_size == 0,
1351 			    ("aggregated txdesc has non-zero "
1352 			     "chimney sending size"));
1353 
1354 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1355 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1356 			freed = hn_txdesc_put(txr, tmp_txd);
1357 			KASSERT(freed, ("failed to free aggregated txdesc"));
1358 		}
1359 	}
1360 
1361 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1362 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1363 		    ("chim txd uses dmamap"));
1364 		hn_chim_free(txr->hn_sc, txd->chim_index);
1365 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1366 		txd->chim_size = 0;
1367 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1368 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1369 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1370 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1371 		    txd->data_dmap);
1372 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1373 	}
1374 
1375 	if (txd->m != NULL) {
1376 		m_freem(txd->m);
1377 		txd->m = NULL;
1378 	}
1379 
1380 	txd->flags |= HN_TXD_FLAG_ONLIST;
1381 #ifndef HN_USE_TXDESC_BUFRING
1382 	mtx_lock_spin(&txr->hn_txlist_spin);
1383 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1384 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1385 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1386 	txr->hn_txdesc_avail++;
1387 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1388 	mtx_unlock_spin(&txr->hn_txlist_spin);
1389 #else
1390 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1391 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1392 #endif
1393 
1394 	return 1;
1395 }
1396 
1397 static __inline struct hn_txdesc *
1398 hn_txdesc_get(struct hn_tx_ring *txr)
1399 {
1400 	struct hn_txdesc *txd;
1401 
1402 #ifndef HN_USE_TXDESC_BUFRING
1403 	mtx_lock_spin(&txr->hn_txlist_spin);
1404 	txd = SLIST_FIRST(&txr->hn_txlist);
1405 	if (txd != NULL) {
1406 		KASSERT(txr->hn_txdesc_avail > 0,
1407 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1408 		txr->hn_txdesc_avail--;
1409 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1410 	}
1411 	mtx_unlock_spin(&txr->hn_txlist_spin);
1412 #else
1413 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1414 #endif
1415 
1416 	if (txd != NULL) {
1417 #ifdef HN_USE_TXDESC_BUFRING
1418 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1419 #endif
1420 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1421 		    STAILQ_EMPTY(&txd->agg_list) &&
1422 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1423 		    txd->chim_size == 0 &&
1424 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1425 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1426 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1427 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1428 		txd->refs = 1;
1429 	}
1430 	return txd;
1431 }
1432 
1433 static __inline void
1434 hn_txdesc_hold(struct hn_txdesc *txd)
1435 {
1436 
1437 	/* 0->1 transition will never work */
1438 	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
1439 	atomic_add_int(&txd->refs, 1);
1440 }
1441 
1442 static __inline void
1443 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1444 {
1445 
1446 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1447 	    ("recursive aggregation on aggregating txdesc"));
1448 
1449 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1450 	    ("already aggregated"));
1451 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
1452 	    ("recursive aggregation on to-be-aggregated txdesc"));
1453 
1454 	txd->flags |= HN_TXD_FLAG_ONAGG;
1455 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1456 }
1457 
1458 static bool
1459 hn_tx_ring_pending(struct hn_tx_ring *txr)
1460 {
1461 	bool pending = false;
1462 
1463 #ifndef HN_USE_TXDESC_BUFRING
1464 	mtx_lock_spin(&txr->hn_txlist_spin);
1465 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1466 		pending = true;
1467 	mtx_unlock_spin(&txr->hn_txlist_spin);
1468 #else
1469 	if (!buf_ring_full(txr->hn_txdesc_br))
1470 		pending = true;
1471 #endif
1472 	return (pending);
1473 }
1474 
1475 static __inline void
1476 hn_txeof(struct hn_tx_ring *txr)
1477 {
1478 	txr->hn_has_txeof = 0;
1479 	txr->hn_txeof(txr);
1480 }
1481 
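/*
 * NVS send-completion callback: release the txdesc and, once enough
 * completions have accumulated on an oactive TX ring, run the txeof
 * handler early.
 */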
1482 static void
1483 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1484     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1485 {
1486 	struct hn_txdesc *txd = sndc->hn_cbarg;
1487 	struct hn_tx_ring *txr;
1488 
1489 	txr = txd->txr;
1490 	KASSERT(txr->hn_chan == chan,
1491 	    ("channel mismatch, on chan%u, should be chan%u",
1492 	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
1493 
1494 	txr->hn_has_txeof = 1;
1495 	hn_txdesc_put(txr, txd);
1496 
1497 	++txr->hn_txdone_cnt;
1498 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1499 		txr->hn_txdone_cnt = 0;
1500 		if (txr->hn_oactive)
1501 			hn_txeof(txr);
1502 	}
1503 }
1504 
1505 static void
1506 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1507 {
1508 #if defined(INET) || defined(INET6)
1509 	tcp_lro_flush_all(&rxr->hn_lro);
1510 #endif
1511 
1512 	/*
1513 	 * NOTE:
1514 	 * 'txr' can be NULL, if multiple channels are used and the
1515 	 * ifnet.if_start method is enabled.
1516 	 */
1517 	if (txr == NULL || !txr->hn_has_txeof)
1518 		return;
1519 
1520 	txr->hn_txdone_cnt = 0;
1521 	hn_txeof(txr);
1522 }
1523 
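/*
 * Convert an offset that counts from the beginning of the RNDIS packet
 * message into the on-wire form, which counts from the rm_dataoffset
 * field.
 */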
1524 static __inline uint32_t
1525 hn_rndis_pktmsg_offset(uint32_t ofs)
1526 {
1527 
1528 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1529 	    ("invalid RNDIS packet msg offset %u", ofs));
1530 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1531 }
1532 
1533 static __inline void *
1534 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1535     size_t pi_dlen, uint32_t pi_type)
1536 {
1537 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1538 	struct rndis_pktinfo *pi;
1539 
1540 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1541 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1542 
1543 	/*
1544 	 * Per-packet-info does not move; it only grows.
1545 	 *
1546 	 * NOTE:
1547 	 * rm_pktinfooffset in this phase counts from the beginning
1548 	 * of rndis_packet_msg.
1549 	 */
1550 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1551 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1552 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1553 	    pkt->rm_pktinfolen);
1554 	pkt->rm_pktinfolen += pi_size;
1555 
1556 	pi->rm_size = pi_size;
1557 	pi->rm_type = pi_type;
1558 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1559 
1560 	/* Data immediately follows the per-packet-info. */
1561 	pkt->rm_dataoffset += pi_size;
1562 
1563 	/* Update RNDIS packet msg length */
1564 	pkt->rm_len += pi_size;
1565 
1566 	return (pi->rm_data);
1567 }
1568 
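/*
 * Send the aggregating txdesc and reset the per-ring aggregation state.
 * If hn_txpkt() fails, the txdesc is freed by hn_txpkt(), the saved mbuf
 * is freed here, and oerrors is bumped by the number of aggregated
 * packets.
 */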
1569 static __inline int
1570 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1571 {
1572 	struct hn_txdesc *txd;
1573 	struct mbuf *m;
1574 	int error, pkts;
1575 
1576 	txd = txr->hn_agg_txd;
1577 	KASSERT(txd != NULL, ("no aggregate txdesc"));
1578 
1579 	/*
1580 	 * Since hn_txpkt() will reset this temporary stat, save
1581 	 * it now, so that oerrors can be updated properly, if
1582 	 * hn_txpkt() ever fails.
1583 	 */
1584 	pkts = txr->hn_stat_pkts;
1585 
1586 	/*
1587 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1588 	 * failure, save it for later freeing, if hn_txpkt() ever
1589 	 * fails.
1590 	 */
1591 	m = txd->m;
1592 	error = hn_txpkt(ifp, txr, txd);
1593 	if (__predict_false(error)) {
1594 		/* txd is freed, but m is not. */
1595 		m_freem(m);
1596 
1597 		txr->hn_flush_failed++;
1598 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1599 	}
1600 
1601 	/* Reset all aggregation states. */
1602 	txr->hn_agg_txd = NULL;
1603 	txr->hn_agg_szleft = 0;
1604 	txr->hn_agg_pktleft = 0;
1605 	txr->hn_agg_prevpkt = NULL;
1606 
1607 	return (error);
1608 }
1609 
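/*
 * Try to place this packet into the chimney sending buffer.  If an
 * aggregation is in progress and there is room, append to it; otherwise
 * flush the pending aggregation, allocate a fresh chimney slot, and
 * possibly start a new aggregation.  Returns a pointer into the chimney
 * buffer, or NULL if no slot could be allocated.
 */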
1610 static void *
1611 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1612     int pktsize)
1613 {
1614 	void *chim;
1615 
1616 	if (txr->hn_agg_txd != NULL) {
1617 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1618 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1619 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1620 			int olen;
1621 
1622 			/*
1623 			 * Update the previous RNDIS packet's total length;
1624 			 * it can be increased due to the mandatory alignment
1625 			 * padding for this RNDIS packet.  And update the
1626 			 * aggregating txdesc's chimney sending buffer size
1627 			 * accordingly.
1628 			 *
1629 			 * XXX
1630 			 * Zero-out the padding, as required by the RNDIS spec.
1631 			 */
1632 			olen = pkt->rm_len;
1633 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1634 			agg_txd->chim_size += pkt->rm_len - olen;
1635 
1636 			/* Link this txdesc to the parent. */
1637 			hn_txdesc_agg(agg_txd, txd);
1638 
1639 			chim = (uint8_t *)pkt + pkt->rm_len;
1640 			/* Save the current packet for later fixup. */
1641 			txr->hn_agg_prevpkt = chim;
1642 
1643 			txr->hn_agg_pktleft--;
1644 			txr->hn_agg_szleft -= pktsize;
1645 			if (txr->hn_agg_szleft <=
1646 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1647 				/*
1648 				 * Probably can't aggregate more packets;
1649 				 * flush this aggregating txdesc proactively.
1650 				 */
1651 				txr->hn_agg_pktleft = 0;
1652 			}
1653 			/* Done! */
1654 			return (chim);
1655 		}
1656 		hn_flush_txagg(ifp, txr);
1657 	}
1658 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1659 
1660 	txr->hn_tx_chimney_tried++;
1661 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
1662 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1663 		return (NULL);
1664 	txr->hn_tx_chimney++;
1665 
1666 	chim = txr->hn_sc->hn_chim +
1667 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1668 
1669 	if (txr->hn_agg_pktmax > 1 &&
1670 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1671 		txr->hn_agg_txd = txd;
1672 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1673 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1674 		txr->hn_agg_prevpkt = chim;
1675 	}
1676 	return (chim);
1677 }
1678 
1679 /*
1680  * NOTE:
1681  * If this function fails, then both txd and m_head0 will be freed.
1682  */
1683 static int
1684 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1685     struct mbuf **m_head0)
1686 {
1687 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1688 	int error, nsegs, i;
1689 	struct mbuf *m_head = *m_head0;
1690 	struct rndis_packet_msg *pkt;
1691 	uint32_t *pi_data;
1692 	void *chim = NULL;
1693 	int pkt_hlen, pkt_size;
1694 
1695 	pkt = txd->rndis_pkt;
1696 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1697 	if (pkt_size < txr->hn_chim_size) {
1698 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1699 		if (chim != NULL)
1700 			pkt = chim;
1701 	} else {
1702 		if (txr->hn_agg_txd != NULL)
1703 			hn_flush_txagg(ifp, txr);
1704 	}
1705 
1706 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1707 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1708 	pkt->rm_dataoffset = sizeof(*pkt);
1709 	pkt->rm_datalen = m_head->m_pkthdr.len;
1710 	pkt->rm_oobdataoffset = 0;
1711 	pkt->rm_oobdatalen = 0;
1712 	pkt->rm_oobdataelements = 0;
1713 	pkt->rm_pktinfooffset = sizeof(*pkt);
1714 	pkt->rm_pktinfolen = 0;
1715 	pkt->rm_vchandle = 0;
1716 	pkt->rm_reserved = 0;
1717 
1718 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1719 		/*
1720 		 * Set the hash value for this packet, so that the host can
1721 		 * dispatch the TX done event for this packet back to this TX
1722 		 * ring's channel.
1723 		 */
1724 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1725 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1726 		*pi_data = txr->hn_tx_idx;
1727 	}
1728 
1729 	if (m_head->m_flags & M_VLANTAG) {
1730 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1731 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1732 		*pi_data = NDIS_VLAN_INFO_MAKE(
1733 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1734 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1735 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1736 	}
1737 
1738 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1739 #if defined(INET6) || defined(INET)
1740 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1741 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1742 #ifdef INET
1743 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1744 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1745 			    m_head->m_pkthdr.tso_segsz);
1746 		}
1747 #endif
1748 #if defined(INET6) && defined(INET)
1749 		else
1750 #endif
1751 #ifdef INET6
1752 		{
1753 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1754 			    m_head->m_pkthdr.tso_segsz);
1755 		}
1756 #endif
1757 #endif	/* INET6 || INET */
1758 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1759 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1760 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1761 		if (m_head->m_pkthdr.csum_flags &
1762 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1763 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1764 		} else {
1765 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1766 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1767 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1768 		}
1769 
1770 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1771 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1772 		else if (m_head->m_pkthdr.csum_flags &
1773 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1774 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1775 	}
1776 
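	/*
	 * Total RNDIS header length: the packet message plus any
	 * appended per-packet-info.  The payload starts right after it.
	 */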
1777 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1778 	/* Convert RNDIS packet message offsets */
1779 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1780 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1781 
1782 	/*
1783 	 * Fast path: Chimney sending.
1784 	 */
1785 	if (chim != NULL) {
1786 		struct hn_txdesc *tgt_txd = txd;
1787 
1788 		if (txr->hn_agg_txd != NULL) {
1789 			tgt_txd = txr->hn_agg_txd;
1790 #ifdef INVARIANTS
1791 			*m_head0 = NULL;
1792 #endif
1793 		}
1794 
1795 		KASSERT(pkt == chim,
1796 		    ("RNDIS pkt not in chimney sending buffer"));
1797 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1798 		    ("chimney sending buffer is not used"));
1799 		tgt_txd->chim_size += pkt->rm_len;
1800 
1801 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1802 		    ((uint8_t *)chim) + pkt_hlen);
1803 
1804 		txr->hn_gpa_cnt = 0;
1805 		txr->hn_sendpkt = hn_txpkt_chim;
1806 		goto done;
1807 	}
1808 
1809 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1810 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1811 	    ("chimney buffer is used"));
1812 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1813 
1814 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1815 	if (__predict_false(error)) {
1816 		int freed;
1817 
1818 		/*
1819 		 * This mbuf is not linked w/ the txd yet, so free it now.
1820 		 */
1821 		m_freem(m_head);
1822 		*m_head0 = NULL;
1823 
1824 		freed = hn_txdesc_put(txr, txd);
1825 		KASSERT(freed != 0,
1826 		    ("fail to free txd upon txdma error"));
1827 
1828 		txr->hn_txdma_failed++;
1829 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1830 		return error;
1831 	}
1832 	*m_head0 = m_head;
1833 
1834 	/* +1 RNDIS packet message */
1835 	txr->hn_gpa_cnt = nsegs + 1;
1836 
1837 	/* send packet with page buffer */
1838 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1839 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1840 	txr->hn_gpa[0].gpa_len = pkt_hlen;
1841 
1842 	/*
1843 	 * Fill the page buffers with mbuf info after the page
1844 	 * buffer for RNDIS packet message.
1845 	 */
1846 	for (i = 0; i < nsegs; ++i) {
1847 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1848 
1849 		gpa->gpa_page = atop(segs[i].ds_addr);
1850 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1851 		gpa->gpa_len = segs[i].ds_len;
1852 	}
1853 
1854 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1855 	txd->chim_size = 0;
1856 	txr->hn_sendpkt = hn_txpkt_sglist;
1857 done:
1858 	txd->m = m_head;
1859 
1860 	/* Set the completion routine */
1861 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1862 
1863 	/* Update temporary stats for later use. */
1864 	txr->hn_stat_pkts++;
1865 	txr->hn_stat_size += m_head->m_pkthdr.len;
1866 	if (m_head->m_flags & M_MCAST)
1867 		txr->hn_stat_mcasts++;
1868 
1869 	return 0;
1870 }
1871 
1872 /*
1873  * NOTE:
1874  * If this function fails, then txd will be freed, but the mbuf
1875  * associated w/ the txd will _not_ be freed.
1876  */
1877 static int
1878 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1879 {
1880 	int error, send_failed = 0;
1881 
1882 again:
1883 	/*
1884 	 * Make sure that this txd and any aggregated txds are not freed
1885 	 * before ETHER_BPF_MTAP.
1886 	 */
1887 	hn_txdesc_hold(txd);
1888 	error = txr->hn_sendpkt(txr, txd);
1889 	if (!error) {
1890 		if (bpf_peers_present(ifp->if_bpf)) {
1891 			const struct hn_txdesc *tmp_txd;
1892 
1893 			ETHER_BPF_MTAP(ifp, txd->m);
1894 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1895 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
1896 		}
1897 
1898 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1899 #ifdef HN_IFSTART_SUPPORT
1900 		if (!hn_use_if_start)
1901 #endif
1902 		{
1903 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1904 			    txr->hn_stat_size);
1905 			if (txr->hn_stat_mcasts != 0) {
1906 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1907 				    txr->hn_stat_mcasts);
1908 			}
1909 		}
1910 		txr->hn_pkts += txr->hn_stat_pkts;
1911 		txr->hn_sends++;
1912 	}
1913 	hn_txdesc_put(txr, txd);
1914 
1915 	if (__predict_false(error)) {
1916 		int freed;
1917 
1918 		/*
1919 		 * This should "really rarely" happen.
1920 		 *
1921 		 * XXX Too many RX to be acked or too many sideband
1922 		 * commands to run?  Ask netvsc_channel_rollup()
1923 		 * to kick start later.
1924 		 */
1925 		txr->hn_has_txeof = 1;
1926 		if (!send_failed) {
1927 			txr->hn_send_failed++;
1928 			send_failed = 1;
1929 			/*
1930 			 * Try sending again after setting hn_has_txeof,
1931 			 * in case we missed the last
1932 			 * netvsc_channel_rollup().
1933 			 */
1934 			goto again;
1935 		}
1936 		if_printf(ifp, "send failed\n");
1937 
1938 		/*
1939 		 * Caller will perform further processing on the
1940 		 * associated mbuf, so don't free it in hn_txdesc_put();
1941 		 * only unload it from the DMA map in hn_txdesc_put(),
1942 		 * if it was loaded.
1943 		 */
1944 		txd->m = NULL;
1945 		freed = hn_txdesc_put(txr, txd);
1946 		KASSERT(freed != 0,
1947 		    ("fail to free txd upon send error"));
1948 
1949 		txr->hn_send_failed++;
1950 	}
1951 
1952 	/* Reset temporary stats, after this sending is done. */
1953 	txr->hn_stat_size = 0;
1954 	txr->hn_stat_pkts = 0;
1955 	txr->hn_stat_mcasts = 0;
1956 
1957 	return (error);
1958 }
1959 
1960 /*
1961  * Append the specified data to the indicated mbuf chain,
1962  * extending the mbuf chain if the new data does not fit in
1963  * existing space.
1964  *
1965  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1966  * There should be an equivalent in the kernel mbuf code,
1967  * but there does not appear to be one yet.
1968  *
1969  * Differs from m_append() in that additional mbufs are
1970  * allocated with cluster size MJUMPAGESIZE, and filled
1971  * accordingly.
1972  *
1973  * Return 1 if able to complete the job; otherwise 0.
1974  */
1975 static int
1976 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1977 {
1978 	struct mbuf *m, *n;
1979 	int remainder, space;
1980 
1981 	for (m = m0; m->m_next != NULL; m = m->m_next)
1982 		;
1983 	remainder = len;
1984 	space = M_TRAILINGSPACE(m);
1985 	if (space > 0) {
1986 		/*
1987 		 * Copy into available space.
1988 		 */
1989 		if (space > remainder)
1990 			space = remainder;
1991 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1992 		m->m_len += space;
1993 		cp += space;
1994 		remainder -= space;
1995 	}
1996 	while (remainder > 0) {
1997 		/*
1998 		 * Allocate a new mbuf backed by a MJUMPAGESIZE cluster
1999 		 * and fill it with the remaining data.
2000 		 */
2001 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2002 		if (n == NULL)
2003 			break;
2004 		n->m_len = min(MJUMPAGESIZE, remainder);
2005 		bcopy(cp, mtod(n, caddr_t), n->m_len);
2006 		cp += n->m_len;
2007 		remainder -= n->m_len;
2008 		m->m_next = n;
2009 		m = n;
2010 	}
2011 	if (m0->m_flags & M_PKTHDR)
2012 		m0->m_pkthdr.len += len - remainder;
2013 
2014 	return (remainder == 0);
2015 }
2016 
2017 #if defined(INET) || defined(INET6)
2018 static __inline int
2019 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2020 {
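	/*
	 * Queue the mbuf for deferred LRO processing when the LRO
	 * mbuf queue is enabled (and supported by this stack version);
	 * otherwise hand it to tcp_lro_rx() directly.
	 */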
2021 #if __FreeBSD_version >= 1100095
2022 	if (hn_lro_mbufq_depth) {
2023 		tcp_lro_queue_mbuf(lc, m);
2024 		return 0;
2025 	}
2026 #endif
2027 	return tcp_lro_rx(lc, m, 0);
2028 }
2029 #endif
2030 
2031 static int
2032 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2033     const struct hn_rxinfo *info)
2034 {
2035 	struct ifnet *ifp = rxr->hn_ifp;
2036 	struct mbuf *m_new;
2037 	int size, do_lro = 0, do_csum = 1;
2038 	int hash_type;
2039 
2040 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2041 		return (0);
2042 
2043 	/*
2044 	 * Bail out if packet contains more data than configured MTU.
2045 	 */
2046 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2047 		return (0);
2048 	} else if (dlen <= MHLEN) {
2049 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
2050 		if (m_new == NULL) {
2051 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2052 			return (0);
2053 		}
2054 		memcpy(mtod(m_new, void *), data, dlen);
2055 		m_new->m_pkthdr.len = m_new->m_len = dlen;
2056 		rxr->hn_small_pkts++;
2057 	} else {
2058 		/*
2059 		 * Get an mbuf with a cluster.  For packets 2K or less,
2060 		 * get a standard 2K cluster.  For anything larger, get a
2061 		 * 4K cluster.  Any buffers larger than 4K can cause problems
2062 		 * if looped around to the Hyper-V TX channel, so avoid them.
2063 		 */
2064 		size = MCLBYTES;
2065 		if (dlen > MCLBYTES) {
2066 			/* 4096 */
2067 			size = MJUMPAGESIZE;
2068 		}
2069 
2070 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2071 		if (m_new == NULL) {
2072 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2073 			return (0);
2074 		}
2075 
2076 		hv_m_append(m_new, dlen, data);
2077 	}
2078 	m_new->m_pkthdr.rcvif = ifp;
2079 
2080 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2081 		do_csum = 0;
2082 
2083 	/* receive side checksum offload */
2084 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2085 		/* IP csum offload */
2086 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2087 			m_new->m_pkthdr.csum_flags |=
2088 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2089 			rxr->hn_csum_ip++;
2090 		}
2091 
2092 		/* TCP/UDP csum offload */
2093 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2094 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2095 			m_new->m_pkthdr.csum_flags |=
2096 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2097 			m_new->m_pkthdr.csum_data = 0xffff;
2098 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2099 				rxr->hn_csum_tcp++;
2100 			else
2101 				rxr->hn_csum_udp++;
2102 		}
2103 
2104 		/*
2105 		 * XXX
2106 		 * As of this writing (Oct 28th, 2016), the host side will turn
2107 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2108 		 * the do_lro setting here is actually _not_ accurate.  We
2109 		 * depend on the RSS hash type check to reset do_lro.
2110 		 */
2111 		if ((info->csum_info &
2112 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2113 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2114 			do_lro = 1;
2115 	} else {
2116 		const struct ether_header *eh;
2117 		uint16_t etype;
2118 		int hoff;
2119 
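		/*
		 * No usable checksum info from the host.  Parse the
		 * Ethernet (and optional VLAN) header to find the L3
		 * protocol and decide whether the host's checksum
		 * verification can be trusted.
		 */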
2120 		hoff = sizeof(*eh);
2121 		if (m_new->m_len < hoff)
2122 			goto skip;
2123 		eh = mtod(m_new, struct ether_header *);
2124 		etype = ntohs(eh->ether_type);
2125 		if (etype == ETHERTYPE_VLAN) {
2126 			const struct ether_vlan_header *evl;
2127 
2128 			hoff = sizeof(*evl);
2129 			if (m_new->m_len < hoff)
2130 				goto skip;
2131 			evl = mtod(m_new, struct ether_vlan_header *);
2132 			etype = ntohs(evl->evl_proto);
2133 		}
2134 
2135 		if (etype == ETHERTYPE_IP) {
2136 			int pr;
2137 
2138 			pr = hn_check_iplen(m_new, hoff);
2139 			if (pr == IPPROTO_TCP) {
2140 				if (do_csum &&
2141 				    (rxr->hn_trust_hcsum &
2142 				     HN_TRUST_HCSUM_TCP)) {
2143 					rxr->hn_csum_trusted++;
2144 					m_new->m_pkthdr.csum_flags |=
2145 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2146 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2147 					m_new->m_pkthdr.csum_data = 0xffff;
2148 				}
2149 				do_lro = 1;
2150 			} else if (pr == IPPROTO_UDP) {
2151 				if (do_csum &&
2152 				    (rxr->hn_trust_hcsum &
2153 				     HN_TRUST_HCSUM_UDP)) {
2154 					rxr->hn_csum_trusted++;
2155 					m_new->m_pkthdr.csum_flags |=
2156 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
2157 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2158 					m_new->m_pkthdr.csum_data = 0xffff;
2159 				}
2160 			} else if (pr != IPPROTO_DONE && do_csum &&
2161 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2162 				rxr->hn_csum_trusted++;
2163 				m_new->m_pkthdr.csum_flags |=
2164 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
2165 			}
2166 		}
2167 	}
2168 skip:
2169 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2170 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2171 		    NDIS_VLAN_INFO_ID(info->vlan_info),
2172 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
2173 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
2174 		m_new->m_flags |= M_VLANTAG;
2175 	}
2176 
2177 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2178 		rxr->hn_rss_pkts++;
2179 		m_new->m_pkthdr.flowid = info->hash_value;
2180 		hash_type = M_HASHTYPE_OPAQUE_HASH;
2181 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2182 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
2183 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2184 
2185 			/*
2186 			 * NOTE:
2187 			 * do_lro is reset, if the hash types are not TCP
2188 			 * related.  See the comment in the above csum_flags
2189 			 * setup section.
2190 			 */
2191 			switch (type) {
2192 			case NDIS_HASH_IPV4:
2193 				hash_type = M_HASHTYPE_RSS_IPV4;
2194 				do_lro = 0;
2195 				break;
2196 
2197 			case NDIS_HASH_TCP_IPV4:
2198 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2199 				break;
2200 
2201 			case NDIS_HASH_IPV6:
2202 				hash_type = M_HASHTYPE_RSS_IPV6;
2203 				do_lro = 0;
2204 				break;
2205 
2206 			case NDIS_HASH_IPV6_EX:
2207 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
2208 				do_lro = 0;
2209 				break;
2210 
2211 			case NDIS_HASH_TCP_IPV6:
2212 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2213 				break;
2214 
2215 			case NDIS_HASH_TCP_IPV6_EX:
2216 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2217 				break;
2218 			}
2219 		}
2220 	} else {
2221 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2222 		hash_type = M_HASHTYPE_OPAQUE;
2223 	}
2224 	M_HASHTYPE_SET(m_new, hash_type);
2225 
2226 	/*
2227 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
2228 	 * messages (not just data messages) will trigger a response.
2229 	 */
2230 
2231 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2232 	rxr->hn_pkts++;
2233 
2234 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2235 #if defined(INET) || defined(INET6)
2236 		struct lro_ctrl *lro = &rxr->hn_lro;
2237 
2238 		if (lro->lro_cnt) {
2239 			rxr->hn_lro_tried++;
2240 			if (hn_lro_rx(lro, m_new) == 0) {
2241 				/* DONE! */
2242 				return 0;
2243 			}
2244 		}
2245 #endif
2246 	}
2247 
2248 	/* We're not holding the lock here, so don't release it */
2249 	(*ifp->if_input)(ifp, m_new);
2250 
2251 	return (0);
2252 }
2253 
2254 static int
2255 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2256 {
2257 	struct hn_softc *sc = ifp->if_softc;
2258 	struct ifreq *ifr = (struct ifreq *)data;
2259 	int mask, error = 0;
2260 
2261 	switch (cmd) {
2262 	case SIOCSIFMTU:
2263 		if (ifr->ifr_mtu > HN_MTU_MAX) {
2264 			error = EINVAL;
2265 			break;
2266 		}
2267 
2268 		HN_LOCK(sc);
2269 
2270 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2271 			HN_UNLOCK(sc);
2272 			break;
2273 		}
2274 
2275 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2276 			/* Can't change MTU */
2277 			HN_UNLOCK(sc);
2278 			error = EOPNOTSUPP;
2279 			break;
2280 		}
2281 
2282 		if (ifp->if_mtu == ifr->ifr_mtu) {
2283 			HN_UNLOCK(sc);
2284 			break;
2285 		}
2286 
2287 		/*
2288 		 * Suspend this interface before the synthetic parts
2289 		 * are ripped.
2290 		 */
2291 		hn_suspend(sc);
2292 
2293 		/*
2294 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2295 		 */
2296 		hn_synth_detach(sc);
2297 
2298 		/*
2299 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2300 		 * with the new MTU setting.
2301 		 */
2302 		error = hn_synth_attach(sc, ifr->ifr_mtu);
2303 		if (error) {
2304 			HN_UNLOCK(sc);
2305 			break;
2306 		}
2307 
2308 		/*
2309 		 * Commit the requested MTU, after the synthetic parts
2310 		 * have been successfully attached.
2311 		 */
2312 		ifp->if_mtu = ifr->ifr_mtu;
2313 
2314 		/*
2315 		 * Make sure that various parameters based on MTU are
2316 		 * still valid, after the MTU change.
2317 		 */
2318 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2319 			hn_set_chim_size(sc, sc->hn_chim_szmax);
2320 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2321 #if __FreeBSD_version >= 1100099
2322 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2323 		    HN_LRO_LENLIM_MIN(ifp))
2324 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2325 #endif
2326 
2327 		/*
2328 		 * All done!  Resume the interface now.
2329 		 */
2330 		hn_resume(sc);
2331 
2332 		HN_UNLOCK(sc);
2333 		break;
2334 
2335 	case SIOCSIFFLAGS:
2336 		HN_LOCK(sc);
2337 
2338 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2339 			HN_UNLOCK(sc);
2340 			break;
2341 		}
2342 
2343 		if (ifp->if_flags & IFF_UP) {
2344 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2345 				/*
2346 				 * Caller might hold a mutex, e.g.
2347 				 * bpf; use busy-wait for the RNDIS
2348 				 * reply.
2349 				 */
2350 				HN_NO_SLEEPING(sc);
2351 				hn_set_rxfilter(sc);
2352 				HN_SLEEPING_OK(sc);
2353 			} else {
2354 				hn_init_locked(sc);
2355 			}
2356 		} else {
2357 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2358 				hn_stop(sc);
2359 		}
2360 		sc->hn_if_flags = ifp->if_flags;
2361 
2362 		HN_UNLOCK(sc);
2363 		break;
2364 
2365 	case SIOCSIFCAP:
2366 		HN_LOCK(sc);
2367 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2368 
2369 		if (mask & IFCAP_TXCSUM) {
2370 			ifp->if_capenable ^= IFCAP_TXCSUM;
2371 			if (ifp->if_capenable & IFCAP_TXCSUM)
2372 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2373 			else
2374 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2375 		}
2376 		if (mask & IFCAP_TXCSUM_IPV6) {
2377 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2378 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2379 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2380 			else
2381 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2382 		}
2383 
2384 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2385 		if (mask & IFCAP_RXCSUM)
2386 			ifp->if_capenable ^= IFCAP_RXCSUM;
2387 #ifdef foo
2388 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2389 		if (mask & IFCAP_RXCSUM_IPV6)
2390 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2391 #endif
2392 
2393 		if (mask & IFCAP_LRO)
2394 			ifp->if_capenable ^= IFCAP_LRO;
2395 
2396 		if (mask & IFCAP_TSO4) {
2397 			ifp->if_capenable ^= IFCAP_TSO4;
2398 			if (ifp->if_capenable & IFCAP_TSO4)
2399 				ifp->if_hwassist |= CSUM_IP_TSO;
2400 			else
2401 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2402 		}
2403 		if (mask & IFCAP_TSO6) {
2404 			ifp->if_capenable ^= IFCAP_TSO6;
2405 			if (ifp->if_capenable & IFCAP_TSO6)
2406 				ifp->if_hwassist |= CSUM_IP6_TSO;
2407 			else
2408 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2409 		}
2410 
2411 		HN_UNLOCK(sc);
2412 		break;
2413 
2414 	case SIOCADDMULTI:
2415 	case SIOCDELMULTI:
2416 		HN_LOCK(sc);
2417 
2418 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2419 			HN_UNLOCK(sc);
2420 			break;
2421 		}
2422 		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2423 			/*
2424 			 * Multicast uses mutex; use busy-wait for
2425 			 * the RNDIS reply.
2426 			 */
2427 			HN_NO_SLEEPING(sc);
2428 			hn_set_rxfilter(sc);
2429 			HN_SLEEPING_OK(sc);
2430 		}
2431 
2432 		HN_UNLOCK(sc);
2433 		break;
2434 
2435 	case SIOCSIFMEDIA:
2436 	case SIOCGIFMEDIA:
2437 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2438 		break;
2439 
2440 	default:
2441 		error = ether_ioctl(ifp, cmd, data);
2442 		break;
2443 	}
2444 	return (error);
2445 }
2446 
2447 static void
2448 hn_stop(struct hn_softc *sc)
2449 {
2450 	struct ifnet *ifp = sc->hn_ifp;
2451 	int i;
2452 
2453 	HN_LOCK_ASSERT(sc);
2454 
2455 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2456 	    ("synthetic parts were not attached"));
2457 
2458 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2459 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2460 	hn_suspend_data(sc);
2461 
2462 	/* Clear OACTIVE bit. */
2463 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2464 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2465 		sc->hn_tx_ring[i].hn_oactive = 0;
2466 }
2467 
2468 static void
2469 hn_init_locked(struct hn_softc *sc)
2470 {
2471 	struct ifnet *ifp = sc->hn_ifp;
2472 	int i;
2473 
2474 	HN_LOCK_ASSERT(sc);
2475 
2476 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2477 		return;
2478 
2479 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2480 		return;
2481 
2482 	/* Configure RX filter */
2483 	hn_set_rxfilter(sc);
2484 
2485 	/* Clear OACTIVE bit. */
2486 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2487 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2488 		sc->hn_tx_ring[i].hn_oactive = 0;
2489 
2490 	/* Clear TX 'suspended' bit. */
2491 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2492 
2493 	/* Everything is ready; unleash! */
2494 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2495 }
2496 
2497 static void
2498 hn_init(void *xsc)
2499 {
2500 	struct hn_softc *sc = xsc;
2501 
2502 	HN_LOCK(sc);
2503 	hn_init_locked(sc);
2504 	HN_UNLOCK(sc);
2505 }
2506 
2507 #if __FreeBSD_version >= 1100099
2508 
2509 static int
2510 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2511 {
2512 	struct hn_softc *sc = arg1;
2513 	unsigned int lenlim;
2514 	int error;
2515 
2516 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2517 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2518 	if (error || req->newptr == NULL)
2519 		return error;
2520 
2521 	HN_LOCK(sc);
2522 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2523 	    lenlim > TCP_LRO_LENGTH_MAX) {
2524 		HN_UNLOCK(sc);
2525 		return EINVAL;
2526 	}
2527 	hn_set_lro_lenlim(sc, lenlim);
2528 	HN_UNLOCK(sc);
2529 
2530 	return 0;
2531 }
2532 
2533 static int
2534 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2535 {
2536 	struct hn_softc *sc = arg1;
2537 	int ackcnt, error, i;
2538 
2539 	/*
2540 	 * lro_ackcnt_lim is the append count limit;
2541 	 * +1 turns it into the aggregation limit.
2542 	 */
2543 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2544 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2545 	if (error || req->newptr == NULL)
2546 		return error;
2547 
2548 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2549 		return EINVAL;
2550 
2551 	/*
2552 	 * Convert aggregation limit back to append
2553 	 * count limit.
2554 	 */
2555 	--ackcnt;
2556 	HN_LOCK(sc);
2557 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2558 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2559 	HN_UNLOCK(sc);
2560 	return 0;
2561 }
2562 
2563 #endif
2564 
2565 static int
2566 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2567 {
2568 	struct hn_softc *sc = arg1;
2569 	int hcsum = arg2;
2570 	int on, error, i;
2571 
2572 	on = 0;
2573 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2574 		on = 1;
2575 
2576 	error = sysctl_handle_int(oidp, &on, 0, req);
2577 	if (error || req->newptr == NULL)
2578 		return error;
2579 
2580 	HN_LOCK(sc);
2581 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2582 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2583 
2584 		if (on)
2585 			rxr->hn_trust_hcsum |= hcsum;
2586 		else
2587 			rxr->hn_trust_hcsum &= ~hcsum;
2588 	}
2589 	HN_UNLOCK(sc);
2590 	return 0;
2591 }
2592 
2593 static int
2594 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2595 {
2596 	struct hn_softc *sc = arg1;
2597 	int chim_size, error;
2598 
2599 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2600 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2601 	if (error || req->newptr == NULL)
2602 		return error;
2603 
2604 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2605 		return EINVAL;
2606 
2607 	HN_LOCK(sc);
2608 	hn_set_chim_size(sc, chim_size);
2609 	HN_UNLOCK(sc);
2610 	return 0;
2611 }
2612 
2613 #if __FreeBSD_version < 1100095
2614 static int
2615 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2616 {
2617 	struct hn_softc *sc = arg1;
2618 	int ofs = arg2, i, error;
2619 	struct hn_rx_ring *rxr;
2620 	uint64_t stat;
2621 
2622 	stat = 0;
2623 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2624 		rxr = &sc->hn_rx_ring[i];
2625 		stat += *((int *)((uint8_t *)rxr + ofs));
2626 	}
2627 
2628 	error = sysctl_handle_64(oidp, &stat, 0, req);
2629 	if (error || req->newptr == NULL)
2630 		return error;
2631 
2632 	/* Zero out this stat. */
2633 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2634 		rxr = &sc->hn_rx_ring[i];
2635 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2636 	}
2637 	return 0;
2638 }
2639 #else
2640 static int
2641 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2642 {
2643 	struct hn_softc *sc = arg1;
2644 	int ofs = arg2, i, error;
2645 	struct hn_rx_ring *rxr;
2646 	uint64_t stat;
2647 
2648 	stat = 0;
2649 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2650 		rxr = &sc->hn_rx_ring[i];
2651 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2652 	}
2653 
2654 	error = sysctl_handle_64(oidp, &stat, 0, req);
2655 	if (error || req->newptr == NULL)
2656 		return error;
2657 
2658 	/* Zero out this stat. */
2659 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2660 		rxr = &sc->hn_rx_ring[i];
2661 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2662 	}
2663 	return 0;
2664 }
2665 
2666 #endif
2667 
2668 static int
2669 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2670 {
2671 	struct hn_softc *sc = arg1;
2672 	int ofs = arg2, i, error;
2673 	struct hn_rx_ring *rxr;
2674 	u_long stat;
2675 
2676 	stat = 0;
2677 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2678 		rxr = &sc->hn_rx_ring[i];
2679 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2680 	}
2681 
2682 	error = sysctl_handle_long(oidp, &stat, 0, req);
2683 	if (error || req->newptr == NULL)
2684 		return error;
2685 
2686 	/* Zero out this stat. */
2687 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2688 		rxr = &sc->hn_rx_ring[i];
2689 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2690 	}
2691 	return 0;
2692 }
2693 
2694 static int
2695 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2696 {
2697 	struct hn_softc *sc = arg1;
2698 	int ofs = arg2, i, error;
2699 	struct hn_tx_ring *txr;
2700 	u_long stat;
2701 
2702 	stat = 0;
2703 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2704 		txr = &sc->hn_tx_ring[i];
2705 		stat += *((u_long *)((uint8_t *)txr + ofs));
2706 	}
2707 
2708 	error = sysctl_handle_long(oidp, &stat, 0, req);
2709 	if (error || req->newptr == NULL)
2710 		return error;
2711 
2712 	/* Zero out this stat. */
2713 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2714 		txr = &sc->hn_tx_ring[i];
2715 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2716 	}
2717 	return 0;
2718 }
2719 
2720 static int
2721 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2722 {
2723 	struct hn_softc *sc = arg1;
2724 	int ofs = arg2, i, error, conf;
2725 	struct hn_tx_ring *txr;
2726 
2727 	txr = &sc->hn_tx_ring[0];
2728 	conf = *((int *)((uint8_t *)txr + ofs));
2729 
2730 	error = sysctl_handle_int(oidp, &conf, 0, req);
2731 	if (error || req->newptr == NULL)
2732 		return error;
2733 
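	/* Propagate the new value to all TX rings. */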
2734 	HN_LOCK(sc);
2735 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2736 		txr = &sc->hn_tx_ring[i];
2737 		*((int *)((uint8_t *)txr + ofs)) = conf;
2738 	}
2739 	HN_UNLOCK(sc);
2740 
2741 	return 0;
2742 }
2743 
2744 static int
2745 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2746 {
2747 	struct hn_softc *sc = arg1;
2748 	int error, size;
2749 
2750 	size = sc->hn_agg_size;
2751 	error = sysctl_handle_int(oidp, &size, 0, req);
2752 	if (error || req->newptr == NULL)
2753 		return (error);
2754 
2755 	HN_LOCK(sc);
2756 	sc->hn_agg_size = size;
2757 	hn_set_txagg(sc);
2758 	HN_UNLOCK(sc);
2759 
2760 	return (0);
2761 }
2762 
2763 static int
2764 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2765 {
2766 	struct hn_softc *sc = arg1;
2767 	int error, pkts;
2768 
2769 	pkts = sc->hn_agg_pkts;
2770 	error = sysctl_handle_int(oidp, &pkts, 0, req);
2771 	if (error || req->newptr == NULL)
2772 		return (error);
2773 
2774 	HN_LOCK(sc);
2775 	sc->hn_agg_pkts = pkts;
2776 	hn_set_txagg(sc);
2777 	HN_UNLOCK(sc);
2778 
2779 	return (0);
2780 }
2781 
2782 static int
2783 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2784 {
2785 	struct hn_softc *sc = arg1;
2786 	int pkts;
2787 
2788 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2789 	return (sysctl_handle_int(oidp, &pkts, 0, req));
2790 }
2791 
2792 static int
2793 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2794 {
2795 	struct hn_softc *sc = arg1;
2796 	int align;
2797 
2798 	align = sc->hn_tx_ring[0].hn_agg_align;
2799 	return (sysctl_handle_int(oidp, &align, 0, req));
2800 }
2801 
2802 static int
2803 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2804 {
2805 	struct hn_softc *sc = arg1;
2806 	char verstr[16];
2807 
2808 	snprintf(verstr, sizeof(verstr), "%u.%u",
2809 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2810 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2811 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2812 }
2813 
2814 static int
2815 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2816 {
2817 	struct hn_softc *sc = arg1;
2818 	char caps_str[128];
2819 	uint32_t caps;
2820 
2821 	HN_LOCK(sc);
2822 	caps = sc->hn_caps;
2823 	HN_UNLOCK(sc);
2824 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2825 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2826 }
2827 
2828 static int
2829 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2830 {
2831 	struct hn_softc *sc = arg1;
2832 	char assist_str[128];
2833 	uint32_t hwassist;
2834 
2835 	HN_LOCK(sc);
2836 	hwassist = sc->hn_ifp->if_hwassist;
2837 	HN_UNLOCK(sc);
2838 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2839 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2840 }
2841 
2842 static int
2843 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2844 {
2845 	struct hn_softc *sc = arg1;
2846 	char filter_str[128];
2847 	uint32_t filter;
2848 
2849 	HN_LOCK(sc);
2850 	filter = sc->hn_rx_filter;
2851 	HN_UNLOCK(sc);
2852 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2853 	    NDIS_PACKET_TYPES);
2854 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2855 }
2856 
2857 static int
2858 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2859 {
2860 	struct hn_softc *sc = arg1;
2861 	int error;
2862 
2863 	HN_LOCK(sc);
2864 
2865 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2866 	if (error || req->newptr == NULL)
2867 		goto back;
2868 
2869 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2870 	if (error)
2871 		goto back;
2872 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2873 
2874 	if (sc->hn_rx_ring_inuse > 1) {
2875 		error = hn_rss_reconfig(sc);
2876 	} else {
2877 		/* Not RSS capable, at least for now; just save the RSS key. */
2878 		error = 0;
2879 	}
2880 back:
2881 	HN_UNLOCK(sc);
2882 	return (error);
2883 }
2884 
2885 static int
2886 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2887 {
2888 	struct hn_softc *sc = arg1;
2889 	int error;
2890 
2891 	HN_LOCK(sc);
2892 
2893 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2894 	if (error || req->newptr == NULL)
2895 		goto back;
2896 
2897 	/*
2898 	 * Don't allow RSS indirect table changes, if this interface is
2899 	 * not currently RSS capable.
2900 	 */
2901 	if (sc->hn_rx_ring_inuse == 1) {
2902 		error = EOPNOTSUPP;
2903 		goto back;
2904 	}
2905 
2906 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2907 	if (error)
2908 		goto back;
2909 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2910 
2911 	hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2912 	error = hn_rss_reconfig(sc);
2913 back:
2914 	HN_UNLOCK(sc);
2915 	return (error);
2916 }
2917 
2918 static int
2919 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2920 {
2921 	struct hn_softc *sc = arg1;
2922 	char hash_str[128];
2923 	uint32_t hash;
2924 
2925 	HN_LOCK(sc);
2926 	hash = sc->hn_rss_hash;
2927 	HN_UNLOCK(sc);
2928 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2929 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2930 }
2931 
2932 static int
2933 hn_check_iplen(const struct mbuf *m, int hoff)
2934 {
2935 	const struct ip *ip;
2936 	int len, iphlen, iplen;
2937 	const struct tcphdr *th;
2938 	int thoff;				/* TCP data offset */
2939 
2940 	len = hoff + sizeof(struct ip);
2941 
2942 	/* The packet must be at least the size of an IP header. */
2943 	if (m->m_pkthdr.len < len)
2944 		return IPPROTO_DONE;
2945 
2946 	/* The fixed IP header must reside completely in the first mbuf. */
2947 	if (m->m_len < len)
2948 		return IPPROTO_DONE;
2949 
2950 	ip = mtodo(m, hoff);
2951 
2952 	/* Bound check the packet's stated IP header length. */
2953 	iphlen = ip->ip_hl << 2;
2954 	if (iphlen < sizeof(struct ip))		/* minimum header length */
2955 		return IPPROTO_DONE;
2956 
2957 	/* The full IP header must reside completely in the one mbuf. */
2958 	if (m->m_len < hoff + iphlen)
2959 		return IPPROTO_DONE;
2960 
2961 	iplen = ntohs(ip->ip_len);
2962 
2963 	/*
2964 	 * Check that the amount of data in the buffers is at
2965 	 * least as much as the IP header would have us expect.
2966 	 */
2967 	if (m->m_pkthdr.len < hoff + iplen)
2968 		return IPPROTO_DONE;
2969 
2970 	/*
2971 	 * Ignore IP fragments.
2972 	 */
2973 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2974 		return IPPROTO_DONE;
2975 
2976 	/*
2977 	 * The TCP/IP or UDP/IP header must be entirely contained within
2978 	 * the first fragment of a packet.
2979 	 */
2980 	switch (ip->ip_p) {
2981 	case IPPROTO_TCP:
2982 		if (iplen < iphlen + sizeof(struct tcphdr))
2983 			return IPPROTO_DONE;
2984 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2985 			return IPPROTO_DONE;
2986 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2987 		thoff = th->th_off << 2;
2988 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2989 			return IPPROTO_DONE;
2990 		if (m->m_len < hoff + iphlen + thoff)
2991 			return IPPROTO_DONE;
2992 		break;
2993 	case IPPROTO_UDP:
2994 		if (iplen < iphlen + sizeof(struct udphdr))
2995 			return IPPROTO_DONE;
2996 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2997 			return IPPROTO_DONE;
2998 		break;
2999 	default:
3000 		if (iplen < iphlen)
3001 			return IPPROTO_DONE;
3002 		break;
3003 	}
3004 	return ip->ip_p;
3005 }
3006 
3007 static int
3008 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3009 {
3010 	struct sysctl_oid_list *child;
3011 	struct sysctl_ctx_list *ctx;
3012 	device_t dev = sc->hn_dev;
3013 #if defined(INET) || defined(INET6)
3014 #if __FreeBSD_version >= 1100095
3015 	int lroent_cnt;
3016 #endif
3017 #endif
3018 	int i;
3019 
3020 	/*
3021 	 * Create RXBUF for reception.
3022 	 *
3023 	 * NOTE:
3024 	 * - It is shared by all channels.
3025 	 * - A large enough buffer is allocated, certain version of NVSes
3026 	 * - A large enough buffer is allocated; certain versions of NVS
3027 	 */
3028 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3029 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3030 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3031 	if (sc->hn_rxbuf == NULL) {
3032 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3033 		return (ENOMEM);
3034 	}
3035 
3036 	sc->hn_rx_ring_cnt = ring_cnt;
3037 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3038 
3039 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3040 	    M_DEVBUF, M_WAITOK | M_ZERO);
3041 
3042 #if defined(INET) || defined(INET6)
3043 #if __FreeBSD_version >= 1100095
3044 	lroent_cnt = hn_lro_entry_count;
3045 	if (lroent_cnt < TCP_LRO_ENTRIES)
3046 		lroent_cnt = TCP_LRO_ENTRIES;
3047 	if (bootverbose)
3048 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3049 #endif
3050 #endif	/* INET || INET6 */
3051 
3052 	ctx = device_get_sysctl_ctx(dev);
3053 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3054 
3055 	/* Create dev.hn.UNIT.rx sysctl tree */
3056 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3057 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3058 
3059 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3060 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3061 
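		/*
		 * Allocate DMA memory for this ring's VMBUS TX/RX
		 * bufring pair.
		 */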
3062 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3063 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3064 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3065 		if (rxr->hn_br == NULL) {
3066 			device_printf(dev, "allocate bufring failed\n");
3067 			return (ENOMEM);
3068 		}
3069 
3070 		if (hn_trust_hosttcp)
3071 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3072 		if (hn_trust_hostudp)
3073 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3074 		if (hn_trust_hostip)
3075 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3076 		rxr->hn_ifp = sc->hn_ifp;
3077 		if (i < sc->hn_tx_ring_cnt)
3078 			rxr->hn_txr = &sc->hn_tx_ring[i];
3079 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3080 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3081 		rxr->hn_rx_idx = i;
3082 		rxr->hn_rxbuf = sc->hn_rxbuf;
3083 
3084 		/*
3085 		 * Initialize LRO.
3086 		 */
3087 #if defined(INET) || defined(INET6)
3088 #if __FreeBSD_version >= 1100095
3089 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3090 		    hn_lro_mbufq_depth);
3091 #else
3092 		tcp_lro_init(&rxr->hn_lro);
3093 		rxr->hn_lro.ifp = sc->hn_ifp;
3094 #endif
3095 #if __FreeBSD_version >= 1100099
3096 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3097 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3098 #endif
3099 #endif	/* INET || INET6 */
3100 
3101 		if (sc->hn_rx_sysctl_tree != NULL) {
3102 			char name[16];
3103 
3104 			/*
3105 			 * Create per RX ring sysctl tree:
3106 			 * dev.hn.UNIT.rx.RINGID
3107 			 */
3108 			snprintf(name, sizeof(name), "%d", i);
3109 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3110 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3111 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3112 
3113 			if (rxr->hn_rx_sysctl_tree != NULL) {
3114 				SYSCTL_ADD_ULONG(ctx,
3115 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3116 				    OID_AUTO, "packets", CTLFLAG_RW,
3117 				    &rxr->hn_pkts, "# of packets received");
3118 				SYSCTL_ADD_ULONG(ctx,
3119 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3120 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3121 				    &rxr->hn_rss_pkts,
3122 				    "# of packets w/ RSS info received");
3123 				SYSCTL_ADD_INT(ctx,
3124 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3125 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3126 				    &rxr->hn_pktbuf_len, 0,
3127 				    "Temporary channel packet buffer length");
3128 			}
3129 		}
3130 	}
3131 
3132 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3133 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3134 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3135 #if __FreeBSD_version < 1100095
3136 	    hn_rx_stat_int_sysctl,
3137 #else
3138 	    hn_rx_stat_u64_sysctl,
3139 #endif
3140 	    "LU", "LRO queued");
3141 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3142 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3143 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3144 #if __FreeBSD_version < 1100095
3145 	    hn_rx_stat_int_sysctl,
3146 #else
3147 	    hn_rx_stat_u64_sysctl,
3148 #endif
3149 	    "LU", "LRO flushed");
3150 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3151 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3152 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3153 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3154 #if __FreeBSD_version >= 1100099
3155 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3156 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3157 	    hn_lro_lenlim_sysctl, "IU",
3158 	    "Max # of data bytes to be aggregated by LRO");
3159 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3160 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3161 	    hn_lro_ackcnt_sysctl, "I",
3162 	    "Max # of ACKs to be aggregated by LRO");
3163 #endif
3164 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3165 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3166 	    hn_trust_hcsum_sysctl, "I",
3167 	    "Trust tcp segment verification on host side, "
3168 	    "when csum info is missing");
3169 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3170 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3171 	    hn_trust_hcsum_sysctl, "I",
3172 	    "Trust udp datagram verification on host side, "
3173 	    "when csum info is missing");
3174 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3175 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3176 	    hn_trust_hcsum_sysctl, "I",
3177 	    "Trust ip packet verification on host side, "
3178 	    "when csum info is missing");
3179 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3180 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3181 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3182 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3183 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3184 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3185 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3186 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3187 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3188 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3189 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3190 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3191 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3192 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3193 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3194 	    hn_rx_stat_ulong_sysctl, "LU",
3195 	    "# of packets that we trust host's csum verification");
3196 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3197 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3198 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3199 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3200 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3201 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3202 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3203 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3204 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3205 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3206 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3207 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3208 
3209 	return (0);
3210 }
3211 
3212 static void
3213 hn_destroy_rx_data(struct hn_softc *sc)
3214 {
3215 	int i;
3216 
3217 	if (sc->hn_rxbuf != NULL) {
3218 		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3219 		sc->hn_rxbuf = NULL;
3220 	}
3221 
3222 	if (sc->hn_rx_ring_cnt == 0)
3223 		return;
3224 
3225 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3226 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3227 
3228 		if (rxr->hn_br == NULL)
3229 			continue;
3230 		hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3231 		rxr->hn_br = NULL;
3232 
3233 #if defined(INET) || defined(INET6)
3234 		tcp_lro_free(&rxr->hn_lro);
3235 #endif
3236 		free(rxr->hn_pktbuf, M_DEVBUF);
3237 	}
3238 	free(sc->hn_rx_ring, M_DEVBUF);
3239 	sc->hn_rx_ring = NULL;
3240 
3241 	sc->hn_rx_ring_cnt = 0;
3242 	sc->hn_rx_ring_inuse = 0;
3243 }
3244 
3245 static int
3246 hn_tx_ring_create(struct hn_softc *sc, int id)
3247 {
3248 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3249 	device_t dev = sc->hn_dev;
3250 	bus_dma_tag_t parent_dtag;
3251 	int error, i;
3252 
3253 	txr->hn_sc = sc;
3254 	txr->hn_tx_idx = id;
3255 
3256 #ifndef HN_USE_TXDESC_BUFRING
3257 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3258 #endif
3259 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3260 
3261 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3262 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3263 	    M_DEVBUF, M_WAITOK | M_ZERO);
3264 #ifndef HN_USE_TXDESC_BUFRING
3265 	SLIST_INIT(&txr->hn_txlist);
3266 #else
3267 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3268 	    M_WAITOK, &txr->hn_tx_lock);
3269 #endif
3270 
3271 	txr->hn_tx_taskq = sc->hn_tx_taskq;
3272 
3273 #ifdef HN_IFSTART_SUPPORT
3274 	if (hn_use_if_start) {
3275 		txr->hn_txeof = hn_start_txeof;
3276 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3277 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3278 	} else
3279 #endif
3280 	{
3281 		int br_depth;
3282 
3283 		txr->hn_txeof = hn_xmit_txeof;
3284 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3285 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3286 
3287 		br_depth = hn_get_txswq_depth(txr);
3288 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3289 		    M_WAITOK, &txr->hn_tx_lock);
3290 	}
3291 
3292 	txr->hn_direct_tx_size = hn_direct_tx_size;
3293 
3294 	/*
3295 	 * Always schedule transmission instead of trying to do direct
3296 	 * transmission.  This gives the best performance so far.
3297 	 */
3298 	txr->hn_sched_tx = 1;
3299 
3300 	parent_dtag = bus_get_dma_tag(dev);
3301 
3302 	/* DMA tag for RNDIS packet messages. */
3303 	error = bus_dma_tag_create(parent_dtag, /* parent */
3304 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3305 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3306 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3307 	    BUS_SPACE_MAXADDR,		/* highaddr */
3308 	    NULL, NULL,			/* filter, filterarg */
3309 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3310 	    1,				/* nsegments */
3311 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3312 	    0,				/* flags */
3313 	    NULL,			/* lockfunc */
3314 	    NULL,			/* lockfuncarg */
3315 	    &txr->hn_tx_rndis_dtag);
3316 	if (error) {
3317 		device_printf(dev, "failed to create rndis dmatag\n");
3318 		return error;
3319 	}
3320 
3321 	/* DMA tag for data. */
3322 	error = bus_dma_tag_create(parent_dtag, /* parent */
3323 	    1,				/* alignment */
3324 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3325 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3326 	    BUS_SPACE_MAXADDR,		/* highaddr */
3327 	    NULL, NULL,			/* filter, filterarg */
3328 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3329 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3330 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3331 	    0,				/* flags */
3332 	    NULL,			/* lockfunc */
3333 	    NULL,			/* lockfuncarg */
3334 	    &txr->hn_tx_data_dtag);
3335 	if (error) {
3336 		device_printf(dev, "failed to create data dmatag\n");
3337 		return error;
3338 	}
3339 
3340 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3341 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3342 
3343 		txd->txr = txr;
3344 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3345 		STAILQ_INIT(&txd->agg_list);
3346 
3347 		/*
3348 		 * Allocate and load RNDIS packet message.
3349 		 */
3350 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3351 		    (void **)&txd->rndis_pkt,
3352 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3353 		    &txd->rndis_pkt_dmap);
3354 		if (error) {
3355 			device_printf(dev,
3356 			    "failed to allocate rndis_packet_msg, %d\n", i);
3357 			return error;
3358 		}
3359 
3360 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3361 		    txd->rndis_pkt_dmap,
3362 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3363 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3364 		    BUS_DMA_NOWAIT);
3365 		if (error) {
3366 			device_printf(dev,
3367 			    "failed to load rndis_packet_msg, %d\n", i);
3368 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3369 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3370 			return error;
3371 		}
3372 
3373 		/* DMA map for TX data. */
3374 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3375 		    &txd->data_dmap);
3376 		if (error) {
3377 			device_printf(dev,
3378 			    "failed to allocate tx data dmamap\n");
3379 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3380 			    txd->rndis_pkt_dmap);
3381 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3382 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3383 			return error;
3384 		}
3385 
3386 		/* All set, put it to list */
3387 		txd->flags |= HN_TXD_FLAG_ONLIST;
3388 #ifndef HN_USE_TXDESC_BUFRING
3389 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3390 #else
3391 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3392 #endif
3393 	}
3394 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3395 
3396 	if (sc->hn_tx_sysctl_tree != NULL) {
3397 		struct sysctl_oid_list *child;
3398 		struct sysctl_ctx_list *ctx;
3399 		char name[16];
3400 
3401 		/*
3402 		 * Create per TX ring sysctl tree:
3403 		 * dev.hn.UNIT.tx.RINGID
3404 		 */
3405 		ctx = device_get_sysctl_ctx(dev);
3406 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3407 
3408 		snprintf(name, sizeof(name), "%d", id);
3409 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3410 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3411 
3412 		if (txr->hn_tx_sysctl_tree != NULL) {
3413 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3414 
3415 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3416 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3417 			    "# of available TX descs");
3418 #ifdef HN_IFSTART_SUPPORT
3419 			if (!hn_use_if_start)
3420 #endif
3421 			{
3422 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3423 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3424 				    "over active");
3425 			}
3426 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3427 			    CTLFLAG_RW, &txr->hn_pkts,
3428 			    "# of packets transmitted");
3429 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3430 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
3431 		}
3432 	}
3433 
3434 	return 0;
3435 }
3436 
3437 static void
3438 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3439 {
3440 	struct hn_tx_ring *txr = txd->txr;
3441 
3442 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3443 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3444 
3445 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3446 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3447 	    txd->rndis_pkt_dmap);
3448 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3449 }
3450 
3451 static void
3452 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3453 {
3454 	struct hn_txdesc *txd;
3455 
3456 	if (txr->hn_txdesc == NULL)
3457 		return;
3458 
3459 #ifndef HN_USE_TXDESC_BUFRING
3460 	while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3461 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3462 		hn_txdesc_dmamap_destroy(txd);
3463 	}
3464 #else
3465 	mtx_lock(&txr->hn_tx_lock);
3466 	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3467 		hn_txdesc_dmamap_destroy(txd);
3468 	mtx_unlock(&txr->hn_tx_lock);
3469 #endif
3470 
3471 	if (txr->hn_tx_data_dtag != NULL)
3472 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3473 	if (txr->hn_tx_rndis_dtag != NULL)
3474 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3475 
3476 #ifdef HN_USE_TXDESC_BUFRING
3477 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3478 #endif
3479 
3480 	free(txr->hn_txdesc, M_DEVBUF);
3481 	txr->hn_txdesc = NULL;
3482 
3483 	if (txr->hn_mbuf_br != NULL)
3484 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3485 
3486 #ifndef HN_USE_TXDESC_BUFRING
3487 	mtx_destroy(&txr->hn_txlist_spin);
3488 #endif
3489 	mtx_destroy(&txr->hn_tx_lock);
3490 }
3491 
3492 static int
3493 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3494 {
3495 	struct sysctl_oid_list *child;
3496 	struct sysctl_ctx_list *ctx;
3497 	int i;
3498 
3499 	/*
3500 	 * Create TXBUF for chimney sending.
3501 	 *
3502 	 * NOTE: It is shared by all channels.
3503 	 */
3504 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3505 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3506 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3507 	if (sc->hn_chim == NULL) {
3508 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3509 		return (ENOMEM);
3510 	}
3511 
3512 	sc->hn_tx_ring_cnt = ring_cnt;
3513 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3514 
3515 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3516 	    M_DEVBUF, M_WAITOK | M_ZERO);
3517 
3518 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3519 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3520 
3521 	/* Create dev.hn.UNIT.tx sysctl tree */
3522 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3523 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3524 
3525 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3526 		int error;
3527 
3528 		error = hn_tx_ring_create(sc, i);
3529 		if (error)
3530 			return error;
3531 	}
3532 
3533 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3534 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3535 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3536 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3537 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3538 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3539 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3540 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3541 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3542 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3543 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3544 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3545 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3546 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3547 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3548 	    hn_tx_stat_ulong_sysctl, "LU",
3549 	    "# of packet transmission aggregation flush failure");
3550 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3551 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3552 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3553 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3554 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3555 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3556 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3557 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3558 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3559 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3560 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3561 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3562 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3563 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3564 	    "# of total TX descs");
3565 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3566 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3567 	    "Chimney send packet size upper boundary");
3568 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3569 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3570 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3571 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3572 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3573 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3574 	    hn_tx_conf_int_sysctl, "I",
3575 	    "Size of the packet for direct transmission");
3576 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3577 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3578 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3579 	    hn_tx_conf_int_sysctl, "I",
3580 	    "Always schedule transmission "
3581 	    "instead of doing direct transmission");
3582 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3583 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3584 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3585 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3586 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3587 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3588 	    "Applied packet transmission aggregation size");
3589 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3590 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3591 	    hn_txagg_pktmax_sysctl, "I",
3592 	    "Applied packet transmission aggregation packets");
3593 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3594 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3595 	    hn_txagg_align_sysctl, "I",
3596 	    "Applied packet transmission aggregation alignment");
3597 
3598 	return 0;
3599 }
3600 
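/*
 * Apply the given chimney (copy-into-send-buffer) size limit to
 * every TX ring.
 */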
3601 static void
3602 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3603 {
3604 	int i;
3605 
3606 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3607 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3608 }
3609 
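/*
 * Clamp the requested TSO size between the NDIS-imposed minimum
 * (sgmin * MTU) and maximum, then program if_hw_tsomax, excluding
 * the Ethernet and VLAN header lengths.
 */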
3610 static void
3611 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3612 {
3613 	struct ifnet *ifp = sc->hn_ifp;
3614 	int tso_minlen;
3615 
3616 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3617 		return;
3618 
3619 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3620 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3621 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3622 
3623 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3624 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3625 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3626 
3627 	if (tso_maxlen < tso_minlen)
3628 		tso_maxlen = tso_minlen;
3629 	else if (tso_maxlen > IP_MAXPACKET)
3630 		tso_maxlen = IP_MAXPACKET;
3631 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3632 		tso_maxlen = sc->hn_ndis_tso_szmax;
3633 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3634 	if (bootverbose)
3635 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3636 }
3637 
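/*
 * Propagate the negotiated chimney size and the checksum offload
 * assistance implied by hn_caps to all TX rings, and enable the
 * HASHVAL pktinfo on the TX path if the host supports it.
 */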
3638 static void
3639 hn_fixup_tx_data(struct hn_softc *sc)
3640 {
3641 	uint64_t csum_assist;
3642 	int i;
3643 
3644 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3645 	if (hn_tx_chimney_size > 0 &&
3646 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3647 		hn_set_chim_size(sc, hn_tx_chimney_size);
3648 
3649 	csum_assist = 0;
3650 	if (sc->hn_caps & HN_CAP_IPCS)
3651 		csum_assist |= CSUM_IP;
3652 	if (sc->hn_caps & HN_CAP_TCP4CS)
3653 		csum_assist |= CSUM_IP_TCP;
3654 	if (sc->hn_caps & HN_CAP_UDP4CS)
3655 		csum_assist |= CSUM_IP_UDP;
3656 	if (sc->hn_caps & HN_CAP_TCP6CS)
3657 		csum_assist |= CSUM_IP6_TCP;
3658 	if (sc->hn_caps & HN_CAP_UDP6CS)
3659 		csum_assist |= CSUM_IP6_UDP;
3660 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3661 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3662 
3663 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3664 		/*
3665 		 * Support HASHVAL pktinfo on TX path.
3666 		 */
3667 		if (bootverbose)
3668 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3669 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3670 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3671 	}
3672 }
3673 
3674 static void
3675 hn_destroy_tx_data(struct hn_softc *sc)
3676 {
3677 	int i;
3678 
3679 	if (sc->hn_chim != NULL) {
3680 		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3681 		sc->hn_chim = NULL;
3682 	}
3683 
3684 	if (sc->hn_tx_ring_cnt == 0)
3685 		return;
3686 
3687 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3688 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3689 
3690 	free(sc->hn_tx_ring, M_DEVBUF);
3691 	sc->hn_tx_ring = NULL;
3692 
3693 	sc->hn_tx_ring_cnt = 0;
3694 	sc->hn_tx_ring_inuse = 0;
3695 }
3696 
3697 #ifdef HN_IFSTART_SUPPORT
3698 
3699 static void
3700 hn_start_taskfunc(void *xtxr, int pending __unused)
3701 {
3702 	struct hn_tx_ring *txr = xtxr;
3703 
3704 	mtx_lock(&txr->hn_tx_lock);
3705 	hn_start_locked(txr, 0);
3706 	mtx_unlock(&txr->hn_tx_lock);
3707 }
3708 
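/*
 * Drain the if_start send queue under the TX ring lock.  Returns
 * non-zero if the remaining packets should be handed over to the
 * TX taskqueue, e.g. when a packet exceeds the direct-TX size.
 */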
3709 static int
3710 hn_start_locked(struct hn_tx_ring *txr, int len)
3711 {
3712 	struct hn_softc *sc = txr->hn_sc;
3713 	struct ifnet *ifp = sc->hn_ifp;
3714 	int sched = 0;
3715 
3716 	KASSERT(hn_use_if_start,
3717 	    ("hn_start_locked is called when if_start is disabled"));
3718 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3719 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3720 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3721 
3722 	if (__predict_false(txr->hn_suspended))
3723 		return (0);
3724 
3725 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3726 	    IFF_DRV_RUNNING)
3727 		return (0);
3728 
3729 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3730 		struct hn_txdesc *txd;
3731 		struct mbuf *m_head;
3732 		int error;
3733 
3734 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3735 		if (m_head == NULL)
3736 			break;
3737 
3738 		if (len > 0 && m_head->m_pkthdr.len > len) {
3739 			/*
3740 			 * This send could be time consuming; let callers
3741 			 * dispatch this packet (and any following packets)
3742 			 * to the tx taskqueue.
3743 			 */
3744 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3745 			sched = 1;
3746 			break;
3747 		}
3748 
3749 #if defined(INET6) || defined(INET)
3750 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3751 			m_head = hn_tso_fixup(m_head);
3752 			if (__predict_false(m_head == NULL)) {
3753 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3754 				continue;
3755 			}
3756 		}
3757 #endif
3758 
3759 		txd = hn_txdesc_get(txr);
3760 		if (txd == NULL) {
3761 			txr->hn_no_txdescs++;
3762 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3763 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3764 			break;
3765 		}
3766 
3767 		error = hn_encap(ifp, txr, txd, &m_head);
3768 		if (error) {
3769 			/* Both txd and m_head are freed */
3770 			KASSERT(txr->hn_agg_txd == NULL,
3771 			    ("encap failed w/ pending aggregating txdesc"));
3772 			continue;
3773 		}
3774 
3775 		if (txr->hn_agg_pktleft == 0) {
3776 			if (txr->hn_agg_txd != NULL) {
3777 				KASSERT(m_head == NULL,
3778 				    ("pending mbuf for aggregating txdesc"));
3779 				error = hn_flush_txagg(ifp, txr);
3780 				if (__predict_false(error)) {
3781 					atomic_set_int(&ifp->if_drv_flags,
3782 					    IFF_DRV_OACTIVE);
3783 					break;
3784 				}
3785 			} else {
3786 				KASSERT(m_head != NULL, ("mbuf was freed"));
3787 				error = hn_txpkt(ifp, txr, txd);
3788 				if (__predict_false(error)) {
3789 					/* txd is freed, but m_head is not */
3790 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3791 					atomic_set_int(&ifp->if_drv_flags,
3792 					    IFF_DRV_OACTIVE);
3793 					break;
3794 				}
3795 			}
3796 		}
3797 #ifdef INVARIANTS
3798 		else {
3799 			KASSERT(txr->hn_agg_txd != NULL,
3800 			    ("no aggregating txdesc"));
3801 			KASSERT(m_head == NULL,
3802 			    ("pending mbuf for aggregating txdesc"));
3803 		}
3804 #endif
3805 	}
3806 
3807 	/* Flush pending aggregated transmission. */
3808 	if (txr->hn_agg_txd != NULL)
3809 		hn_flush_txagg(ifp, txr);
3810 	return (sched);
3811 }
3812 
3813 static void
3814 hn_start(struct ifnet *ifp)
3815 {
3816 	struct hn_softc *sc = ifp->if_softc;
3817 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3818 
3819 	if (txr->hn_sched_tx)
3820 		goto do_sched;
3821 
3822 	if (mtx_trylock(&txr->hn_tx_lock)) {
3823 		int sched;
3824 
3825 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3826 		mtx_unlock(&txr->hn_tx_lock);
3827 		if (!sched)
3828 			return;
3829 	}
3830 do_sched:
3831 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3832 }
3833 
3834 static void
3835 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3836 {
3837 	struct hn_tx_ring *txr = xtxr;
3838 
3839 	mtx_lock(&txr->hn_tx_lock);
3840 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3841 	hn_start_locked(txr, 0);
3842 	mtx_unlock(&txr->hn_tx_lock);
3843 }
3844 
3845 static void
3846 hn_start_txeof(struct hn_tx_ring *txr)
3847 {
3848 	struct hn_softc *sc = txr->hn_sc;
3849 	struct ifnet *ifp = sc->hn_ifp;
3850 
3851 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3852 
3853 	if (txr->hn_sched_tx)
3854 		goto do_sched;
3855 
3856 	if (mtx_trylock(&txr->hn_tx_lock)) {
3857 		int sched;
3858 
3859 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3860 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3861 		mtx_unlock(&txr->hn_tx_lock);
3862 		if (sched) {
3863 			taskqueue_enqueue(txr->hn_tx_taskq,
3864 			    &txr->hn_tx_task);
3865 		}
3866 	} else {
3867 do_sched:
3868 		/*
3869 		 * Release OACTIVE earlier, in the hope that others
3870 		 * can catch up.  The task will clear the flag again
3871 		 * while holding hn_tx_lock, to avoid possible
3872 		 * races.
3873 		 */
3874 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3875 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3876 	}
3877 }
3878 
3879 #endif	/* HN_IFSTART_SUPPORT */
3880 
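/*
 * Multi-queue transmit path: drain the per-ring buf_ring and
 * encapsulate/send each packet, aggregating where possible.
 * Returns non-zero if the remaining work should be deferred to
 * the TX taskqueue.
 */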
3881 static int
3882 hn_xmit(struct hn_tx_ring *txr, int len)
3883 {
3884 	struct hn_softc *sc = txr->hn_sc;
3885 	struct ifnet *ifp = sc->hn_ifp;
3886 	struct mbuf *m_head;
3887 	int sched = 0;
3888 
3889 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3890 #ifdef HN_IFSTART_SUPPORT
3891 	KASSERT(hn_use_if_start == 0,
3892 	    ("hn_xmit is called when if_start is enabled"));
3893 #endif
3894 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3895 
3896 	if (__predict_false(txr->hn_suspended))
3897 		return (0);
3898 
3899 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3900 		return (0);
3901 
3902 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3903 		struct hn_txdesc *txd;
3904 		int error;
3905 
3906 		if (len > 0 && m_head->m_pkthdr.len > len) {
3907 			/*
3908 			 * This send could be time consuming; let callers
3909 			 * dispatch this packet (and any following packets)
3910 			 * to the tx taskqueue.
3911 			 */
3912 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3913 			sched = 1;
3914 			break;
3915 		}
3916 
3917 		txd = hn_txdesc_get(txr);
3918 		if (txd == NULL) {
3919 			txr->hn_no_txdescs++;
3920 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3921 			txr->hn_oactive = 1;
3922 			break;
3923 		}
3924 
3925 		error = hn_encap(ifp, txr, txd, &m_head);
3926 		if (error) {
3927 			/* Both txd and m_head are freed; discard */
3928 			KASSERT(txr->hn_agg_txd == NULL,
3929 			    ("encap failed w/ pending aggregating txdesc"));
3930 			drbr_advance(ifp, txr->hn_mbuf_br);
3931 			continue;
3932 		}
3933 
3934 		if (txr->hn_agg_pktleft == 0) {
3935 			if (txr->hn_agg_txd != NULL) {
3936 				KASSERT(m_head == NULL,
3937 				    ("pending mbuf for aggregating txdesc"));
3938 				error = hn_flush_txagg(ifp, txr);
3939 				if (__predict_false(error)) {
3940 					txr->hn_oactive = 1;
3941 					break;
3942 				}
3943 			} else {
3944 				KASSERT(m_head != NULL, ("mbuf was freed"));
3945 				error = hn_txpkt(ifp, txr, txd);
3946 				if (__predict_false(error)) {
3947 					/* txd is freed, but m_head is not */
3948 					drbr_putback(ifp, txr->hn_mbuf_br,
3949 					    m_head);
3950 					txr->hn_oactive = 1;
3951 					break;
3952 				}
3953 			}
3954 		}
3955 #ifdef INVARIANTS
3956 		else {
3957 			KASSERT(txr->hn_agg_txd != NULL,
3958 			    ("no aggregating txdesc"));
3959 			KASSERT(m_head == NULL,
3960 			    ("pending mbuf for aggregating txdesc"));
3961 		}
3962 #endif
3963 
3964 		/* Sent */
3965 		drbr_advance(ifp, txr->hn_mbuf_br);
3966 	}
3967 
3968 	/* Flush pending aggregated transmission. */
3969 	if (txr->hn_agg_txd != NULL)
3970 		hn_flush_txagg(ifp, txr);
3971 	return (sched);
3972 }
3973 
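/*
 * if_transmit method: fix up TSO headers early, pick a TX ring
 * based on the mbuf flowid, enqueue the packet onto that ring's
 * buf_ring and kick transmission, either directly or through the
 * TX taskqueue.
 */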
3974 static int
3975 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3976 {
3977 	struct hn_softc *sc = ifp->if_softc;
3978 	struct hn_tx_ring *txr;
3979 	int error, idx = 0;
3980 
3981 #if defined(INET6) || defined(INET)
3982 	/*
3983 	 * Perform TSO packet header fixup now, since the TSO
3984 	 * packet header should be cache-hot.
3985 	 */
3986 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3987 		m = hn_tso_fixup(m);
3988 		if (__predict_false(m == NULL)) {
3989 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3990 			return EIO;
3991 		}
3992 	}
3993 #endif
3994 
3995 	/*
3996 	 * Select the TX ring based on flowid
3997 	 */
3998 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3999 		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4000 	txr = &sc->hn_tx_ring[idx];
4001 
4002 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4003 	if (error) {
4004 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4005 		return error;
4006 	}
4007 
4008 	if (txr->hn_oactive)
4009 		return 0;
4010 
4011 	if (txr->hn_sched_tx)
4012 		goto do_sched;
4013 
4014 	if (mtx_trylock(&txr->hn_tx_lock)) {
4015 		int sched;
4016 
4017 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4018 		mtx_unlock(&txr->hn_tx_lock);
4019 		if (!sched)
4020 			return 0;
4021 	}
4022 do_sched:
4023 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4024 	return 0;
4025 }
4026 
4027 static void
4028 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4029 {
4030 	struct mbuf *m;
4031 
4032 	mtx_lock(&txr->hn_tx_lock);
4033 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4034 		m_freem(m);
4035 	mtx_unlock(&txr->hn_tx_lock);
4036 }
4037 
4038 static void
4039 hn_xmit_qflush(struct ifnet *ifp)
4040 {
4041 	struct hn_softc *sc = ifp->if_softc;
4042 	int i;
4043 
4044 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4045 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4046 	if_qflush(ifp);
4047 }
4048 
4049 static void
4050 hn_xmit_txeof(struct hn_tx_ring *txr)
4051 {
4052 
4053 	if (txr->hn_sched_tx)
4054 		goto do_sched;
4055 
4056 	if (mtx_trylock(&txr->hn_tx_lock)) {
4057 		int sched;
4058 
4059 		txr->hn_oactive = 0;
4060 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
4061 		mtx_unlock(&txr->hn_tx_lock);
4062 		if (sched) {
4063 			taskqueue_enqueue(txr->hn_tx_taskq,
4064 			    &txr->hn_tx_task);
4065 		}
4066 	} else {
4067 do_sched:
4068 		 * Release oactive earlier, in the hope that others
4069 		 * can catch up.  The task will clear oactive again
4070 		 * while holding hn_tx_lock, to avoid possible
4071 		 * races.
4072 		 * races.
4073 		 */
4074 		txr->hn_oactive = 0;
4075 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4076 	}
4077 }
4078 
4079 static void
4080 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4081 {
4082 	struct hn_tx_ring *txr = xtxr;
4083 
4084 	mtx_lock(&txr->hn_tx_lock);
4085 	hn_xmit(txr, 0);
4086 	mtx_unlock(&txr->hn_tx_lock);
4087 }
4088 
4089 static void
4090 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4091 {
4092 	struct hn_tx_ring *txr = xtxr;
4093 
4094 	mtx_lock(&txr->hn_tx_lock);
4095 	txr->hn_oactive = 0;
4096 	hn_xmit(txr, 0);
4097 	mtx_unlock(&txr->hn_tx_lock);
4098 }
4099 
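/*
 * Associate a VMBus channel with its RX (and, if available, TX)
 * ring, bind the channel to a CPU, and open it using the ring
 * buffer memory owned by the RX ring.
 */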
4100 static int
4101 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4102 {
4103 	struct vmbus_chan_br cbr;
4104 	struct hn_rx_ring *rxr;
4105 	struct hn_tx_ring *txr = NULL;
4106 	int idx, error;
4107 
4108 	idx = vmbus_chan_subidx(chan);
4109 
4110 	/*
4111 	 * Link this channel to RX/TX ring.
4112 	 */
4113 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4114 	    ("invalid channel index %d, should be >= 0 && < %d",
4115 	     idx, sc->hn_rx_ring_inuse));
4116 	rxr = &sc->hn_rx_ring[idx];
4117 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4118 	    ("RX ring %d already attached", idx));
4119 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4120 
4121 	if (bootverbose) {
4122 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4123 		    idx, vmbus_chan_id(chan));
4124 	}
4125 
4126 	if (idx < sc->hn_tx_ring_inuse) {
4127 		txr = &sc->hn_tx_ring[idx];
4128 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4129 		    ("TX ring %d already attached", idx));
4130 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4131 
4132 		txr->hn_chan = chan;
4133 		if (bootverbose) {
4134 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4135 			    idx, vmbus_chan_id(chan));
4136 		}
4137 	}
4138 
4139 	/* Bind this channel to a proper CPU. */
4140 	vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
4141 
4142 	/*
4143 	 * Open this channel
4144 	 */
4145 	cbr.cbr = rxr->hn_br;
4146 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4147 	cbr.cbr_txsz = HN_TXBR_SIZE;
4148 	cbr.cbr_rxsz = HN_RXBR_SIZE;
4149 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4150 	if (error) {
4151 		if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4152 		    vmbus_chan_id(chan), error);
4153 		rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4154 		if (txr != NULL)
4155 			txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4156 	}
4157 	return (error);
4158 }
4159 
4160 static void
4161 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4162 {
4163 	struct hn_rx_ring *rxr;
4164 	int idx;
4165 
4166 	idx = vmbus_chan_subidx(chan);
4167 
4168 	/*
4169 	 * Link this channel to RX/TX ring.
4170 	 */
4171 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4172 	    ("invalid channel index %d, should be >= 0 && < %d",
4173 	     idx, sc->hn_rx_ring_inuse));
4174 	rxr = &sc->hn_rx_ring[idx];
4175 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4176 	    ("RX ring %d is not attached", idx));
4177 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4178 
4179 	if (idx < sc->hn_tx_ring_inuse) {
4180 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4181 
4182 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4183 		    ("TX ring %d is not attached", idx));
4184 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4185 	}
4186 
4187 	/*
4188 	 * Close this channel.
4189 	 *
4190 	 * NOTE:
4191 	 * Channel closing does _not_ destroy the target channel.
4192 	 */
4193 	vmbus_chan_close(chan);
4194 }
4195 
4196 static int
4197 hn_attach_subchans(struct hn_softc *sc)
4198 {
4199 	struct vmbus_channel **subchans;
4200 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4201 	int i, error = 0;
4202 
4203 	if (subchan_cnt == 0)
4204 		return (0);
4205 
4206 	/* Attach the sub-channels. */
4207 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4208 	for (i = 0; i < subchan_cnt; ++i) {
4209 		error = hn_chan_attach(sc, subchans[i]);
4210 		if (error)
4211 			break;
4212 	}
4213 	vmbus_subchan_rel(subchans, subchan_cnt);
4214 
4215 	if (error) {
4216 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4217 	} else {
4218 		if (bootverbose) {
4219 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4220 			    subchan_cnt);
4221 		}
4222 	}
4223 	return (error);
4224 }
4225 
4226 static void
4227 hn_detach_allchans(struct hn_softc *sc)
4228 {
4229 	struct vmbus_channel **subchans;
4230 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4231 	int i;
4232 
4233 	if (subchan_cnt == 0)
4234 		goto back;
4235 
4236 	/* Detach the sub-channels. */
4237 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4238 	for (i = 0; i < subchan_cnt; ++i)
4239 		hn_chan_detach(sc, subchans[i]);
4240 	vmbus_subchan_rel(subchans, subchan_cnt);
4241 
4242 back:
4243 	/*
4244 	 * Detach the primary channel, _after_ all sub-channels
4245 	 * are detached.
4246 	 */
4247 	hn_chan_detach(sc, sc->hn_prichan);
4248 
4249 	/* Wait for sub-channels to be destroyed, if any. */
4250 	vmbus_subchan_drain(sc->hn_prichan);
4251 
4252 #ifdef INVARIANTS
4253 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4254 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4255 		    HN_RX_FLAG_ATTACHED) == 0,
4256 		    ("%dth RX ring is still attached", i));
4257 	}
4258 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4259 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4260 		    HN_TX_FLAG_ATTACHED) == 0,
4261 		    ("%dth TX ring is still attached", i));
4262 	}
4263 #endif
4264 }
4265 
4266 static int
4267 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4268 {
4269 	struct vmbus_channel **subchans;
4270 	int nchan, rxr_cnt, error;
4271 
4272 	nchan = *nsubch + 1;
4273 	if (nchan == 1) {
4274 		/*
4275 		 * Multiple RX/TX rings are not requested.
4276 		 */
4277 		*nsubch = 0;
4278 		return (0);
4279 	}
4280 
4281 	/*
4282 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4283 	 * table entries.
4284 	 */
4285 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4286 	if (error) {
4287 		/* No RSS; this is benign. */
4288 		*nsubch = 0;
4289 		return (0);
4290 	}
4291 	if (bootverbose) {
4292 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4293 		    rxr_cnt, nchan);
4294 	}
4295 
4296 	if (nchan > rxr_cnt)
4297 		nchan = rxr_cnt;
4298 	if (nchan == 1) {
4299 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4300 		*nsubch = 0;
4301 		return (0);
4302 	}
4303 
4304 	/*
4305 	 * Allocate sub-channels from NVS.
4306 	 */
4307 	*nsubch = nchan - 1;
4308 	error = hn_nvs_alloc_subchans(sc, nsubch);
4309 	if (error || *nsubch == 0) {
4310 		/* Failed to allocate sub-channels. */
4311 		*nsubch = 0;
4312 		return (0);
4313 	}
4314 
4315 	/*
4316 	 * Wait for all sub-channels to become ready before moving on.
4317 	 */
4318 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4319 	vmbus_subchan_rel(subchans, *nsubch);
4320 	return (0);
4321 }
4322 
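/*
 * Attach the synthetic parts: primary channel, NVS and RNDIS, in
 * that order; then allocate and attach sub-channels and configure
 * RSS for multi-queue operation, if possible.
 */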
4323 static int
4324 hn_synth_attach(struct hn_softc *sc, int mtu)
4325 {
4326 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4327 	int error, nsubch, nchan, i;
4328 	uint32_t old_caps;
4329 
4330 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4331 	    ("synthetic parts were attached"));
4332 
4333 	/* Save capabilities for later verification. */
4334 	old_caps = sc->hn_caps;
4335 	sc->hn_caps = 0;
4336 
4337 	/* Clear RSS stuffs. */
4338 	sc->hn_rss_ind_size = 0;
4339 	sc->hn_rss_hash = 0;
4340 
4341 	/*
4342 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
4343 	 */
4344 	error = hn_chan_attach(sc, sc->hn_prichan);
4345 	if (error)
4346 		return (error);
4347 
4348 	/*
4349 	 * Attach NVS.
4350 	 */
4351 	error = hn_nvs_attach(sc, mtu);
4352 	if (error)
4353 		return (error);
4354 
4355 	/*
4356 	 * Attach RNDIS _after_ NVS is attached.
4357 	 */
4358 	error = hn_rndis_attach(sc, mtu);
4359 	if (error)
4360 		return (error);
4361 
4362 	/*
4363 	 * Make sure capabilities are not changed.
4364 	 */
4365 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4366 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4367 		    old_caps, sc->hn_caps);
4368 		/* Restore old capabilities and abort. */
4369 		sc->hn_caps = old_caps;
4370 		return ENXIO;
4371 	}
4372 
4373 	/*
4374 	 * Allocate sub-channels for multi-TX/RX rings.
4375 	 *
4376 	 * NOTE:
4377 	 * The # of RX rings that can be used is equivalent to the # of
4378 	 * channels to be requested.
4379 	 */
4380 	nsubch = sc->hn_rx_ring_cnt - 1;
4381 	error = hn_synth_alloc_subchans(sc, &nsubch);
4382 	if (error)
4383 		return (error);
4384 
4385 	nchan = nsubch + 1;
4386 	if (nchan == 1) {
4387 		/* Only the primary channel can be used; done */
4388 		goto back;
4389 	}
4390 
4391 	/*
4392 	 * Configure RSS key and indirect table _after_ all sub-channels
4393 	 * are allocated.
4394 	 */
4395 
4396 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4397 		/*
4398 		 * RSS key is not set yet; set it to the default RSS key.
4399 		 */
4400 		if (bootverbose)
4401 			if_printf(sc->hn_ifp, "setup default RSS key\n");
4402 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4403 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4404 	}
4405 
4406 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4407 		/*
4408 		 * RSS indirect table is not set yet; set it up in round-
4409 		 * robin fashion.
4410 		 */
4411 		if (bootverbose) {
4412 			if_printf(sc->hn_ifp, "setup default RSS indirect "
4413 			    "table\n");
4414 		}
4415 		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4416 			rss->rss_ind[i] = i % nchan;
4417 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4418 	} else {
4419 		/*
4420 		 * # of usable channels may be changed, so we have to
4421 		 * make sure that all entries in RSS indirect table
4422 		 * are valid.
4423 		 */
4424 		hn_rss_ind_fixup(sc, nchan);
4425 	}
4426 
4427 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4428 	if (error) {
4429 		/*
4430 		 * Failed to configure RSS key or indirect table; only
4431 		 * the primary channel can be used.
4432 		 */
4433 		nchan = 1;
4434 	}
4435 back:
4436 	/*
4437 	 * Set the # of TX/RX rings that could be used according to
4438 	 * the # of channels that NVS offered.
4439 	 */
4440 	hn_set_ring_inuse(sc, nchan);
4441 
4442 	/*
4443 	 * Attach the sub-channels, if any.
4444 	 */
4445 	error = hn_attach_subchans(sc);
4446 	if (error)
4447 		return (error);
4448 
4449 	/*
4450 	 * Fixup transmission aggregation setup.
4451 	 */
4452 	hn_set_txagg(sc);
4453 
4454 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4455 	return (0);
4456 }
4457 
4458 /*
4459  * NOTE:
4460  * The interface must have been suspended through hn_suspend(), before
4461  * this function gets called.
4462  */
4463 static void
4464 hn_synth_detach(struct hn_softc *sc)
4465 {
4466 	HN_LOCK_ASSERT(sc);
4467 
4468 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4469 	    ("synthetic parts were not attached"));
4470 
4471 	/* Detach the RNDIS first. */
4472 	hn_rndis_detach(sc);
4473 
4474 	/* Detach NVS. */
4475 	hn_nvs_detach(sc);
4476 
4477 	/* Detach all of the channels. */
4478 	hn_detach_allchans(sc);
4479 
4480 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4481 }
4482 
4483 static void
4484 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4485 {
4486 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4487 	    ("invalid ring count %d", ring_cnt));
4488 
4489 	if (sc->hn_tx_ring_cnt > ring_cnt)
4490 		sc->hn_tx_ring_inuse = ring_cnt;
4491 	else
4492 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4493 	sc->hn_rx_ring_inuse = ring_cnt;
4494 
4495 	if (bootverbose) {
4496 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4497 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4498 	}
4499 }
4500 
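/*
 * Wait until both directions of the channel's bufring are empty,
 * then drain the channel's pending interrupt processing.
 */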
4501 static void
4502 hn_chan_drain(struct vmbus_channel *chan)
4503 {
4504 
4505 	while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
4506 		pause("waitch", 1);
4507 	vmbus_chan_intr_drain(chan);
4508 }
4509 
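/*
 * Quiesce the data path: mark all in-use TX rings suspended and
 * wait for pending sends to complete, clear the RX filter, then
 * drain all channel bufrings and interrupts.
 */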
4510 static void
4511 hn_suspend_data(struct hn_softc *sc)
4512 {
4513 	struct vmbus_channel **subch = NULL;
4514 	int i, nsubch;
4515 
4516 	HN_LOCK_ASSERT(sc);
4517 
4518 	/*
4519 	 * Suspend TX.
4520 	 */
4521 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4522 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4523 
4524 		mtx_lock(&txr->hn_tx_lock);
4525 		txr->hn_suspended = 1;
4526 		mtx_unlock(&txr->hn_tx_lock);
4527 		/* No one is able to send more packets now. */
4528 
4529 		/* Wait for all pending sends to finish. */
4530 		while (hn_tx_ring_pending(txr))
4531 			pause("hnwtx", 1 /* 1 tick */);
4532 
4533 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4534 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4535 	}
4536 
4537 	/*
4538 	 * Disable RX by clearing RX filter.
4539 	 */
4540 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4541 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4542 
4543 	/*
4544 	 * Give RNDIS enough time to flush all pending data packets.
4545 	 */
4546 	pause("waitrx", (200 * hz) / 1000);
4547 
4548 	/*
4549 	 * Drain RX/TX bufrings and interrupts.
4550 	 */
4551 	nsubch = sc->hn_rx_ring_inuse - 1;
4552 	if (nsubch > 0)
4553 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4554 
4555 	if (subch != NULL) {
4556 		for (i = 0; i < nsubch; ++i)
4557 			hn_chan_drain(subch[i]);
4558 	}
4559 	hn_chan_drain(sc->hn_prichan);
4560 
4561 	if (subch != NULL)
4562 		vmbus_subchan_rel(subch, nsubch);
4563 }
4564 
4565 static void
4566 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4567 {
4568 
4569 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4570 }
4571 
4572 static void
4573 hn_suspend_mgmt(struct hn_softc *sc)
4574 {
4575 	struct task task;
4576 
4577 	HN_LOCK_ASSERT(sc);
4578 
4579 	/*
4580 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4581 	 * through hn_mgmt_taskq.
4582 	 */
4583 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4584 	vmbus_chan_run_task(sc->hn_prichan, &task);
4585 
4586 	/*
4587 	 * Make sure that all pending management tasks are completed.
4588 	 */
4589 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4590 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4591 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4592 }
4593 
4594 static void
4595 hn_suspend(struct hn_softc *sc)
4596 {
4597 
4598 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4599 		hn_suspend_data(sc);
4600 	hn_suspend_mgmt(sc);
4601 }
4602 
4603 static void
4604 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4605 {
4606 	int i;
4607 
4608 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4609 	    ("invalid TX ring count %d", tx_ring_cnt));
4610 
4611 	for (i = 0; i < tx_ring_cnt; ++i) {
4612 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4613 
4614 		mtx_lock(&txr->hn_tx_lock);
4615 		txr->hn_suspended = 0;
4616 		mtx_unlock(&txr->hn_tx_lock);
4617 	}
4618 }
4619 
4620 static void
4621 hn_resume_data(struct hn_softc *sc)
4622 {
4623 	int i;
4624 
4625 	HN_LOCK_ASSERT(sc);
4626 
4627 	/*
4628 	 * Re-enable RX.
4629 	 */
4630 	hn_set_rxfilter(sc);
4631 
4632 	/*
4633 	 * Make sure to clear suspend status on "all" TX rings,
4634 	 * since hn_tx_ring_inuse can be changed after
4635 	 * hn_suspend_data().
4636 	 */
4637 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4638 
4639 #ifdef HN_IFSTART_SUPPORT
4640 	if (!hn_use_if_start)
4641 #endif
4642 	{
4643 		/*
4644 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4645 		 * reduced.
4646 		 */
4647 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4648 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4649 	}
4650 
4651 	/*
4652 	 * Kick start TX.
4653 	 */
4654 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4655 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4656 
4657 		/*
4658 		 * Use txeof task, so that any pending oactive can be
4659 		 * cleared properly.
4660 		 */
4661 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4662 	}
4663 }
4664 
4665 static void
4666 hn_resume_mgmt(struct hn_softc *sc)
4667 {
4668 
4669 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4670 
4671 	/*
4672 	 * Kick off network change detection, if it was pending.
4673 	 * If no network change was pending, start link status
4674 	 * checks, which is more lightweight than network change
4675 	 * detection.
4676 	 */
4677 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4678 		hn_change_network(sc);
4679 	else
4680 		hn_update_link_status(sc);
4681 }
4682 
4683 static void
4684 hn_resume(struct hn_softc *sc)
4685 {
4686 
4687 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4688 		hn_resume_data(sc);
4689 	hn_resume_mgmt(sc);
4690 }
4691 
4692 static void
4693 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4694 {
4695 	const struct rndis_status_msg *msg;
4696 	int ofs;
4697 
4698 	if (dlen < sizeof(*msg)) {
4699 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4700 		return;
4701 	}
4702 	msg = data;
4703 
4704 	switch (msg->rm_status) {
4705 	case RNDIS_STATUS_MEDIA_CONNECT:
4706 	case RNDIS_STATUS_MEDIA_DISCONNECT:
4707 		hn_update_link_status(sc);
4708 		break;
4709 
4710 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4711 		/* Not really useful; ignore. */
4712 		break;
4713 
4714 	case RNDIS_STATUS_NETWORK_CHANGE:
4715 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4716 		if (dlen < ofs + msg->rm_stbuflen ||
4717 		    msg->rm_stbuflen < sizeof(uint32_t)) {
4718 			if_printf(sc->hn_ifp, "network changed\n");
4719 		} else {
4720 			uint32_t change;
4721 
4722 			memcpy(&change, ((const uint8_t *)msg) + ofs,
4723 			    sizeof(change));
4724 			if_printf(sc->hn_ifp, "network changed, change %u\n",
4725 			    change);
4726 		}
4727 		hn_change_network(sc);
4728 		break;
4729 
4730 	default:
4731 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4732 		    msg->rm_status);
4733 		break;
4734 	}
4735 }
4736 
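/*
 * Walk the RNDIS per-packet-info list and extract the VLAN,
 * checksum and hash value/info entries into 'info', validating
 * each entry's size and alignment along the way.
 */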
4737 static int
4738 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4739 {
4740 	const struct rndis_pktinfo *pi = info_data;
4741 	uint32_t mask = 0;
4742 
4743 	while (info_dlen != 0) {
4744 		const void *data;
4745 		uint32_t dlen;
4746 
4747 		if (__predict_false(info_dlen < sizeof(*pi)))
4748 			return (EINVAL);
4749 		if (__predict_false(info_dlen < pi->rm_size))
4750 			return (EINVAL);
4751 		info_dlen -= pi->rm_size;
4752 
4753 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4754 			return (EINVAL);
4755 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4756 			return (EINVAL);
4757 		dlen = pi->rm_size - pi->rm_pktinfooffset;
4758 		data = pi->rm_data;
4759 
4760 		switch (pi->rm_type) {
4761 		case NDIS_PKTINFO_TYPE_VLAN:
4762 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4763 				return (EINVAL);
4764 			info->vlan_info = *((const uint32_t *)data);
4765 			mask |= HN_RXINFO_VLAN;
4766 			break;
4767 
4768 		case NDIS_PKTINFO_TYPE_CSUM:
4769 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4770 				return (EINVAL);
4771 			info->csum_info = *((const uint32_t *)data);
4772 			mask |= HN_RXINFO_CSUM;
4773 			break;
4774 
4775 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4776 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4777 				return (EINVAL);
4778 			info->hash_value = *((const uint32_t *)data);
4779 			mask |= HN_RXINFO_HASHVAL;
4780 			break;
4781 
4782 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
4783 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4784 				return (EINVAL);
4785 			info->hash_info = *((const uint32_t *)data);
4786 			mask |= HN_RXINFO_HASHINF;
4787 			break;
4788 
4789 		default:
4790 			goto next;
4791 		}
4792 
4793 		if (mask == HN_RXINFO_ALL) {
4794 			/* All found; done */
4795 			break;
4796 		}
4797 next:
4798 		pi = (const struct rndis_pktinfo *)
4799 		    ((const uint8_t *)pi + pi->rm_size);
4800 	}
4801 
4802 	/*
4803 	 * Final fixup.
4804 	 * - If there is no hash value, invalidate the hash info.
4805 	 */
4806 	if ((mask & HN_RXINFO_HASHVAL) == 0)
4807 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
4808 	return (0);
4809 }
4810 
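/*
 * Return true if the region [off, off + len) overlaps the region
 * [check_off, check_off + check_len).
 */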
4811 static __inline bool
4812 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4813 {
4814 
4815 	if (off < check_off) {
4816 		if (__predict_true(off + len <= check_off))
4817 			return (false);
4818 	} else if (off > check_off) {
4819 		if (__predict_true(check_off + check_len <= off))
4820 			return (false);
4821 	}
4822 	return (true);
4823 }
4824 
4825 static void
4826 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
4827 {
4828 	const struct rndis_packet_msg *pkt;
4829 	struct hn_rxinfo info;
4830 	int data_off, pktinfo_off, data_len, pktinfo_len;
4831 
4832 	/*
4833 	 * Check length.
4834 	 */
4835 	if (__predict_false(dlen < sizeof(*pkt))) {
4836 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
4837 		return;
4838 	}
4839 	pkt = data;
4840 
4841 	if (__predict_false(dlen < pkt->rm_len)) {
4842 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
4843 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
4844 		return;
4845 	}
4846 	if (__predict_false(pkt->rm_len <
4847 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
4848 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
4849 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
4850 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
4851 		    pkt->rm_pktinfolen);
4852 		return;
4853 	}
4854 	if (__predict_false(pkt->rm_datalen == 0)) {
4855 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
4856 		return;
4857 	}
4858 
4859 	/*
4860 	 * Check offsets.
4861 	 */
4862 #define IS_OFFSET_INVALID(ofs)			\
4863 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
4864 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
4865 
4866 	/* XXX Hyper-V does not meet data offset alignment requirement */
4867 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
4868 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4869 		    "data offset %u\n", pkt->rm_dataoffset);
4870 		return;
4871 	}
4872 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
4873 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
4874 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4875 		    "oob offset %u\n", pkt->rm_oobdataoffset);
4876 		return;
4877 	}
4878 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
4879 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
4880 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4881 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
4882 		return;
4883 	}
4884 
4885 #undef IS_OFFSET_INVALID
4886 
4887 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
4888 	data_len = pkt->rm_datalen;
4889 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
4890 	pktinfo_len = pkt->rm_pktinfolen;
4891 
4892 	/*
4893 	 * Check OOB coverage.
4894 	 */
4895 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
4896 		int oob_off, oob_len;
4897 
4898 		if_printf(rxr->hn_ifp, "got oobdata\n");
4899 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
4900 		oob_len = pkt->rm_oobdatalen;
4901 
4902 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
4903 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4904 			    "oob overflow, msglen %u, oob abs %d len %d\n",
4905 			    pkt->rm_len, oob_off, oob_len);
4906 			return;
4907 		}
4908 
4909 		/*
4910 		 * Check against data.
4911 		 */
4912 		if (hn_rndis_check_overlap(oob_off, oob_len,
4913 		    data_off, data_len)) {
4914 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4915 			    "oob overlaps data, oob abs %d len %d, "
4916 			    "data abs %d len %d\n",
4917 			    oob_off, oob_len, data_off, data_len);
4918 			return;
4919 		}
4920 
4921 		/*
4922 		 * Check against pktinfo.
4923 		 */
4924 		if (pktinfo_len != 0 &&
4925 		    hn_rndis_check_overlap(oob_off, oob_len,
4926 		    pktinfo_off, pktinfo_len)) {
4927 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4928 			    "oob overlaps pktinfo, oob abs %d len %d, "
4929 			    "pktinfo abs %d len %d\n",
4930 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
4931 			return;
4932 		}
4933 	}
4934 
4935 	/*
4936 	 * Check per-packet-info coverage and find useful per-packet-info.
4937 	 */
4938 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
4939 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
4940 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
4941 	if (__predict_true(pktinfo_len != 0)) {
4942 		bool overlap;
4943 		int error;
4944 
4945 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
4946 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4947 			    "pktinfo overflow, msglen %u, "
4948 			    "pktinfo abs %d len %d\n",
4949 			    pkt->rm_len, pktinfo_off, pktinfo_len);
4950 			return;
4951 		}
4952 
4953 		/*
4954 		 * Check packet info coverage.
4955 		 */
4956 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
4957 		    data_off, data_len);
4958 		if (__predict_false(overlap)) {
4959 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4960 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
4961 			    "data abs %d len %d\n",
4962 			    pktinfo_off, pktinfo_len, data_off, data_len);
4963 			return;
4964 		}
4965 
4966 		/*
4967 		 * Find useful per-packet-info.
4968 		 */
4969 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
4970 		    pktinfo_len, &info);
4971 		if (__predict_false(error)) {
4972 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
4973 			    "pktinfo\n");
4974 			return;
4975 		}
4976 	}
4977 
4978 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
4979 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4980 		    "data overflow, msglen %u, data abs %d len %d\n",
4981 		    pkt->rm_len, data_off, data_len);
4982 		return;
4983 	}
4984 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
4985 }
4986 
4987 static __inline void
4988 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
4989 {
4990 	const struct rndis_msghdr *hdr;
4991 
4992 	if (__predict_false(dlen < sizeof(*hdr))) {
4993 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
4994 		return;
4995 	}
4996 	hdr = data;
4997 
4998 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
4999 		/* Hot data path. */
5000 		hn_rndis_rx_data(rxr, data, dlen);
5001 		/* Done! */
5002 		return;
5003 	}
5004 
5005 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5006 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5007 	else
5008 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5009 }
5010 
5011 static void
5012 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5013 {
5014 	const struct hn_nvs_hdr *hdr;
5015 
5016 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5017 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
5018 		return;
5019 	}
5020 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5021 
5022 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5023 		/* Useless; ignore */
5024 		return;
5025 	}
5026 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5027 }
5028 
5029 static void
5030 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5031     const struct vmbus_chanpkt_hdr *pkt)
5032 {
5033 	struct hn_nvs_sendctx *sndc;
5034 
5035 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5036 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5037 	    VMBUS_CHANPKT_DATALEN(pkt));
5038 	/*
5039 	 * NOTE:
5040 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5041 	 * its callback.
5042 	 */
5043 }
5044 
5045 static void
5046 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5047     const struct vmbus_chanpkt_hdr *pkthdr)
5048 {
5049 	const struct vmbus_chanpkt_rxbuf *pkt;
5050 	const struct hn_nvs_hdr *nvs_hdr;
5051 	int count, i, hlen;
5052 
5053 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5054 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5055 		return;
5056 	}
5057 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5058 
5059 	/* Make sure that this is a RNDIS message. */
5060 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5061 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5062 		    nvs_hdr->nvs_type);
5063 		return;
5064 	}
5065 
5066 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5067 	if (__predict_false(hlen < sizeof(*pkt))) {
5068 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5069 		return;
5070 	}
5071 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5072 
5073 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5074 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5075 		    pkt->cp_rxbuf_id);
5076 		return;
5077 	}
5078 
5079 	count = pkt->cp_rxbuf_cnt;
5080 	if (__predict_false(hlen <
5081 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5082 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5083 		return;
5084 	}
5085 
5086 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5087 	for (i = 0; i < count; ++i) {
5088 		int ofs, len;
5089 
5090 		ofs = pkt->cp_rxbuf[i].rb_ofs;
5091 		len = pkt->cp_rxbuf[i].rb_len;
5092 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5093 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
5094 			    "ofs %d, len %d\n", i, ofs, len);
5095 			continue;
5096 		}
5097 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5098 	}
5099 
5100 	/*
5101 	 * Ack the consumed RXBUF associated w/ this channel packet,
5102 	 * so that this RXBUF can be recycled by the hypervisor.
5103 	 */
5104 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5105 }
5106 
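/*
 * Send an RNDIS ack completion back to the host so the consumed
 * RXBUF region can be recycled; retry a few times if the TX
 * bufring is transiently full.
 */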
5107 static void
5108 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5109     uint64_t tid)
5110 {
5111 	struct hn_nvs_rndis_ack ack;
5112 	int retries, error;
5113 
5114 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5115 	ack.nvs_status = HN_NVS_STATUS_OK;
5116 
5117 	retries = 0;
5118 again:
5119 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5120 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5121 	if (__predict_false(error == EAGAIN)) {
5122 		/*
5123 		 * NOTE:
5124 		 * This should _not_ happen in real world, since the
5125 		 * consumption of the TX bufring from the TX path is
5126 		 * controlled.
5127 		 */
5128 		if (rxr->hn_ack_failed == 0)
5129 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5130 		rxr->hn_ack_failed++;
5131 		retries++;
5132 		if (retries < 10) {
5133 			DELAY(100);
5134 			goto again;
5135 		}
5136 		/* RXBUF leaks! */
5137 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
5138 	}
5139 }
5140 
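/*
 * Channel callback: pull channel packets out of the bufring,
 * growing the per-ring packet buffer on ENOBUFS, and dispatch
 * them as completions, RXBUF data or inband notifications.
 */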
5141 static void
5142 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5143 {
5144 	struct hn_rx_ring *rxr = xrxr;
5145 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
5146 
5147 	for (;;) {
5148 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5149 		int error, pktlen;
5150 
5151 		pktlen = rxr->hn_pktbuf_len;
5152 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5153 		if (__predict_false(error == ENOBUFS)) {
5154 			void *nbuf;
5155 			int nlen;
5156 
5157 			/*
5158 			 * Expand channel packet buffer.
5159 			 *
5160 			 * XXX
5161 			 * Use M_WAITOK here, since allocation failure
5162 			 * is fatal.
5163 			 */
5164 			nlen = rxr->hn_pktbuf_len * 2;
5165 			while (nlen < pktlen)
5166 				nlen *= 2;
5167 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5168 
5169 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5170 			    rxr->hn_pktbuf_len, nlen);
5171 
5172 			free(rxr->hn_pktbuf, M_DEVBUF);
5173 			rxr->hn_pktbuf = nbuf;
5174 			rxr->hn_pktbuf_len = nlen;
5175 			/* Retry! */
5176 			continue;
5177 		} else if (__predict_false(error == EAGAIN)) {
5178 			/* No more channel packets; done! */
5179 			break;
5180 		}
5181 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5182 
5183 		switch (pkt->cph_type) {
5184 		case VMBUS_CHANPKT_TYPE_COMP:
5185 			hn_nvs_handle_comp(sc, chan, pkt);
5186 			break;
5187 
5188 		case VMBUS_CHANPKT_TYPE_RXBUF:
5189 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
5190 			break;
5191 
5192 		case VMBUS_CHANPKT_TYPE_INBAND:
5193 			hn_nvs_handle_notify(sc, pkt);
5194 			break;
5195 
5196 		default:
5197 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5198 			    pkt->cph_type);
5199 			break;
5200 		}
5201 	}
5202 	hn_chan_rollup(rxr, rxr->hn_txr);
5203 }
5204 
5205 static void
5206 hn_tx_taskq_create(void *arg __unused)
5207 {
5208 
5209 	if (vm_guest != VM_GUEST_HV)
5210 		return;
5211 
5212 	if (!hn_share_tx_taskq)
5213 		return;
5214 
5215 	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
5216 	    taskqueue_thread_enqueue, &hn_tx_taskq);
5217 	if (hn_bind_tx_taskq >= 0) {
5218 		int cpu = hn_bind_tx_taskq;
5219 		cpuset_t cpu_set;
5220 
5221 		if (cpu > mp_ncpus - 1)
5222 			cpu = mp_ncpus - 1;
5223 		CPU_SETOF(cpu, &cpu_set);
5224 		taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET,
5225 		    &cpu_set, "hn tx");
5226 	} else {
5227 		taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
5228 	}
5229 }
5230 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5231     hn_tx_taskq_create, NULL);
5232 
5233 static void
5234 hn_tx_taskq_destroy(void *arg __unused)
5235 {
5236 
5237 	if (hn_tx_taskq != NULL)
5238 		taskqueue_free(hn_tx_taskq);
5239 }
5240 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5241     hn_tx_taskq_destroy, NULL);
5242