xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 168fce73b59d6023cab45d063a452551a1f2103e)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57 
58 #include "opt_inet6.h"
59 #include "opt_inet.h"
60 
61 #include <sys/param.h>
62 #include <sys/bus.h>
63 #include <sys/kernel.h>
64 #include <sys/limits.h>
65 #include <sys/malloc.h>
66 #include <sys/mbuf.h>
67 #include <sys/module.h>
68 #include <sys/queue.h>
69 #include <sys/lock.h>
70 #include <sys/smp.h>
71 #include <sys/socket.h>
72 #include <sys/sockio.h>
73 #include <sys/sx.h>
74 #include <sys/sysctl.h>
75 #include <sys/systm.h>
76 #include <sys/taskqueue.h>
77 #include <sys/buf_ring.h>
78 
79 #include <machine/atomic.h>
80 #include <machine/in_cksum.h>
81 
82 #include <net/bpf.h>
83 #include <net/ethernet.h>
84 #include <net/if.h>
85 #include <net/if_media.h>
86 #include <net/if_types.h>
87 #include <net/if_var.h>
88 #include <net/rndis.h>
89 
90 #include <netinet/in_systm.h>
91 #include <netinet/in.h>
92 #include <netinet/ip.h>
93 #include <netinet/ip6.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_lro.h>
96 #include <netinet/udp.h>
97 
98 #include <dev/hyperv/include/hyperv.h>
99 #include <dev/hyperv/include/hyperv_busdma.h>
100 #include <dev/hyperv/include/vmbus.h>
101 #include <dev/hyperv/include/vmbus_xact.h>
102 
103 #include <dev/hyperv/netvsc/ndis.h>
104 #include <dev/hyperv/netvsc/if_hnreg.h>
105 #include <dev/hyperv/netvsc/if_hnvar.h>
106 #include <dev/hyperv/netvsc/hn_nvs.h>
107 #include <dev/hyperv/netvsc/hn_rndis.h>
108 
109 #include "vmbus_if.h"
110 
111 #define HN_IFSTART_SUPPORT
112 
113 #define HN_RING_CNT_DEF_MAX		8
114 
115 /* YYY should get it from the underlying channel */
116 #define HN_TX_DESC_CNT			512
117 
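/*
 * Worst-case size of the pre-built RNDIS packet message: the fixed
 * header plus room for each per-packet-info this driver may attach
 * (hash value, VLAN, LSOv2 and TX checksum).
 */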
118 #define HN_RNDIS_PKT_LEN					\
119 	(sizeof(struct rndis_packet_msg) +			\
120 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
121 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
122 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
123 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
124 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
125 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
126 
127 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
128 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
129 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
130 /* -1 for RNDIS packet message */
131 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
132 
133 #define HN_DIRECT_TX_SIZE_DEF		128
134 
135 #define HN_EARLY_TXEOF_THRESH		8
136 
137 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
138 
139 #define HN_LROENT_CNT_DEF		128
140 
141 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
142 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
143 /* YYY 2*MTU is a bit rough, but should be good enough. */
144 #define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
145 
146 #define HN_LRO_ACKCNT_DEF		1
147 
148 #define HN_LOCK_INIT(sc)		\
149 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
150 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
151 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
152 #define HN_LOCK(sc)			sx_xlock(&(sc)->hn_lock)
153 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
154 
155 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
156 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
157 #define HN_CSUM_IP_HWASSIST(sc)		\
158 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
159 #define HN_CSUM_IP6_HWASSIST(sc)	\
160 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
161 
162 struct hn_txdesc {
163 #ifndef HN_USE_TXDESC_BUFRING
164 	SLIST_ENTRY(hn_txdesc)		link;
165 #endif
166 	struct mbuf			*m;
167 	struct hn_tx_ring		*txr;
168 	int				refs;
169 	uint32_t			flags;	/* HN_TXD_FLAG_ */
170 	struct hn_nvs_sendctx		send_ctx;
171 	uint32_t			chim_index;
172 	int				chim_size;
173 
174 	bus_dmamap_t			data_dmap;
175 
176 	bus_addr_t			rndis_pkt_paddr;
177 	struct rndis_packet_msg		*rndis_pkt;
178 	bus_dmamap_t			rndis_pkt_dmap;
179 };
180 
181 #define HN_TXD_FLAG_ONLIST		0x0001
182 #define HN_TXD_FLAG_DMAMAP		0x0002
183 
184 struct hn_rxinfo {
185 	uint32_t			vlan_info;
186 	uint32_t			csum_info;
187 	uint32_t			hash_info;
188 	uint32_t			hash_value;
189 };
190 
191 #define HN_RXINFO_VLAN			0x0001
192 #define HN_RXINFO_CSUM			0x0002
193 #define HN_RXINFO_HASHINF		0x0004
194 #define HN_RXINFO_HASHVAL		0x0008
195 #define HN_RXINFO_ALL			\
196 	(HN_RXINFO_VLAN |		\
197 	 HN_RXINFO_CSUM |		\
198 	 HN_RXINFO_HASHINF |		\
199 	 HN_RXINFO_HASHVAL)
200 
201 #define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
202 #define HN_NDIS_RXCSUM_INFO_INVALID	0
203 #define HN_NDIS_HASH_INFO_INVALID	0
204 
205 static int			hn_probe(device_t);
206 static int			hn_attach(device_t);
207 static int			hn_detach(device_t);
208 static int			hn_shutdown(device_t);
209 static void			hn_chan_callback(struct vmbus_channel *,
210 				    void *);
211 
212 static void			hn_init(void *);
213 static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
214 #ifdef HN_IFSTART_SUPPORT
215 static void			hn_start(struct ifnet *);
216 #endif
217 static int			hn_transmit(struct ifnet *, struct mbuf *);
218 static void			hn_xmit_qflush(struct ifnet *);
219 static int			hn_ifmedia_upd(struct ifnet *);
220 static void			hn_ifmedia_sts(struct ifnet *,
221 				    struct ifmediareq *);
222 
223 static int			hn_rndis_rxinfo(const void *, int,
224 				    struct hn_rxinfo *);
225 static void			hn_rndis_rx_data(struct hn_rx_ring *,
226 				    const void *, int);
227 static void			hn_rndis_rx_status(struct hn_softc *,
228 				    const void *, int);
229 
230 static void			hn_nvs_handle_notify(struct hn_softc *,
231 				    const struct vmbus_chanpkt_hdr *);
232 static void			hn_nvs_handle_comp(struct hn_softc *,
233 				    struct vmbus_channel *,
234 				    const struct vmbus_chanpkt_hdr *);
235 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
236 				    struct vmbus_channel *,
237 				    const struct vmbus_chanpkt_hdr *);
238 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
239 				    struct vmbus_channel *, uint64_t);
240 
241 #if __FreeBSD_version >= 1100099
242 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
243 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
244 #endif
245 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
246 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
247 #if __FreeBSD_version < 1100095
248 static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
249 #else
250 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
251 #endif
252 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
253 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
254 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
255 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
256 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
257 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
258 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
259 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
260 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
261 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
262 
263 static void			hn_stop(struct hn_softc *);
264 static void			hn_init_locked(struct hn_softc *);
265 static int			hn_chan_attach(struct hn_softc *,
266 				    struct vmbus_channel *);
267 static void			hn_chan_detach(struct hn_softc *,
268 				    struct vmbus_channel *);
269 static int			hn_attach_subchans(struct hn_softc *);
270 static void			hn_detach_allchans(struct hn_softc *);
271 static void			hn_chan_rollup(struct hn_rx_ring *,
272 				    struct hn_tx_ring *);
273 static void			hn_set_ring_inuse(struct hn_softc *, int);
274 static int			hn_synth_attach(struct hn_softc *, int);
275 static void			hn_synth_detach(struct hn_softc *);
276 static int			hn_synth_alloc_subchans(struct hn_softc *,
277 				    int *);
278 static void			hn_suspend(struct hn_softc *);
279 static void			hn_suspend_data(struct hn_softc *);
280 static void			hn_suspend_mgmt(struct hn_softc *);
281 static void			hn_resume(struct hn_softc *);
282 static void			hn_resume_data(struct hn_softc *);
283 static void			hn_resume_mgmt(struct hn_softc *);
284 static void			hn_suspend_mgmt_taskfunc(void *, int);
285 static void			hn_chan_drain(struct vmbus_channel *);
286 
287 static void			hn_update_link_status(struct hn_softc *);
288 static void			hn_change_network(struct hn_softc *);
289 static void			hn_link_taskfunc(void *, int);
290 static void			hn_netchg_init_taskfunc(void *, int);
291 static void			hn_netchg_status_taskfunc(void *, int);
292 static void			hn_link_status(struct hn_softc *);
293 
294 static int			hn_create_rx_data(struct hn_softc *, int);
295 static void			hn_destroy_rx_data(struct hn_softc *);
296 static int			hn_check_iplen(const struct mbuf *, int);
297 static int			hn_set_rxfilter(struct hn_softc *);
298 static int			hn_rss_reconfig(struct hn_softc *);
299 static void			hn_rss_ind_fixup(struct hn_softc *, int);
300 static int			hn_rxpkt(struct hn_rx_ring *, const void *,
301 				    int, const struct hn_rxinfo *);
302 
303 static int			hn_tx_ring_create(struct hn_softc *, int);
304 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
305 static int			hn_create_tx_data(struct hn_softc *, int);
306 static void			hn_fixup_tx_data(struct hn_softc *);
307 static void			hn_destroy_tx_data(struct hn_softc *);
308 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
309 static int			hn_encap(struct hn_tx_ring *,
310 				    struct hn_txdesc *, struct mbuf **);
311 static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
312 				    struct hn_txdesc *);
313 static void			hn_set_chim_size(struct hn_softc *, int);
314 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
315 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
316 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
317 static void			hn_resume_tx(struct hn_softc *, int);
318 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
319 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
320 				    struct hn_softc *, struct vmbus_channel *,
321 				    const void *, int);
322 static int			hn_txpkt_sglist(struct hn_tx_ring *,
323 				    struct hn_txdesc *);
324 static int			hn_txpkt_chim(struct hn_tx_ring *,
325 				    struct hn_txdesc *);
326 static int			hn_xmit(struct hn_tx_ring *, int);
327 static void			hn_xmit_taskfunc(void *, int);
328 static void			hn_xmit_txeof(struct hn_tx_ring *);
329 static void			hn_xmit_txeof_taskfunc(void *, int);
330 #ifdef HN_IFSTART_SUPPORT
331 static int			hn_start_locked(struct hn_tx_ring *, int);
332 static void			hn_start_taskfunc(void *, int);
333 static void			hn_start_txeof(struct hn_tx_ring *);
334 static void			hn_start_txeof_taskfunc(void *, int);
335 #endif
336 
337 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
338     "Hyper-V network interface");
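/*
 * NOTE: the CTLFLAG_RDTUN knobs below are loader tunables, e.g. they
 * can be set from /boot/loader.conf ("hw.hn.trust_hosttcp=0").
 */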
339 
340 /* Trust tcp segment verification on host side. */
341 static int			hn_trust_hosttcp = 1;
342 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
343     &hn_trust_hosttcp, 0,
344     "Trust tcp segment verification on host side, "
345     "when csum info is missing (global setting)");
346 
347 /* Trust udp datagram verification on host side. */
348 static int			hn_trust_hostudp = 1;
349 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
350     &hn_trust_hostudp, 0,
351     "Trust udp datagram verification on host side, "
352     "when csum info is missing (global setting)");
353 
354 /* Trust ip packet verification on host side. */
355 static int			hn_trust_hostip = 1;
356 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
357     &hn_trust_hostip, 0,
358     "Trust ip packet verification on host side, "
359     "when csum info is missing (global setting)");
360 
361 /* Limit TSO burst size */
362 static int			hn_tso_maxlen = IP_MAXPACKET;
363 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
364     &hn_tso_maxlen, 0, "TSO burst limit");
365 
366 /* Limit chimney send size */
367 static int			hn_tx_chimney_size = 0;
368 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
369     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
370 
371 /* Limit the size of packet for direct transmission */
372 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
373 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
374     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
375 
376 /* # of LRO entries per RX ring */
377 #if defined(INET) || defined(INET6)
378 #if __FreeBSD_version >= 1100095
379 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
380 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
381     &hn_lro_entry_count, 0, "LRO entry count");
382 #endif
383 #endif
384 
385 /* Use shared TX taskqueue */
386 static int			hn_share_tx_taskq = 0;
387 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
388     &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
389 
390 #ifndef HN_USE_TXDESC_BUFRING
391 static int			hn_use_txdesc_bufring = 0;
392 #else
393 static int			hn_use_txdesc_bufring = 1;
394 #endif
395 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
396     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
397 
398 /* Bind TX taskqueue to the target CPU */
399 static int			hn_bind_tx_taskq = -1;
400 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
401     &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
402 
403 #ifdef HN_IFSTART_SUPPORT
404 /* Use ifnet.if_start instead of ifnet.if_transmit */
405 static int			hn_use_if_start = 0;
406 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
407     &hn_use_if_start, 0, "Use if_start TX method");
408 #endif
409 
410 /* # of channels to use */
411 static int			hn_chan_cnt = 0;
412 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
413     &hn_chan_cnt, 0,
414     "# of channels to use; each channel has one RX ring and one TX ring");
415 
416 /* # of transmit rings to use */
417 static int			hn_tx_ring_cnt = 0;
418 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
419     &hn_tx_ring_cnt, 0, "# of TX rings to use");
420 
421 /* Software TX ring depth */
422 static int			hn_tx_swq_depth = 0;
423 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
424     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
425 
426 /* Depth of the per-channel LRO mbuf queue; a non-zero value enables sorted LRO. */
427 #if __FreeBSD_version >= 1100095
428 static u_int			hn_lro_mbufq_depth = 0;
429 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
430     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
431 #endif
432 
433 static u_int			hn_cpu_index;	/* next CPU for channel */
434 static struct taskqueue		*hn_tx_taskq;	/* shared TX taskqueue */
435 
436 static const uint8_t
437 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
438 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
439 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
440 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
441 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
442 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
443 };
444 
445 static device_method_t hn_methods[] = {
446 	/* Device interface */
447 	DEVMETHOD(device_probe,		hn_probe),
448 	DEVMETHOD(device_attach,	hn_attach),
449 	DEVMETHOD(device_detach,	hn_detach),
450 	DEVMETHOD(device_shutdown,	hn_shutdown),
451 	DEVMETHOD_END
452 };
453 
454 static driver_t hn_driver = {
455 	"hn",
456 	hn_methods,
457 	sizeof(struct hn_softc)
458 };
459 
460 static devclass_t hn_devclass;
461 
462 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
463 MODULE_VERSION(hn, 1);
464 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
465 
466 #if __FreeBSD_version >= 1100099
467 static void
468 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
469 {
470 	int i;
471 
472 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
473 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
474 }
475 #endif
476 
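/*
 * Two ways to hand a packet to NVS: hn_txpkt_sglist() passes the RNDIS
 * message to the host as a guest physical address (GPA) list, while
 * hn_txpkt_chim() merely references data that hn_encap() has already
 * copied into a chimney sending buffer.
 */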
477 static int
478 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
479 {
480 
481 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
482 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
483 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
484 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
485 }
486 
487 static int
488 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
489 {
490 	struct hn_nvs_rndis rndis;
491 
492 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
493 	    txd->chim_size > 0, ("invalid rndis chim txd"));
494 
495 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
496 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
497 	rndis.nvs_chim_idx = txd->chim_index;
498 	rndis.nvs_chim_sz = txd->chim_size;
499 
500 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
501 	    &rndis, sizeof(rndis), &txd->send_ctx));
502 }
503 
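/*
 * Allocate a chimney sending buffer slot: scan the allocation bitmap
 * and atomically claim the first clear bit.  Returns
 * HN_NVS_CHIM_IDX_INVALID if all slots are currently in use.
 */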
504 static __inline uint32_t
505 hn_chim_alloc(struct hn_softc *sc)
506 {
507 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
508 	u_long *bmap = sc->hn_chim_bmap;
509 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
510 
511 	for (i = 0; i < bmap_cnt; ++i) {
512 		int idx;
513 
514 		idx = ffsl(~bmap[i]);
515 		if (idx == 0)
516 			continue;
517 
518 		--idx; /* ffsl is 1-based */
519 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
520 		    ("invalid i %d and idx %d", i, idx));
521 
522 		if (atomic_testandset_long(&bmap[i], idx))
523 			continue;
524 
525 		ret = i * LONG_BIT + idx;
526 		break;
527 	}
528 	return (ret);
529 }
530 
531 static __inline void
532 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
533 {
534 	u_long mask;
535 	uint32_t idx;
536 
537 	idx = chim_idx / LONG_BIT;
538 	KASSERT(idx < sc->hn_chim_bmap_cnt,
539 	    ("invalid chimney index 0x%x", chim_idx));
540 
541 	mask = 1UL << (chim_idx % LONG_BIT);
542 	KASSERT(sc->hn_chim_bmap[idx] & mask,
543 	    ("index bitmap 0x%lx, chimney index %u, "
544 	     "bitmap idx %d, bitmask 0x%lx",
545 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
546 
547 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
548 }
549 
550 #if defined(INET6) || defined(INET)
551 /*
552  * NOTE: If this function fails, the m_head will be freed.
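 *
 * Pull up the Ethernet/IP/TCP headers, clear ip_len (or ip6_plen) and
 * prime th_sum with the pseudo-header checksum, as the host's LSOv2
 * (TSO) handling expects.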
553  */
554 static __inline struct mbuf *
555 hn_tso_fixup(struct mbuf *m_head)
556 {
557 	struct ether_vlan_header *evl;
558 	struct tcphdr *th;
559 	int ehlen;
560 
561 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
562 
563 #define PULLUP_HDR(m, len)				\
564 do {							\
565 	if (__predict_false((m)->m_len < (len))) {	\
566 		(m) = m_pullup((m), (len));		\
567 		if ((m) == NULL)			\
568 			return (NULL);			\
569 	}						\
570 } while (0)
571 
572 	PULLUP_HDR(m_head, sizeof(*evl));
573 	evl = mtod(m_head, struct ether_vlan_header *);
574 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
575 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
576 	else
577 		ehlen = ETHER_HDR_LEN;
578 
579 #ifdef INET
580 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
581 		struct ip *ip;
582 		int iphlen;
583 
584 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
585 		ip = mtodo(m_head, ehlen);
586 		iphlen = ip->ip_hl << 2;
587 
588 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
589 		th = mtodo(m_head, ehlen + iphlen);
590 
591 		ip->ip_len = 0;
592 		ip->ip_sum = 0;
593 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
594 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
595 	}
596 #endif
597 #if defined(INET6) && defined(INET)
598 	else
599 #endif
600 #ifdef INET6
601 	{
602 		struct ip6_hdr *ip6;
603 
604 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
605 		ip6 = mtodo(m_head, ehlen);
606 		if (ip6->ip6_nxt != IPPROTO_TCP) {
607 			m_freem(m_head);
608 			return (NULL);
609 		}
610 
611 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
612 		th = mtodo(m_head, ehlen + sizeof(*ip6));
613 
614 		ip6->ip6_plen = 0;
615 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
616 	}
617 #endif
618 	return (m_head);
619 
620 #undef PULLUP_HDR
621 }
622 #endif	/* INET6 || INET */
623 
624 static int
625 hn_set_rxfilter(struct hn_softc *sc)
626 {
627 	struct ifnet *ifp = sc->hn_ifp;
628 	uint32_t filter;
629 	int error = 0;
630 
631 	HN_LOCK_ASSERT(sc);
632 
633 	if (ifp->if_flags & IFF_PROMISC) {
634 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
635 	} else {
636 		filter = NDIS_PACKET_TYPE_DIRECTED;
637 		if (ifp->if_flags & IFF_BROADCAST)
638 			filter |= NDIS_PACKET_TYPE_BROADCAST;
639 #ifdef notyet
640 		/*
641 		 * See the comment in SIOCADDMULTI/SIOCDELMULTI.
642 		 */
643 		/* TODO: support multicast list */
644 		if ((ifp->if_flags & IFF_ALLMULTI) ||
645 		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
646 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
647 #else
648 		/* Always enable ALLMULTI */
649 		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
650 #endif
651 	}
652 
653 	if (sc->hn_rx_filter != filter) {
654 		error = hn_rndis_set_rxfilter(sc, filter);
655 		if (!error)
656 			sc->hn_rx_filter = filter;
657 	}
658 	return (error);
659 }
660 
661 static int
662 hn_get_txswq_depth(const struct hn_tx_ring *txr)
663 {
664 
665 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
666 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
667 		return txr->hn_txdesc_cnt;
668 	return hn_tx_swq_depth;
669 }
670 
671 static int
672 hn_rss_reconfig(struct hn_softc *sc)
673 {
674 	int error;
675 
676 	HN_LOCK_ASSERT(sc);
677 
678 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
679 		return (ENXIO);
680 
681 	/*
682 	 * Disable RSS first.
683 	 *
684 	 * NOTE:
685 	 * Direct reconfiguration by setting the UNCHG flags does
686 	 * _not_ work properly.
687 	 */
688 	if (bootverbose)
689 		if_printf(sc->hn_ifp, "disable RSS\n");
690 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
691 	if (error) {
692 		if_printf(sc->hn_ifp, "RSS disable failed\n");
693 		return (error);
694 	}
695 
696 	/*
697 	 * Reenable the RSS w/ the updated RSS key or indirect
698 	 * table.
699 	 */
700 	if (bootverbose)
701 		if_printf(sc->hn_ifp, "reconfig RSS\n");
702 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
703 	if (error) {
704 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
705 		return (error);
706 	}
707 	return (0);
708 }
709 
710 static void
711 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
712 {
713 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
714 	int i;
715 
716 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
717 
718 	/*
719 	 * Check indirect table to make sure that all channels in it
720 	 * can be used.
721 	 */
722 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
723 		if (rss->rss_ind[i] >= nchan) {
724 			if_printf(sc->hn_ifp,
725 			    "RSS indirect table %d fixup: %u -> %d\n",
726 			    i, rss->rss_ind[i], nchan - 1);
727 			rss->rss_ind[i] = nchan - 1;
728 		}
729 	}
730 }
731 
732 static int
733 hn_ifmedia_upd(struct ifnet *ifp __unused)
734 {
735 
736 	return EOPNOTSUPP;
737 }
738 
739 static void
740 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
741 {
742 	struct hn_softc *sc = ifp->if_softc;
743 
744 	ifmr->ifm_status = IFM_AVALID;
745 	ifmr->ifm_active = IFM_ETHER;
746 
747 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
748 		ifmr->ifm_active |= IFM_NONE;
749 		return;
750 	}
751 	ifmr->ifm_status |= IFM_ACTIVE;
752 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
753 }
754 
755 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
756 static const struct hyperv_guid g_net_vsc_device_type = {
757 	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
758 		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
759 };
760 
761 static int
762 hn_probe(device_t dev)
763 {
764 
765 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
766 	    &g_net_vsc_device_type) == 0) {
767 		device_set_desc(dev, "Hyper-V Network Interface");
768 		return BUS_PROBE_DEFAULT;
769 	}
770 	return ENXIO;
771 }
772 
773 static int
774 hn_attach(device_t dev)
775 {
776 	struct hn_softc *sc = device_get_softc(dev);
777 	struct sysctl_oid_list *child;
778 	struct sysctl_ctx_list *ctx;
779 	uint8_t eaddr[ETHER_ADDR_LEN];
780 	struct ifnet *ifp = NULL;
781 	int error, ring_cnt, tx_ring_cnt;
782 
783 	sc->hn_dev = dev;
784 	sc->hn_prichan = vmbus_get_channel(dev);
785 	HN_LOCK_INIT(sc);
786 
787 	/*
788 	 * Setup taskqueue for transmission.
789 	 */
790 	if (hn_tx_taskq == NULL) {
791 		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
792 		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
793 		if (hn_bind_tx_taskq >= 0) {
794 			int cpu = hn_bind_tx_taskq;
795 			cpuset_t cpu_set;
796 
797 			if (cpu > mp_ncpus - 1)
798 				cpu = mp_ncpus - 1;
799 			CPU_SETOF(cpu, &cpu_set);
800 			taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1,
801 			    PI_NET, &cpu_set, "%s tx",
802 			    device_get_nameunit(dev));
803 		} else {
804 			taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET,
805 			    "%s tx", device_get_nameunit(dev));
806 		}
807 	} else {
808 		sc->hn_tx_taskq = hn_tx_taskq;
809 	}
810 
811 	/*
812 	 * Setup taskqueue for management tasks, e.g. link status.
813 	 */
814 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
815 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
816 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
817 	    device_get_nameunit(dev));
818 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
819 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
820 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
821 	    hn_netchg_status_taskfunc, sc);
822 
823 	/*
824 	 * Allocate ifnet and setup its name earlier, so that if_printf
825 	 * can be used by functions which will be called after
826 	 * ether_ifattach().
827 	 */
828 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
829 	ifp->if_softc = sc;
830 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
831 
832 	/*
833 	 * Initialize ifmedia earlier so that it can be unconditionally
834 	 * destroyed if an error happens later on.
835 	 */
836 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
837 
838 	/*
839 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
840 	 * to use (tx_ring_cnt).
841 	 *
842 	 * NOTE:
843 	 * The # of RX rings to use is the same as the # of channels to use.
844 	 */
845 	ring_cnt = hn_chan_cnt;
846 	if (ring_cnt <= 0) {
847 		/* Default */
848 		ring_cnt = mp_ncpus;
849 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
850 			ring_cnt = HN_RING_CNT_DEF_MAX;
851 	} else if (ring_cnt > mp_ncpus) {
852 		ring_cnt = mp_ncpus;
853 	}
854 
855 	tx_ring_cnt = hn_tx_ring_cnt;
856 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
857 		tx_ring_cnt = ring_cnt;
858 #ifdef HN_IFSTART_SUPPORT
859 	if (hn_use_if_start) {
860 		/* ifnet.if_start only needs one TX ring. */
861 		tx_ring_cnt = 1;
862 	}
863 #endif
864 
865 	/*
866 	 * Set the leader CPU for channels.
867 	 */
868 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
869 
870 	/*
871 	 * Create enough TX/RX rings, even if only limited number of
872 	 * channels can be allocated.
873 	 */
874 	error = hn_create_tx_data(sc, tx_ring_cnt);
875 	if (error)
876 		goto failed;
877 	error = hn_create_rx_data(sc, ring_cnt);
878 	if (error)
879 		goto failed;
880 
881 	/*
882 	 * Create transaction context for NVS and RNDIS transactions.
883 	 */
884 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
885 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
886 	if (sc->hn_xact == NULL)
887 		goto failed;
888 
889 	/*
890 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
891 	 */
892 	error = hn_synth_attach(sc, ETHERMTU);
893 	if (error)
894 		goto failed;
895 
896 	error = hn_rndis_get_eaddr(sc, eaddr);
897 	if (error)
898 		goto failed;
899 
900 #if __FreeBSD_version >= 1100099
901 	if (sc->hn_rx_ring_inuse > 1) {
902 		/*
903 		 * Reduce TCP segment aggregation limit for multiple
904 		 * RX rings to increase ACK timeliness.
905 		 */
906 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
907 	}
908 #endif
909 
910 	/*
911 	 * Fixup TX stuff after the synthetic parts are attached.
912 	 */
913 	hn_fixup_tx_data(sc);
914 
915 	ctx = device_get_sysctl_ctx(dev);
916 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
917 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
918 	    &sc->hn_nvs_ver, 0, "NVS version");
919 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
920 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
921 	    hn_ndis_version_sysctl, "A", "NDIS version");
922 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
923 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
924 	    hn_caps_sysctl, "A", "capabilities");
925 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
926 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
927 	    hn_hwassist_sysctl, "A", "hwassist");
928 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
929 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
930 	    hn_rxfilter_sysctl, "A", "rxfilter");
931 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
932 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
933 	    hn_rss_hash_sysctl, "A", "RSS hash");
934 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
935 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
936 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
937 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
938 	    hn_rss_key_sysctl, "IU", "RSS key");
939 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
940 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
941 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
942 
943 	/*
944 	 * Setup the ifmedia, which has been initialized earlier.
945 	 */
946 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
947 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
948 	/* XXX ifmedia_set really should do this for us */
949 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
950 
951 	/*
952 	 * Setup the ifnet for this interface.
953 	 */
954 
955 	ifp->if_baudrate = IF_Gbps(10);
956 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
957 	ifp->if_ioctl = hn_ioctl;
958 	ifp->if_init = hn_init;
959 #ifdef HN_IFSTART_SUPPORT
960 	if (hn_use_if_start) {
961 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
962 
963 		ifp->if_start = hn_start;
964 		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
965 		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
966 		IFQ_SET_READY(&ifp->if_snd);
967 	} else
968 #endif
969 	{
970 		ifp->if_transmit = hn_transmit;
971 		ifp->if_qflush = hn_xmit_qflush;
972 	}
973 
974 	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
975 #ifdef foo
976 	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
977 	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
978 #endif
979 	if (sc->hn_caps & HN_CAP_VLAN) {
980 		/* XXX not sure about VLAN_MTU. */
981 		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
982 	}
983 
984 	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
985 	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
986 		ifp->if_capabilities |= IFCAP_TXCSUM;
987 	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
988 		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
989 	if (sc->hn_caps & HN_CAP_TSO4) {
990 		ifp->if_capabilities |= IFCAP_TSO4;
991 		ifp->if_hwassist |= CSUM_IP_TSO;
992 	}
993 	if (sc->hn_caps & HN_CAP_TSO6) {
994 		ifp->if_capabilities |= IFCAP_TSO6;
995 		ifp->if_hwassist |= CSUM_IP6_TSO;
996 	}
997 
998 	/* Enable all available capabilities by default. */
999 	ifp->if_capenable = ifp->if_capabilities;
1000 
1001 	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1002 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1003 		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1004 		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1005 	}
1006 
1007 	ether_ifattach(ifp, eaddr);
1008 
1009 	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1010 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
1011 		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1012 	}
1013 
1014 	/* Inform the upper layer about the long frame support. */
1015 	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1016 
1017 	/*
1018 	 * Kick off link status check.
1019 	 */
1020 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1021 	hn_update_link_status(sc);
1022 
1023 	return (0);
1024 failed:
1025 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1026 		hn_synth_detach(sc);
1027 	hn_detach(dev);
1028 	return (error);
1029 }
1030 
1031 static int
1032 hn_detach(device_t dev)
1033 {
1034 	struct hn_softc *sc = device_get_softc(dev);
1035 	struct ifnet *ifp = sc->hn_ifp;
1036 
1037 	if (device_is_attached(dev)) {
1038 		HN_LOCK(sc);
1039 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1040 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1041 				hn_stop(sc);
1042 			/*
1043 			 * NOTE:
1044 			 * hn_stop() only suspends data, so the management
1045 			 * stuff has to be suspended manually here.
1046 			 */
1047 			hn_suspend_mgmt(sc);
1048 			hn_synth_detach(sc);
1049 		}
1050 		HN_UNLOCK(sc);
1051 		ether_ifdetach(ifp);
1052 	}
1053 
1054 	ifmedia_removeall(&sc->hn_media);
1055 	hn_destroy_rx_data(sc);
1056 	hn_destroy_tx_data(sc);
1057 
1058 	if (sc->hn_tx_taskq != hn_tx_taskq)
1059 		taskqueue_free(sc->hn_tx_taskq);
1060 	taskqueue_free(sc->hn_mgmt_taskq0);
1061 
1062 	if (sc->hn_xact != NULL)
1063 		vmbus_xact_ctx_destroy(sc->hn_xact);
1064 
1065 	if_free(ifp);
1066 
1067 	HN_LOCK_DESTROY(sc);
1068 	return (0);
1069 }
1070 
1071 static int
1072 hn_shutdown(device_t dev)
1073 {
1074 
1075 	return (0);
1076 }
1077 
1078 static void
1079 hn_link_status(struct hn_softc *sc)
1080 {
1081 	uint32_t link_status;
1082 	int error;
1083 
1084 	error = hn_rndis_get_linkstatus(sc, &link_status);
1085 	if (error) {
1086 		/* XXX what to do? */
1087 		return;
1088 	}
1089 
1090 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1091 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1092 	else
1093 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1094 	if_link_state_change(sc->hn_ifp,
1095 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1096 	    LINK_STATE_UP : LINK_STATE_DOWN);
1097 }
1098 
1099 static void
1100 hn_link_taskfunc(void *xsc, int pending __unused)
1101 {
1102 	struct hn_softc *sc = xsc;
1103 
1104 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1105 		return;
1106 	hn_link_status(sc);
1107 }
1108 
1109 static void
1110 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1111 {
1112 	struct hn_softc *sc = xsc;
1113 
1114 	/* Prevent any link status checks from running. */
1115 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1116 
1117 	/*
1118 	 * Fake up a [link down --> link up] state change; a 5 second
1119 	 * delay is used, which closely simulates the miibus reaction
1120 	 * to a link down event.
1121 	 */
1122 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1123 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1124 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1125 	    &sc->hn_netchg_status, 5 * hz);
1126 }
1127 
1128 static void
1129 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1130 {
1131 	struct hn_softc *sc = xsc;
1132 
1133 	/* Re-allow link status checks. */
1134 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1135 	hn_link_status(sc);
1136 }
1137 
1138 static void
1139 hn_update_link_status(struct hn_softc *sc)
1140 {
1141 
1142 	if (sc->hn_mgmt_taskq != NULL)
1143 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1144 }
1145 
1146 static void
1147 hn_change_network(struct hn_softc *sc)
1148 {
1149 
1150 	if (sc->hn_mgmt_taskq != NULL)
1151 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1152 }
1153 
1154 static __inline int
1155 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1156     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1157 {
1158 	struct mbuf *m = *m_head;
1159 	int error;
1160 
1161 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1162 
1163 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1164 	    m, segs, nsegs, BUS_DMA_NOWAIT);
1165 	if (error == EFBIG) {
1166 		struct mbuf *m_new;
1167 
1168 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1169 		if (m_new == NULL)
1170 			return ENOBUFS;
1171 		else
1172 			*m_head = m = m_new;
1173 		txr->hn_tx_collapsed++;
1174 
1175 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1176 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1177 	}
1178 	if (!error) {
1179 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1180 		    BUS_DMASYNC_PREWRITE);
1181 		txd->flags |= HN_TXD_FLAG_DMAMAP;
1182 	}
1183 	return error;
1184 }
1185 
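/*
 * Drop a reference on the TX descriptor.  Only when the last reference
 * goes away is the chimney slot or DMA map released, the mbuf freed,
 * and the descriptor returned to the free list (or buf_ring).
 */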
1186 static __inline int
1187 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1188 {
1189 
1190 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1191 	    ("put an onlist txd %#x", txd->flags));
1192 
1193 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1194 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1195 		return 0;
1196 
1197 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1198 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1199 		    ("chim txd uses dmamap"));
1200 		hn_chim_free(txr->hn_sc, txd->chim_index);
1201 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1202 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1203 		bus_dmamap_sync(txr->hn_tx_data_dtag,
1204 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1205 		bus_dmamap_unload(txr->hn_tx_data_dtag,
1206 		    txd->data_dmap);
1207 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1208 	}
1209 
1210 	if (txd->m != NULL) {
1211 		m_freem(txd->m);
1212 		txd->m = NULL;
1213 	}
1214 
1215 	txd->flags |= HN_TXD_FLAG_ONLIST;
1216 #ifndef HN_USE_TXDESC_BUFRING
1217 	mtx_lock_spin(&txr->hn_txlist_spin);
1218 	KASSERT(txr->hn_txdesc_avail >= 0 &&
1219 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1220 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1221 	txr->hn_txdesc_avail++;
1222 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1223 	mtx_unlock_spin(&txr->hn_txlist_spin);
1224 #else
1225 	atomic_add_int(&txr->hn_txdesc_avail, 1);
1226 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
1227 #endif
1228 
1229 	return 1;
1230 }
1231 
1232 static __inline struct hn_txdesc *
1233 hn_txdesc_get(struct hn_tx_ring *txr)
1234 {
1235 	struct hn_txdesc *txd;
1236 
1237 #ifndef HN_USE_TXDESC_BUFRING
1238 	mtx_lock_spin(&txr->hn_txlist_spin);
1239 	txd = SLIST_FIRST(&txr->hn_txlist);
1240 	if (txd != NULL) {
1241 		KASSERT(txr->hn_txdesc_avail > 0,
1242 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1243 		txr->hn_txdesc_avail--;
1244 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1245 	}
1246 	mtx_unlock_spin(&txr->hn_txlist_spin);
1247 #else
1248 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1249 #endif
1250 
1251 	if (txd != NULL) {
1252 #ifdef HN_USE_TXDESC_BUFRING
1253 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1254 #endif
1255 		KASSERT(txd->m == NULL && txd->refs == 0 &&
1256 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1257 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
1258 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1259 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
1260 		txd->refs = 1;
1261 	}
1262 	return txd;
1263 }
1264 
1265 static __inline void
1266 hn_txdesc_hold(struct hn_txdesc *txd)
1267 {
1268 
1269 	/* 0->1 transition will never work */
1270 	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
1271 	atomic_add_int(&txd->refs, 1);
1272 }
1273 
1274 static bool
1275 hn_tx_ring_pending(struct hn_tx_ring *txr)
1276 {
1277 	bool pending = false;
1278 
1279 #ifndef HN_USE_TXDESC_BUFRING
1280 	mtx_lock_spin(&txr->hn_txlist_spin);
1281 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1282 		pending = true;
1283 	mtx_unlock_spin(&txr->hn_txlist_spin);
1284 #else
1285 	if (!buf_ring_full(txr->hn_txdesc_br))
1286 		pending = true;
1287 #endif
1288 	return (pending);
1289 }
1290 
1291 static __inline void
1292 hn_txeof(struct hn_tx_ring *txr)
1293 {
1294 	txr->hn_has_txeof = 0;
1295 	txr->hn_txeof(txr);
1296 }
1297 
1298 static void
1299 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1300     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1301 {
1302 	struct hn_txdesc *txd = sndc->hn_cbarg;
1303 	struct hn_tx_ring *txr;
1304 
1305 	txr = txd->txr;
1306 	KASSERT(txr->hn_chan == chan,
1307 	    ("channel mismatch, on chan%u, should be chan%u",
1308 	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
1309 
1310 	txr->hn_has_txeof = 1;
1311 	hn_txdesc_put(txr, txd);
1312 
1313 	++txr->hn_txdone_cnt;
1314 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1315 		txr->hn_txdone_cnt = 0;
1316 		if (txr->hn_oactive)
1317 			hn_txeof(txr);
1318 	}
1319 }
1320 
1321 static void
1322 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1323 {
1324 #if defined(INET) || defined(INET6)
1325 	tcp_lro_flush_all(&rxr->hn_lro);
1326 #endif
1327 
1328 	/*
1329 	 * NOTE:
1330 	 * 'txr' could be NULL, if multiple channels and
1331 	 * ifnet.if_start method are enabled.
1332 	 */
1333 	if (txr == NULL || !txr->hn_has_txeof)
1334 		return;
1335 
1336 	txr->hn_txdone_cnt = 0;
1337 	hn_txeof(txr);
1338 }
1339 
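/*
 * RNDIS expects rm_dataoffset and rm_pktinfooffset to be counted from
 * the rm_dataoffset field rather than from the start of the message;
 * hn_encap() builds them relative to the message start and converts
 * them here just before transmission.
 */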
1340 static __inline uint32_t
1341 hn_rndis_pktmsg_offset(uint32_t ofs)
1342 {
1343 
1344 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1345 	    ("invalid RNDIS packet msg offset %u", ofs));
1346 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
1347 }
1348 
1349 static __inline void *
1350 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1351     size_t pi_dlen, uint32_t pi_type)
1352 {
1353 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1354 	struct rndis_pktinfo *pi;
1355 
1356 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1357 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1358 
1359 	/*
1360 	 * Per-packet-info does not move; it only grows.
1361 	 *
1362 	 * NOTE:
1363 	 * rm_pktinfooffset in this phase counts from the beginning
1364 	 * of rndis_packet_msg.
1365 	 */
1366 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1367 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
1368 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1369 	    pkt->rm_pktinfolen);
1370 	pkt->rm_pktinfolen += pi_size;
1371 
1372 	pi->rm_size = pi_size;
1373 	pi->rm_type = pi_type;
1374 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1375 
1376 	/* Data immediately follows the per-packet-info. */
1377 	pkt->rm_dataoffset += pi_size;
1378 
1379 	/* Update RNDIS packet msg length */
1380 	pkt->rm_len += pi_size;
1381 
1382 	return (pi->rm_data);
1383 }
1384 
1385 /*
1386  * NOTE:
1387  * If this function fails, then both txd and m_head0 will be freed.
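 *
 * Packets that fit (together with the RNDIS packet message) under the
 * per-ring chimney size limit are copied into a chimney sending buffer;
 * everything else is DMA-mapped and sent as a GPA scatter/gather list,
 * with the RNDIS packet message as the first GPA entry.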
1388  */
1389 static int
1390 hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
1391 {
1392 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1393 	int error, nsegs, i;
1394 	struct mbuf *m_head = *m_head0;
1395 	struct rndis_packet_msg *pkt;
1396 	uint32_t *pi_data;
1397 	void *chim = NULL;
1398 	int pktlen;
1399 
1400 	pkt = txd->rndis_pkt;
1401 	if (m_head->m_pkthdr.len + HN_RNDIS_PKT_LEN < txr->hn_chim_size) {
1402 		/*
1403 		 * This packet is small enough to fit into a chimney sending
1404 		 * buffer.  Try allocating one chimney sending buffer now.
1405 		 */
1406 		txr->hn_tx_chimney_tried++;
1407 		txd->chim_index = hn_chim_alloc(txr->hn_sc);
1408 		if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1409 			chim = txr->hn_sc->hn_chim +
1410 			    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1411 			/*
1412 			 * Directly fill the chimney sending buffer w/ the
1413 			 * RNDIS packet message.
1414 			 */
1415 			pkt = chim;
1416 		}
1417 	}
1418 
1419 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1420 	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1421 	pkt->rm_dataoffset = sizeof(*pkt);
1422 	pkt->rm_datalen = m_head->m_pkthdr.len;
1423 	pkt->rm_pktinfooffset = sizeof(*pkt);
1424 	pkt->rm_pktinfolen = 0;
1425 
1426 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1427 		/*
1428 		 * Set the hash value for this packet, so that the host can
1429 		 * dispatch the TX done event for this packet back to this TX
1430 		 * ring's channel.
1431 		 */
1432 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1433 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1434 		*pi_data = txr->hn_tx_idx;
1435 	}
1436 
1437 	if (m_head->m_flags & M_VLANTAG) {
1438 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1439 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1440 		*pi_data = NDIS_VLAN_INFO_MAKE(
1441 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1442 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1443 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1444 	}
1445 
1446 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1447 #if defined(INET6) || defined(INET)
1448 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1449 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1450 #ifdef INET
1451 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1452 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1453 			    m_head->m_pkthdr.tso_segsz);
1454 		}
1455 #endif
1456 #if defined(INET6) && defined(INET)
1457 		else
1458 #endif
1459 #ifdef INET6
1460 		{
1461 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1462 			    m_head->m_pkthdr.tso_segsz);
1463 		}
1464 #endif
1465 #endif	/* INET6 || INET */
1466 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1467 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1468 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1469 		if (m_head->m_pkthdr.csum_flags &
1470 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1471 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
1472 		} else {
1473 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
1474 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1475 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
1476 		}
1477 
1478 		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1479 			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1480 		else if (m_head->m_pkthdr.csum_flags &
1481 		    (CSUM_IP_UDP | CSUM_IP6_UDP))
1482 			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1483 	}
1484 
1485 	pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1486 	/* Convert RNDIS packet message offsets */
1487 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1488 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1489 
1490 	/*
1491 	 * Fast path: Chimney sending.
1492 	 */
1493 	if (chim != NULL) {
1494 		KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1495 		    ("chimney buffer is not used"));
1496 		KASSERT(pkt == chim, ("RNDIS pkt not in chimney buffer"));
1497 
1498 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
1499 		    ((uint8_t *)chim) + pktlen);
1500 
1501 		txd->chim_size = pkt->rm_len;
1502 		txr->hn_gpa_cnt = 0;
1503 		txr->hn_tx_chimney++;
1504 		txr->hn_sendpkt = hn_txpkt_chim;
1505 		goto done;
1506 	}
1507 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1508 	    ("chimney buffer is used"));
1509 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1510 
1511 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1512 	if (error) {
1513 		int freed;
1514 
1515 		/*
1516 		 * This mbuf is not linked w/ the txd yet, so free it now.
1517 		 */
1518 		m_freem(m_head);
1519 		*m_head0 = NULL;
1520 
1521 		freed = hn_txdesc_put(txr, txd);
1522 		KASSERT(freed != 0,
1523 		    ("fail to free txd upon txdma error"));
1524 
1525 		txr->hn_txdma_failed++;
1526 		if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
1527 		return error;
1528 	}
1529 	*m_head0 = m_head;
1530 
1531 	/* +1 RNDIS packet message */
1532 	txr->hn_gpa_cnt = nsegs + 1;
1533 
1534 	/* send packet with page buffer */
1535 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1536 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1537 	txr->hn_gpa[0].gpa_len = pktlen;
1538 
1539 	/*
1540 	 * Fill the page buffers with mbuf info after the page
1541 	 * buffer for RNDIS packet message.
1542 	 */
1543 	for (i = 0; i < nsegs; ++i) {
1544 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1545 
1546 		gpa->gpa_page = atop(segs[i].ds_addr);
1547 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1548 		gpa->gpa_len = segs[i].ds_len;
1549 	}
1550 
1551 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1552 	txd->chim_size = 0;
1553 	txr->hn_sendpkt = hn_txpkt_sglist;
1554 done:
1555 	txd->m = m_head;
1556 
1557 	/* Set the completion routine */
1558 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1559 
1560 	return 0;
1561 }
1562 
1563 /*
1564  * NOTE:
1565  * If this function fails, then txd will be freed, but the mbuf
1566  * associated w/ the txd will _not_ be freed.
1567  */
1568 static int
1569 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1570 {
1571 	int error, send_failed = 0;
1572 
1573 again:
1574 	/*
1575 	 * Make sure that txd is not freed before ETHER_BPF_MTAP.
1576 	 */
1577 	hn_txdesc_hold(txd);
1578 	error = txr->hn_sendpkt(txr, txd);
1579 	if (!error) {
1580 		ETHER_BPF_MTAP(ifp, txd->m);
1581 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1582 #ifdef HN_IFSTART_SUPPORT
1583 		if (!hn_use_if_start)
1584 #endif
1585 		{
1586 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
1587 			    txd->m->m_pkthdr.len);
1588 			if (txd->m->m_flags & M_MCAST)
1589 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
1590 		}
1591 		txr->hn_pkts++;
1592 	}
1593 	hn_txdesc_put(txr, txd);
1594 
1595 	if (__predict_false(error)) {
1596 		int freed;
1597 
1598 		/*
1599 		 * This should "really rarely" happen.
1600 		 *
1601 		 * XXX Too many RX to be acked or too many sideband
1602 		 * commands to run?  Ask netvsc_channel_rollup()
1603 		 * to kick start later.
1604 		 */
1605 		txr->hn_has_txeof = 1;
1606 		if (!send_failed) {
1607 			txr->hn_send_failed++;
1608 			send_failed = 1;
1609 			/*
1610 			 * Try sending again after setting hn_has_txeof,
1611 			 * in case we missed the last
1612 			 * netvsc_channel_rollup().
1613 			 */
1614 			goto again;
1615 		}
1616 		if_printf(ifp, "send failed\n");
1617 
1618 		/*
1619 		 * Caller will perform further processing on the
1620 		 * associated mbuf, so don't free it in hn_txdesc_put();
1621 		 * only unload it from the DMA map in hn_txdesc_put(),
1622 		 * if it was loaded.
1623 		 */
1624 		txd->m = NULL;
1625 		freed = hn_txdesc_put(txr, txd);
1626 		KASSERT(freed != 0,
1627 		    ("fail to free txd upon send error"));
1628 
1629 		txr->hn_send_failed++;
1630 	}
1631 	return error;
1632 }
1633 
1634 /*
1635  * Append the specified data to the indicated mbuf chain.
1636  * Extend the mbuf chain if the new data does not fit in
1637  * existing space.
1638  *
1639  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1640  * There should be an equivalent in the kernel mbuf code,
1641  * but there does not appear to be one yet.
1642  *
1643  * Differs from m_append() in that additional mbufs are
1644  * allocated with cluster size MJUMPAGESIZE, and filled
1645  * accordingly.
1646  *
1647  * Return 1 if able to complete the job; otherwise 0.
1648  */
1649 static int
1650 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1651 {
1652 	struct mbuf *m, *n;
1653 	int remainder, space;
1654 
1655 	for (m = m0; m->m_next != NULL; m = m->m_next)
1656 		;
1657 	remainder = len;
1658 	space = M_TRAILINGSPACE(m);
1659 	if (space > 0) {
1660 		/*
1661 		 * Copy into available space.
1662 		 */
1663 		if (space > remainder)
1664 			space = remainder;
1665 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1666 		m->m_len += space;
1667 		cp += space;
1668 		remainder -= space;
1669 	}
1670 	while (remainder > 0) {
1671 		/*
1672 		 * Allocate a new mbuf with a page-sized (MJUMPAGESIZE)
1673 		 * cluster and fill it.
1674 		 */
1675 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
1676 		if (n == NULL)
1677 			break;
1678 		n->m_len = min(MJUMPAGESIZE, remainder);
1679 		bcopy(cp, mtod(n, caddr_t), n->m_len);
1680 		cp += n->m_len;
1681 		remainder -= n->m_len;
1682 		m->m_next = n;
1683 		m = n;
1684 	}
1685 	if (m0->m_flags & M_PKTHDR)
1686 		m0->m_pkthdr.len += len - remainder;
1687 
1688 	return (remainder == 0);
1689 }
1690 
1691 #if defined(INET) || defined(INET6)
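/*
 * If hn_lro_mbufq_depth is non-zero, received mbufs are queued for
 * sorted LRO and later flushed by tcp_lro_flush_all() from
 * hn_chan_rollup(); otherwise plain tcp_lro_rx() is used.
 */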
1692 static __inline int
1693 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
1694 {
1695 #if __FreeBSD_version >= 1100095
1696 	if (hn_lro_mbufq_depth) {
1697 		tcp_lro_queue_mbuf(lc, m);
1698 		return 0;
1699 	}
1700 #endif
1701 	return tcp_lro_rx(lc, m, 0);
1702 }
1703 #endif
1704 
1705 static int
1706 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
1707     const struct hn_rxinfo *info)
1708 {
1709 	struct ifnet *ifp = rxr->hn_ifp;
1710 	struct mbuf *m_new;
1711 	int size, do_lro = 0, do_csum = 1;
1712 	int hash_type;
1713 
1714 	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1715 		return (0);
1716 
1717 	/*
1718 	 * Bail out if packet contains more data than configured MTU.
1719 	 */
1720 	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
1721 		return (0);
1722 	} else if (dlen <= MHLEN) {
1723 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
1724 		if (m_new == NULL) {
1725 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1726 			return (0);
1727 		}
1728 		memcpy(mtod(m_new, void *), data, dlen);
1729 		m_new->m_pkthdr.len = m_new->m_len = dlen;
1730 		rxr->hn_small_pkts++;
1731 	} else {
1732 		/*
1733 		 * Get an mbuf with a cluster.  For packets 2K or less,
1734 		 * get a standard 2K cluster.  For anything larger, get a
1735 		 * 4K cluster.  Any buffers larger than 4K can cause problems
1736 		 * if looped around to the Hyper-V TX channel, so avoid them.
1737 		 */
1738 		size = MCLBYTES;
1739 		if (dlen > MCLBYTES) {
1740 			/* 4096 */
1741 			size = MJUMPAGESIZE;
1742 		}
1743 
1744 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1745 		if (m_new == NULL) {
1746 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1747 			return (0);
1748 		}
1749 
1750 		hv_m_append(m_new, dlen, data);
1751 	}
1752 	m_new->m_pkthdr.rcvif = ifp;
1753 
1754 	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
1755 		do_csum = 0;
1756 
1757 	/* receive side checksum offload */
1758 	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
1759 		/* IP csum offload */
1760 		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
1761 			m_new->m_pkthdr.csum_flags |=
1762 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
1763 			rxr->hn_csum_ip++;
1764 		}
1765 
1766 		/* TCP/UDP csum offload */
1767 		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
1768 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
1769 			m_new->m_pkthdr.csum_flags |=
1770 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1771 			m_new->m_pkthdr.csum_data = 0xffff;
1772 			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
1773 				rxr->hn_csum_tcp++;
1774 			else
1775 				rxr->hn_csum_udp++;
1776 		}
1777 
1778 		/*
1779 		 * XXX
1780 	 * As of this writing (Oct 28th, 2016), the host side turns
1781 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
1782 		 * the do_lro setting here is actually _not_ accurate.  We
1783 		 * depend on the RSS hash type check to reset do_lro.
1784 		 */
1785 		if ((info->csum_info &
1786 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
1787 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
1788 			do_lro = 1;
1789 	} else {
1790 		const struct ether_header *eh;
1791 		uint16_t etype;
1792 		int hoff;
1793 
1794 		hoff = sizeof(*eh);
1795 		if (m_new->m_len < hoff)
1796 			goto skip;
1797 		eh = mtod(m_new, struct ether_header *);
1798 		etype = ntohs(eh->ether_type);
1799 		if (etype == ETHERTYPE_VLAN) {
1800 			const struct ether_vlan_header *evl;
1801 
1802 			hoff = sizeof(*evl);
1803 			if (m_new->m_len < hoff)
1804 				goto skip;
1805 			evl = mtod(m_new, struct ether_vlan_header *);
1806 			etype = ntohs(evl->evl_proto);
1807 		}
1808 
1809 		if (etype == ETHERTYPE_IP) {
1810 			int pr;
1811 
1812 			pr = hn_check_iplen(m_new, hoff);
1813 			if (pr == IPPROTO_TCP) {
1814 				if (do_csum &&
1815 				    (rxr->hn_trust_hcsum &
1816 				     HN_TRUST_HCSUM_TCP)) {
1817 					rxr->hn_csum_trusted++;
1818 					m_new->m_pkthdr.csum_flags |=
1819 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
1820 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1821 					m_new->m_pkthdr.csum_data = 0xffff;
1822 				}
1823 				do_lro = 1;
1824 			} else if (pr == IPPROTO_UDP) {
1825 				if (do_csum &&
1826 				    (rxr->hn_trust_hcsum &
1827 				     HN_TRUST_HCSUM_UDP)) {
1828 					rxr->hn_csum_trusted++;
1829 					m_new->m_pkthdr.csum_flags |=
1830 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
1831 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1832 					m_new->m_pkthdr.csum_data = 0xffff;
1833 				}
1834 			} else if (pr != IPPROTO_DONE && do_csum &&
1835 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
1836 				rxr->hn_csum_trusted++;
1837 				m_new->m_pkthdr.csum_flags |=
1838 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
1839 			}
1840 		}
1841 	}
1842 skip:
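	/*
	 * The VLAN tag, if any, is delivered via the NDIS per-packet
	 * info; rebuild the mbuf's ether_vtag from it.
	 */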
1843 	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
1844 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
1845 		    NDIS_VLAN_INFO_ID(info->vlan_info),
1846 		    NDIS_VLAN_INFO_PRI(info->vlan_info),
1847 		    NDIS_VLAN_INFO_CFI(info->vlan_info));
1848 		m_new->m_flags |= M_VLANTAG;
1849 	}
1850 
1851 	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
1852 		rxr->hn_rss_pkts++;
1853 		m_new->m_pkthdr.flowid = info->hash_value;
1854 		hash_type = M_HASHTYPE_OPAQUE_HASH;
1855 		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
1856 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
1857 			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
1858 
1859 			/*
1860 			 * NOTE:
1861 			 * do_lro is reset if the hash type is not TCP
1862 			 * related.  See the comment in the above csum_flags
1863 			 * setup section.
1864 			 */
1865 			switch (type) {
1866 			case NDIS_HASH_IPV4:
1867 				hash_type = M_HASHTYPE_RSS_IPV4;
1868 				do_lro = 0;
1869 				break;
1870 
1871 			case NDIS_HASH_TCP_IPV4:
1872 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
1873 				break;
1874 
1875 			case NDIS_HASH_IPV6:
1876 				hash_type = M_HASHTYPE_RSS_IPV6;
1877 				do_lro = 0;
1878 				break;
1879 
1880 			case NDIS_HASH_IPV6_EX:
1881 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
1882 				do_lro = 0;
1883 				break;
1884 
1885 			case NDIS_HASH_TCP_IPV6:
1886 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
1887 				break;
1888 
1889 			case NDIS_HASH_TCP_IPV6_EX:
1890 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
1891 				break;
1892 			}
1893 		}
1894 	} else {
1895 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
1896 		hash_type = M_HASHTYPE_OPAQUE;
1897 	}
1898 	M_HASHTYPE_SET(m_new, hash_type);
1899 
1900 	/*
1901 	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
1902 	 * messages (not just data messages) will trigger a response.
1903 	 */
1904 
1905 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
1906 	rxr->hn_pkts++;
1907 
1908 	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1909 #if defined(INET) || defined(INET6)
1910 		struct lro_ctrl *lro = &rxr->hn_lro;
1911 
1912 		if (lro->lro_cnt) {
1913 			rxr->hn_lro_tried++;
1914 			if (hn_lro_rx(lro, m_new) == 0) {
1915 				/* DONE! */
1916 				return 0;
1917 			}
1918 		}
1919 #endif
1920 	}
1921 
1922 	/* We're not holding the lock here, so don't release it */
1923 	(*ifp->if_input)(ifp, m_new);
1924 
1925 	return (0);
1926 }
1927 
1928 static int
1929 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1930 {
1931 	struct hn_softc *sc = ifp->if_softc;
1932 	struct ifreq *ifr = (struct ifreq *)data;
1933 	int mask, error = 0;
1934 
1935 	switch (cmd) {
1936 	case SIOCSIFMTU:
1937 		if (ifr->ifr_mtu > HN_MTU_MAX) {
1938 			error = EINVAL;
1939 			break;
1940 		}
1941 
1942 		HN_LOCK(sc);
1943 
1944 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
1945 			HN_UNLOCK(sc);
1946 			break;
1947 		}
1948 
1949 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
1950 			/* Can't change MTU */
1951 			HN_UNLOCK(sc);
1952 			error = EOPNOTSUPP;
1953 			break;
1954 		}
1955 
1956 		if (ifp->if_mtu == ifr->ifr_mtu) {
1957 			HN_UNLOCK(sc);
1958 			break;
1959 		}
1960 
1961 		/*
1962 		 * Suspend this interface before the synthetic parts
1963 		 * are detached.
1964 		 */
1965 		hn_suspend(sc);
1966 
1967 		/*
1968 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
1969 		 */
1970 		hn_synth_detach(sc);
1971 
1972 		/*
1973 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
1974 		 * with the new MTU setting.
1975 		 */
1976 		error = hn_synth_attach(sc, ifr->ifr_mtu);
1977 		if (error) {
1978 			HN_UNLOCK(sc);
1979 			break;
1980 		}
1981 
1982 		/*
1983 		 * Commit the requested MTU, after the synthetic parts
1984 		 * have been successfully attached.
1985 		 */
1986 		ifp->if_mtu = ifr->ifr_mtu;
1987 
1988 		/*
1989 		 * Make sure that various parameters based on MTU are
1990 		 * still valid, after the MTU change.
1991 		 */
1992 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
1993 			hn_set_chim_size(sc, sc->hn_chim_szmax);
1994 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1995 #if __FreeBSD_version >= 1100099
1996 		if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
1997 		    HN_LRO_LENLIM_MIN(ifp))
1998 			hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1999 #endif
2000 
2001 		/*
2002 		 * All done!  Resume the interface now.
2003 		 */
2004 		hn_resume(sc);
2005 
2006 		HN_UNLOCK(sc);
2007 		break;
2008 
2009 	case SIOCSIFFLAGS:
2010 		HN_LOCK(sc);
2011 
2012 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2013 			HN_UNLOCK(sc);
2014 			break;
2015 		}
2016 
2017 		if (ifp->if_flags & IFF_UP) {
2018 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2019 				hn_set_rxfilter(sc);
2020 			else
2021 				hn_init_locked(sc);
2022 		} else {
2023 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2024 				hn_stop(sc);
2025 		}
2026 		sc->hn_if_flags = ifp->if_flags;
2027 
2028 		HN_UNLOCK(sc);
2029 		break;
2030 
2031 	case SIOCSIFCAP:
2032 		HN_LOCK(sc);
2033 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2034 
2035 		if (mask & IFCAP_TXCSUM) {
2036 			ifp->if_capenable ^= IFCAP_TXCSUM;
2037 			if (ifp->if_capenable & IFCAP_TXCSUM)
2038 				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2039 			else
2040 				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2041 		}
2042 		if (mask & IFCAP_TXCSUM_IPV6) {
2043 			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2044 			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2045 				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2046 			else
2047 				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2048 		}
2049 
2050 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
2051 		if (mask & IFCAP_RXCSUM)
2052 			ifp->if_capenable ^= IFCAP_RXCSUM;
2053 #ifdef foo
2054 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2055 		if (mask & IFCAP_RXCSUM_IPV6)
2056 			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2057 #endif
2058 
2059 		if (mask & IFCAP_LRO)
2060 			ifp->if_capenable ^= IFCAP_LRO;
2061 
2062 		if (mask & IFCAP_TSO4) {
2063 			ifp->if_capenable ^= IFCAP_TSO4;
2064 			if (ifp->if_capenable & IFCAP_TSO4)
2065 				ifp->if_hwassist |= CSUM_IP_TSO;
2066 			else
2067 				ifp->if_hwassist &= ~CSUM_IP_TSO;
2068 		}
2069 		if (mask & IFCAP_TSO6) {
2070 			ifp->if_capenable ^= IFCAP_TSO6;
2071 			if (ifp->if_capenable & IFCAP_TSO6)
2072 				ifp->if_hwassist |= CSUM_IP6_TSO;
2073 			else
2074 				ifp->if_hwassist &= ~CSUM_IP6_TSO;
2075 		}
2076 
2077 		HN_UNLOCK(sc);
2078 		break;
2079 
2080 	case SIOCADDMULTI:
2081 	case SIOCDELMULTI:
2082 #ifdef notyet
2083 		/*
2084 		 * XXX
2085 		 * Multicast configuration uses a mutex, while RNDIS RX
2086 		 * filter setting sleeps.  We work around this by always enabling
2087 		 * ALLMULTI.  ALLMULTI would actually always be on, even
2088 		 * if we supported the SIOCADDMULTI/SIOCDELMULTI, since
2089 		 * we don't support multicast address list configuration
2090 		 * for this driver.
2091 		 */
2092 		HN_LOCK(sc);
2093 
2094 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2095 			HN_UNLOCK(sc);
2096 			break;
2097 		}
2098 		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2099 			hn_set_rxfilter(sc);
2100 
2101 		HN_UNLOCK(sc);
2102 #endif
2103 		break;
2104 
2105 	case SIOCSIFMEDIA:
2106 	case SIOCGIFMEDIA:
2107 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2108 		break;
2109 
2110 	default:
2111 		error = ether_ioctl(ifp, cmd, data);
2112 		break;
2113 	}
2114 	return (error);
2115 }
2116 
2117 static void
2118 hn_stop(struct hn_softc *sc)
2119 {
2120 	struct ifnet *ifp = sc->hn_ifp;
2121 	int i;
2122 
2123 	HN_LOCK_ASSERT(sc);
2124 
2125 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2126 	    ("synthetic parts were not attached"));
2127 
2128 	/* Clear RUNNING bit _before_ hn_suspend_data() */
2129 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2130 	hn_suspend_data(sc);
2131 
2132 	/* Clear OACTIVE bit. */
2133 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2134 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2135 		sc->hn_tx_ring[i].hn_oactive = 0;
2136 }
2137 
2138 static void
2139 hn_init_locked(struct hn_softc *sc)
2140 {
2141 	struct ifnet *ifp = sc->hn_ifp;
2142 	int i;
2143 
2144 	HN_LOCK_ASSERT(sc);
2145 
2146 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2147 		return;
2148 
2149 	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2150 		return;
2151 
2152 	/* Configure RX filter */
2153 	hn_set_rxfilter(sc);
2154 
2155 	/* Clear OACTIVE bit. */
2156 	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2157 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2158 		sc->hn_tx_ring[i].hn_oactive = 0;
2159 
2160 	/* Clear TX 'suspended' bit. */
2161 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2162 
2163 	/* Everything is ready; unleash! */
2164 	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2165 }
2166 
2167 static void
2168 hn_init(void *xsc)
2169 {
2170 	struct hn_softc *sc = xsc;
2171 
2172 	HN_LOCK(sc);
2173 	hn_init_locked(sc);
2174 	HN_UNLOCK(sc);
2175 }
2176 
2177 #if __FreeBSD_version >= 1100099
2178 
2179 static int
2180 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2181 {
2182 	struct hn_softc *sc = arg1;
2183 	unsigned int lenlim;
2184 	int error;
2185 
2186 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2187 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
2188 	if (error || req->newptr == NULL)
2189 		return error;
2190 
2191 	HN_LOCK(sc);
2192 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2193 	    lenlim > TCP_LRO_LENGTH_MAX) {
2194 		HN_UNLOCK(sc);
2195 		return EINVAL;
2196 	}
2197 	hn_set_lro_lenlim(sc, lenlim);
2198 	HN_UNLOCK(sc);
2199 
2200 	return 0;
2201 }
2202 
2203 static int
2204 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2205 {
2206 	struct hn_softc *sc = arg1;
2207 	int ackcnt, error, i;
2208 
2209 	/*
2210 	 * lro_ackcnt_lim is the append count limit;
2211 	 * +1 turns it into the aggregation limit.
2212 	 */
2213 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2214 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2215 	if (error || req->newptr == NULL)
2216 		return error;
2217 
2218 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2219 		return EINVAL;
2220 
2221 	/*
2222 	 * Convert aggregation limit back to append
2223 	 * count limit.
2224 	 */
2225 	--ackcnt;
2226 	HN_LOCK(sc);
2227 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2228 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2229 	HN_UNLOCK(sc);
2230 	return 0;
2231 }
2232 
2233 #endif
2234 
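/*
 * Expose one HN_TRUST_HCSUM_* bit (passed as arg2) as a boolean sysctl;
 * toggling it sets or clears that bit on every RX ring in use.
 */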
2235 static int
2236 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2237 {
2238 	struct hn_softc *sc = arg1;
2239 	int hcsum = arg2;
2240 	int on, error, i;
2241 
2242 	on = 0;
2243 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2244 		on = 1;
2245 
2246 	error = sysctl_handle_int(oidp, &on, 0, req);
2247 	if (error || req->newptr == NULL)
2248 		return error;
2249 
2250 	HN_LOCK(sc);
2251 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2252 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2253 
2254 		if (on)
2255 			rxr->hn_trust_hcsum |= hcsum;
2256 		else
2257 			rxr->hn_trust_hcsum &= ~hcsum;
2258 	}
2259 	HN_UNLOCK(sc);
2260 	return 0;
2261 }
2262 
2263 static int
2264 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2265 {
2266 	struct hn_softc *sc = arg1;
2267 	int chim_size, error;
2268 
2269 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2270 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2271 	if (error || req->newptr == NULL)
2272 		return error;
2273 
2274 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2275 		return EINVAL;
2276 
2277 	HN_LOCK(sc);
2278 	hn_set_chim_size(sc, chim_size);
2279 	HN_UNLOCK(sc);
2280 	return 0;
2281 }
2282 
2283 #if __FreeBSD_version < 1100095
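/*
 * Older FreeBSD keeps the LRO counters as plain ints; sum the per-ring
 * values and export the total as a 64-bit stat.  Any write through the
 * sysctl zeroes the per-ring counters.
 */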
2284 static int
2285 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2286 {
2287 	struct hn_softc *sc = arg1;
2288 	int ofs = arg2, i, error;
2289 	struct hn_rx_ring *rxr;
2290 	uint64_t stat;
2291 
2292 	stat = 0;
2293 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2294 		rxr = &sc->hn_rx_ring[i];
2295 		stat += *((int *)((uint8_t *)rxr + ofs));
2296 	}
2297 
2298 	error = sysctl_handle_64(oidp, &stat, 0, req);
2299 	if (error || req->newptr == NULL)
2300 		return error;
2301 
2302 	/* Zero out this stat. */
2303 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2304 		rxr = &sc->hn_rx_ring[i];
2305 		*((int *)((uint8_t *)rxr + ofs)) = 0;
2306 	}
2307 	return 0;
2308 }
2309 #else
2310 static int
2311 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2312 {
2313 	struct hn_softc *sc = arg1;
2314 	int ofs = arg2, i, error;
2315 	struct hn_rx_ring *rxr;
2316 	uint64_t stat;
2317 
2318 	stat = 0;
2319 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2320 		rxr = &sc->hn_rx_ring[i];
2321 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2322 	}
2323 
2324 	error = sysctl_handle_64(oidp, &stat, 0, req);
2325 	if (error || req->newptr == NULL)
2326 		return error;
2327 
2328 	/* Zero out this stat. */
2329 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2330 		rxr = &sc->hn_rx_ring[i];
2331 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2332 	}
2333 	return 0;
2334 }
2335 
2336 #endif
2337 
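/*
 * Aggregate a per-RX-ring u_long statistic, located at byte offset arg2
 * within struct hn_rx_ring, across all RX rings in use.  A write through
 * the sysctl resets the per-ring counters.
 */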
2338 static int
2339 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2340 {
2341 	struct hn_softc *sc = arg1;
2342 	int ofs = arg2, i, error;
2343 	struct hn_rx_ring *rxr;
2344 	u_long stat;
2345 
2346 	stat = 0;
2347 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2348 		rxr = &sc->hn_rx_ring[i];
2349 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2350 	}
2351 
2352 	error = sysctl_handle_long(oidp, &stat, 0, req);
2353 	if (error || req->newptr == NULL)
2354 		return error;
2355 
2356 	/* Zero out this stat. */
2357 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2358 		rxr = &sc->hn_rx_ring[i];
2359 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
2360 	}
2361 	return 0;
2362 }
2363 
2364 static int
2365 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2366 {
2367 	struct hn_softc *sc = arg1;
2368 	int ofs = arg2, i, error;
2369 	struct hn_tx_ring *txr;
2370 	u_long stat;
2371 
2372 	stat = 0;
2373 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2374 		txr = &sc->hn_tx_ring[i];
2375 		stat += *((u_long *)((uint8_t *)txr + ofs));
2376 	}
2377 
2378 	error = sysctl_handle_long(oidp, &stat, 0, req);
2379 	if (error || req->newptr == NULL)
2380 		return error;
2381 
2382 	/* Zero out this stat. */
2383 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2384 		txr = &sc->hn_tx_ring[i];
2385 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
2386 	}
2387 	return 0;
2388 }
2389 
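/*
 * Read an int TX ring parameter (at byte offset arg2 within struct
 * hn_tx_ring) from the first ring; a write propagates the new value to
 * all TX rings in use.
 */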
2390 static int
2391 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2392 {
2393 	struct hn_softc *sc = arg1;
2394 	int ofs = arg2, i, error, conf;
2395 	struct hn_tx_ring *txr;
2396 
2397 	txr = &sc->hn_tx_ring[0];
2398 	conf = *((int *)((uint8_t *)txr + ofs));
2399 
2400 	error = sysctl_handle_int(oidp, &conf, 0, req);
2401 	if (error || req->newptr == NULL)
2402 		return error;
2403 
2404 	HN_LOCK(sc);
2405 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2406 		txr = &sc->hn_tx_ring[i];
2407 		*((int *)((uint8_t *)txr + ofs)) = conf;
2408 	}
2409 	HN_UNLOCK(sc);
2410 
2411 	return 0;
2412 }
2413 
2414 static int
2415 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2416 {
2417 	struct hn_softc *sc = arg1;
2418 	char verstr[16];
2419 
2420 	snprintf(verstr, sizeof(verstr), "%u.%u",
2421 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2422 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2423 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2424 }
2425 
2426 static int
2427 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2428 {
2429 	struct hn_softc *sc = arg1;
2430 	char caps_str[128];
2431 	uint32_t caps;
2432 
2433 	HN_LOCK(sc);
2434 	caps = sc->hn_caps;
2435 	HN_UNLOCK(sc);
2436 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2437 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2438 }
2439 
2440 static int
2441 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2442 {
2443 	struct hn_softc *sc = arg1;
2444 	char assist_str[128];
2445 	uint32_t hwassist;
2446 
2447 	HN_LOCK(sc);
2448 	hwassist = sc->hn_ifp->if_hwassist;
2449 	HN_UNLOCK(sc);
2450 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2451 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2452 }
2453 
2454 static int
2455 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2456 {
2457 	struct hn_softc *sc = arg1;
2458 	char filter_str[128];
2459 	uint32_t filter;
2460 
2461 	HN_LOCK(sc);
2462 	filter = sc->hn_rx_filter;
2463 	HN_UNLOCK(sc);
2464 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2465 	    NDIS_PACKET_TYPES);
2466 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2467 }
2468 
2469 static int
2470 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2471 {
2472 	struct hn_softc *sc = arg1;
2473 	int error;
2474 
2475 	HN_LOCK(sc);
2476 
2477 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2478 	if (error || req->newptr == NULL)
2479 		goto back;
2480 
2481 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2482 	if (error)
2483 		goto back;
2484 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2485 
2486 	if (sc->hn_rx_ring_inuse > 1) {
2487 		error = hn_rss_reconfig(sc);
2488 	} else {
2489 		/* Not RSS capable, at least for now; just save the RSS key. */
2490 		error = 0;
2491 	}
2492 back:
2493 	HN_UNLOCK(sc);
2494 	return (error);
2495 }
2496 
2497 static int
2498 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2499 {
2500 	struct hn_softc *sc = arg1;
2501 	int error;
2502 
2503 	HN_LOCK(sc);
2504 
2505 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2506 	if (error || req->newptr == NULL)
2507 		goto back;
2508 
2509 	/*
2510 	 * Don't allow RSS indirect table changes if this interface is
2511 	 * not currently RSS capable.
2512 	 */
2513 	if (sc->hn_rx_ring_inuse == 1) {
2514 		error = EOPNOTSUPP;
2515 		goto back;
2516 	}
2517 
2518 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2519 	if (error)
2520 		goto back;
2521 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2522 
2523 	hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2524 	error = hn_rss_reconfig(sc);
2525 back:
2526 	HN_UNLOCK(sc);
2527 	return (error);
2528 }
2529 
2530 static int
2531 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2532 {
2533 	struct hn_softc *sc = arg1;
2534 	char hash_str[128];
2535 	uint32_t hash;
2536 
2537 	HN_LOCK(sc);
2538 	hash = sc->hn_rss_hash;
2539 	HN_UNLOCK(sc);
2540 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2541 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2542 }
2543 
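/*
 * Sanity check an IPv4 packet starting at Ethernet header offset 'hoff'.
 * Return the IP protocol number if the IP and TCP/UDP headers look
 * complete and sane, or IPPROTO_DONE otherwise.
 */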
2544 static int
2545 hn_check_iplen(const struct mbuf *m, int hoff)
2546 {
2547 	const struct ip *ip;
2548 	int len, iphlen, iplen;
2549 	const struct tcphdr *th;
2550 	int thoff;				/* TCP data offset */
2551 
2552 	len = hoff + sizeof(struct ip);
2553 
2554 	/* The packet must be at least the size of an IP header. */
2555 	if (m->m_pkthdr.len < len)
2556 		return IPPROTO_DONE;
2557 
2558 	/* The fixed IP header must reside completely in the first mbuf. */
2559 	if (m->m_len < len)
2560 		return IPPROTO_DONE;
2561 
2562 	ip = mtodo(m, hoff);
2563 
2564 	/* Bound check the packet's stated IP header length. */
2565 	iphlen = ip->ip_hl << 2;
2566 	if (iphlen < sizeof(struct ip))		/* minimum header length */
2567 		return IPPROTO_DONE;
2568 
2569 	/* The full IP header must reside completely in the one mbuf. */
2570 	if (m->m_len < hoff + iphlen)
2571 		return IPPROTO_DONE;
2572 
2573 	iplen = ntohs(ip->ip_len);
2574 
2575 	/*
2576 	 * Check that the amount of data in the buffers is at
2577 	 * least as much as the IP header would have us expect.
2578 	 */
2579 	if (m->m_pkthdr.len < hoff + iplen)
2580 		return IPPROTO_DONE;
2581 
2582 	/*
2583 	 * Ignore IP fragments.
2584 	 */
2585 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2586 		return IPPROTO_DONE;
2587 
2588 	/*
2589 	 * The TCP/IP or UDP/IP header must be entirely contained within
2590 	 * the first fragment of a packet.
2591 	 */
2592 	switch (ip->ip_p) {
2593 	case IPPROTO_TCP:
2594 		if (iplen < iphlen + sizeof(struct tcphdr))
2595 			return IPPROTO_DONE;
2596 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
2597 			return IPPROTO_DONE;
2598 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
2599 		thoff = th->th_off << 2;
2600 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
2601 			return IPPROTO_DONE;
2602 		if (m->m_len < hoff + iphlen + thoff)
2603 			return IPPROTO_DONE;
2604 		break;
2605 	case IPPROTO_UDP:
2606 		if (iplen < iphlen + sizeof(struct udphdr))
2607 			return IPPROTO_DONE;
2608 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
2609 			return IPPROTO_DONE;
2610 		break;
2611 	default:
2612 		if (iplen < iphlen)
2613 			return IPPROTO_DONE;
2614 		break;
2615 	}
2616 	return ip->ip_p;
2617 }
2618 
2619 static int
2620 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
2621 {
2622 	struct sysctl_oid_list *child;
2623 	struct sysctl_ctx_list *ctx;
2624 	device_t dev = sc->hn_dev;
2625 #if defined(INET) || defined(INET6)
2626 #if __FreeBSD_version >= 1100095
2627 	int lroent_cnt;
2628 #endif
2629 #endif
2630 	int i;
2631 
2632 	/*
2633 	 * Create RXBUF for reception.
2634 	 *
2635 	 * NOTE:
2636 	 * - It is shared by all channels.
2637 	 * - A large enough buffer is allocated; certain versions of the NVS
2638 	 *   may further limit the usable space.
2639 	 */
2640 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2641 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
2642 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
2643 	if (sc->hn_rxbuf == NULL) {
2644 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
2645 		return (ENOMEM);
2646 	}
2647 
2648 	sc->hn_rx_ring_cnt = ring_cnt;
2649 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
2650 
2651 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
2652 	    M_DEVBUF, M_WAITOK | M_ZERO);
2653 
2654 #if defined(INET) || defined(INET6)
2655 #if __FreeBSD_version >= 1100095
2656 	lroent_cnt = hn_lro_entry_count;
2657 	if (lroent_cnt < TCP_LRO_ENTRIES)
2658 		lroent_cnt = TCP_LRO_ENTRIES;
2659 	if (bootverbose)
2660 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
2661 #endif
2662 #endif	/* INET || INET6 */
2663 
2664 	ctx = device_get_sysctl_ctx(dev);
2665 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2666 
2667 	/* Create dev.hn.UNIT.rx sysctl tree */
2668 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
2669 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2670 
2671 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2672 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2673 
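		/*
		 * Allocate the VMBus bufring backing store for this ring:
		 * one DMA-able block holding the TX bufring (HN_TXBR_SIZE)
		 * followed by the RX bufring (HN_RXBR_SIZE); it is split
		 * accordingly when the channel is opened.
		 */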
2674 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
2675 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
2676 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
2677 		if (rxr->hn_br == NULL) {
2678 			device_printf(dev, "allocate bufring failed\n");
2679 			return (ENOMEM);
2680 		}
2681 
2682 		if (hn_trust_hosttcp)
2683 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
2684 		if (hn_trust_hostudp)
2685 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
2686 		if (hn_trust_hostip)
2687 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
2688 		rxr->hn_ifp = sc->hn_ifp;
2689 		if (i < sc->hn_tx_ring_cnt)
2690 			rxr->hn_txr = &sc->hn_tx_ring[i];
2691 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
2692 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
2693 		rxr->hn_rx_idx = i;
2694 		rxr->hn_rxbuf = sc->hn_rxbuf;
2695 
2696 		/*
2697 		 * Initialize LRO.
2698 		 */
2699 #if defined(INET) || defined(INET6)
2700 #if __FreeBSD_version >= 1100095
2701 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
2702 		    hn_lro_mbufq_depth);
2703 #else
2704 		tcp_lro_init(&rxr->hn_lro);
2705 		rxr->hn_lro.ifp = sc->hn_ifp;
2706 #endif
2707 #if __FreeBSD_version >= 1100099
2708 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
2709 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
2710 #endif
2711 #endif	/* INET || INET6 */
2712 
2713 		if (sc->hn_rx_sysctl_tree != NULL) {
2714 			char name[16];
2715 
2716 			/*
2717 			 * Create per RX ring sysctl tree:
2718 			 * dev.hn.UNIT.rx.RINGID
2719 			 */
2720 			snprintf(name, sizeof(name), "%d", i);
2721 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
2722 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
2723 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2724 
2725 			if (rxr->hn_rx_sysctl_tree != NULL) {
2726 				SYSCTL_ADD_ULONG(ctx,
2727 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2728 				    OID_AUTO, "packets", CTLFLAG_RW,
2729 				    &rxr->hn_pkts, "# of packets received");
2730 				SYSCTL_ADD_ULONG(ctx,
2731 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2732 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
2733 				    &rxr->hn_rss_pkts,
2734 				    "# of packets w/ RSS info received");
2735 				SYSCTL_ADD_INT(ctx,
2736 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
2737 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
2738 				    &rxr->hn_pktbuf_len, 0,
2739 				    "Temporary channel packet buffer length");
2740 			}
2741 		}
2742 	}
2743 
2744 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
2745 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2746 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
2747 #if __FreeBSD_version < 1100095
2748 	    hn_rx_stat_int_sysctl,
2749 #else
2750 	    hn_rx_stat_u64_sysctl,
2751 #endif
2752 	    "LU", "LRO queued");
2753 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
2754 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2755 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
2756 #if __FreeBSD_version < 1100095
2757 	    hn_rx_stat_int_sysctl,
2758 #else
2759 	    hn_rx_stat_u64_sysctl,
2760 #endif
2761 	    "LU", "LRO flushed");
2762 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
2763 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2764 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
2765 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
2766 #if __FreeBSD_version >= 1100099
2767 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
2768 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2769 	    hn_lro_lenlim_sysctl, "IU",
2770 	    "Max # of data bytes to be aggregated by LRO");
2771 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
2772 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2773 	    hn_lro_ackcnt_sysctl, "I",
2774 	    "Max # of ACKs to be aggregated by LRO");
2775 #endif
2776 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
2777 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
2778 	    hn_trust_hcsum_sysctl, "I",
2779 	    "Trust TCP segment verification on host side, "
2780 	    "when csum info is missing");
2781 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
2782 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
2783 	    hn_trust_hcsum_sysctl, "I",
2784 	    "Trust UDP datagram verification on host side, "
2785 	    "when csum info is missing");
2786 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
2787 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
2788 	    hn_trust_hcsum_sysctl, "I",
2789 	    "Trust IP packet verification on host side, "
2790 	    "when csum info is missing");
2791 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
2792 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2793 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
2794 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
2795 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
2796 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2797 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
2798 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
2799 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
2800 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2801 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
2802 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
2803 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
2804 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2805 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
2806 	    hn_rx_stat_ulong_sysctl, "LU",
2807 	    "# of packets that we trust host's csum verification");
2808 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
2809 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2810 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
2811 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
2812 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
2813 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2814 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
2815 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
2816 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
2817 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
2818 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
2819 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
2820 
2821 	return (0);
2822 }
2823 
2824 static void
2825 hn_destroy_rx_data(struct hn_softc *sc)
2826 {
2827 	int i;
2828 
2829 	if (sc->hn_rxbuf != NULL) {
2830 		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
2831 		sc->hn_rxbuf = NULL;
2832 	}
2833 
2834 	if (sc->hn_rx_ring_cnt == 0)
2835 		return;
2836 
2837 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2838 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2839 
2840 		if (rxr->hn_br == NULL)
2841 			continue;
2842 		hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
2843 		rxr->hn_br = NULL;
2844 
2845 #if defined(INET) || defined(INET6)
2846 		tcp_lro_free(&rxr->hn_lro);
2847 #endif
2848 		free(rxr->hn_pktbuf, M_DEVBUF);
2849 	}
2850 	free(sc->hn_rx_ring, M_DEVBUF);
2851 	sc->hn_rx_ring = NULL;
2852 
2853 	sc->hn_rx_ring_cnt = 0;
2854 	sc->hn_rx_ring_inuse = 0;
2855 }
2856 
2857 static int
2858 hn_tx_ring_create(struct hn_softc *sc, int id)
2859 {
2860 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2861 	device_t dev = sc->hn_dev;
2862 	bus_dma_tag_t parent_dtag;
2863 	int error, i;
2864 
2865 	txr->hn_sc = sc;
2866 	txr->hn_tx_idx = id;
2867 
2868 #ifndef HN_USE_TXDESC_BUFRING
2869 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2870 #endif
2871 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2872 
2873 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2874 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2875 	    M_DEVBUF, M_WAITOK | M_ZERO);
2876 #ifndef HN_USE_TXDESC_BUFRING
2877 	SLIST_INIT(&txr->hn_txlist);
2878 #else
2879 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
2880 	    M_WAITOK, &txr->hn_tx_lock);
2881 #endif
2882 
2883 	txr->hn_tx_taskq = sc->hn_tx_taskq;
2884 
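	/*
	 * Select the TX dispatch and completion handlers.  The legacy
	 * if_start path (if enabled) uses the shared if_snd queue, while
	 * the default if_transmit path gives each ring its own mbuf
	 * buf_ring.
	 */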
2885 #ifdef HN_IFSTART_SUPPORT
2886 	if (hn_use_if_start) {
2887 		txr->hn_txeof = hn_start_txeof;
2888 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2889 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2890 	} else
2891 #endif
2892 	{
2893 		int br_depth;
2894 
2895 		txr->hn_txeof = hn_xmit_txeof;
2896 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2897 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2898 
2899 		br_depth = hn_get_txswq_depth(txr);
2900 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
2901 		    M_WAITOK, &txr->hn_tx_lock);
2902 	}
2903 
2904 	txr->hn_direct_tx_size = hn_direct_tx_size;
2905 
2906 	/*
2907 	 * Always schedule transmission instead of trying to do direct
2908 	 * transmission.  This one gives the best performance so far.
2909 	 */
2910 	txr->hn_sched_tx = 1;
2911 
2912 	parent_dtag = bus_get_dma_tag(dev);
2913 
2914 	/* DMA tag for RNDIS packet messages. */
2915 	error = bus_dma_tag_create(parent_dtag, /* parent */
2916 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
2917 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
2918 	    BUS_SPACE_MAXADDR,		/* lowaddr */
2919 	    BUS_SPACE_MAXADDR,		/* highaddr */
2920 	    NULL, NULL,			/* filter, filterarg */
2921 	    HN_RNDIS_PKT_LEN,		/* maxsize */
2922 	    1,				/* nsegments */
2923 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
2924 	    0,				/* flags */
2925 	    NULL,			/* lockfunc */
2926 	    NULL,			/* lockfuncarg */
2927 	    &txr->hn_tx_rndis_dtag);
2928 	if (error) {
2929 		device_printf(dev, "failed to create rndis dmatag\n");
2930 		return error;
2931 	}
2932 
2933 	/* DMA tag for data. */
2934 	error = bus_dma_tag_create(parent_dtag, /* parent */
2935 	    1,				/* alignment */
2936 	    HN_TX_DATA_BOUNDARY,	/* boundary */
2937 	    BUS_SPACE_MAXADDR,		/* lowaddr */
2938 	    BUS_SPACE_MAXADDR,		/* highaddr */
2939 	    NULL, NULL,			/* filter, filterarg */
2940 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
2941 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
2942 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
2943 	    0,				/* flags */
2944 	    NULL,			/* lockfunc */
2945 	    NULL,			/* lockfuncarg */
2946 	    &txr->hn_tx_data_dtag);
2947 	if (error) {
2948 		device_printf(dev, "failed to create data dmatag\n");
2949 		return error;
2950 	}
2951 
2952 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
2953 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
2954 
2955 		txd->txr = txr;
2956 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2957 
2958 		/*
2959 		 * Allocate and load RNDIS packet message.
2960 		 */
2961 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
2962 		    (void **)&txd->rndis_pkt,
2963 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
2964 		    &txd->rndis_pkt_dmap);
2965 		if (error) {
2966 			device_printf(dev,
2967 			    "failed to allocate rndis_packet_msg, %d\n", i);
2968 			return error;
2969 		}
2970 
2971 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
2972 		    txd->rndis_pkt_dmap,
2973 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
2974 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
2975 		    BUS_DMA_NOWAIT);
2976 		if (error) {
2977 			device_printf(dev,
2978 			    "failed to load rndis_packet_msg, %d\n", i);
2979 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
2980 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
2981 			return error;
2982 		}
2983 
2984 		/* DMA map for TX data. */
2985 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
2986 		    &txd->data_dmap);
2987 		if (error) {
2988 			device_printf(dev,
2989 			    "failed to allocate tx data dmamap\n");
2990 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
2991 			    txd->rndis_pkt_dmap);
2992 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
2993 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
2994 			return error;
2995 		}
2996 
2997 		/* All set, put it to list */
2998 		txd->flags |= HN_TXD_FLAG_ONLIST;
2999 #ifndef HN_USE_TXDESC_BUFRING
3000 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3001 #else
3002 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3003 #endif
3004 	}
3005 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3006 
3007 	if (sc->hn_tx_sysctl_tree != NULL) {
3008 		struct sysctl_oid_list *child;
3009 		struct sysctl_ctx_list *ctx;
3010 		char name[16];
3011 
3012 		/*
3013 		 * Create per TX ring sysctl tree:
3014 		 * dev.hn.UNIT.tx.RINGID
3015 		 */
3016 		ctx = device_get_sysctl_ctx(dev);
3017 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3018 
3019 		snprintf(name, sizeof(name), "%d", id);
3020 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3021 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3022 
3023 		if (txr->hn_tx_sysctl_tree != NULL) {
3024 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3025 
3026 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3027 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3028 			    "# of available TX descs");
3029 #ifdef HN_IFSTART_SUPPORT
3030 			if (!hn_use_if_start)
3031 #endif
3032 			{
3033 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3034 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3035 				    "over active");
3036 			}
3037 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3038 			    CTLFLAG_RW, &txr->hn_pkts,
3039 			    "# of packets transmitted");
3040 		}
3041 	}
3042 
3043 	return 0;
3044 }
3045 
3046 static void
3047 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3048 {
3049 	struct hn_tx_ring *txr = txd->txr;
3050 
3051 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3052 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3053 
3054 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3055 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3056 	    txd->rndis_pkt_dmap);
3057 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3058 }
3059 
3060 static void
3061 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3062 {
3063 	struct hn_txdesc *txd;
3064 
3065 	if (txr->hn_txdesc == NULL)
3066 		return;
3067 
3068 #ifndef HN_USE_TXDESC_BUFRING
3069 	while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3070 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3071 		hn_txdesc_dmamap_destroy(txd);
3072 	}
3073 #else
3074 	mtx_lock(&txr->hn_tx_lock);
3075 	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3076 		hn_txdesc_dmamap_destroy(txd);
3077 	mtx_unlock(&txr->hn_tx_lock);
3078 #endif
3079 
3080 	if (txr->hn_tx_data_dtag != NULL)
3081 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3082 	if (txr->hn_tx_rndis_dtag != NULL)
3083 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3084 
3085 #ifdef HN_USE_TXDESC_BUFRING
3086 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3087 #endif
3088 
3089 	free(txr->hn_txdesc, M_DEVBUF);
3090 	txr->hn_txdesc = NULL;
3091 
3092 	if (txr->hn_mbuf_br != NULL)
3093 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3094 
3095 #ifndef HN_USE_TXDESC_BUFRING
3096 	mtx_destroy(&txr->hn_txlist_spin);
3097 #endif
3098 	mtx_destroy(&txr->hn_tx_lock);
3099 }
3100 
3101 static int
3102 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3103 {
3104 	struct sysctl_oid_list *child;
3105 	struct sysctl_ctx_list *ctx;
3106 	int i;
3107 
3108 	/*
3109 	 * Create TXBUF for chimney sending.
3110 	 *
3111 	 * NOTE: It is shared by all channels.
3112 	 */
3113 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3114 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3115 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3116 	if (sc->hn_chim == NULL) {
3117 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3118 		return (ENOMEM);
3119 	}
3120 
3121 	sc->hn_tx_ring_cnt = ring_cnt;
3122 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3123 
3124 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3125 	    M_DEVBUF, M_WAITOK | M_ZERO);
3126 
3127 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3128 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3129 
3130 	/* Create dev.hn.UNIT.tx sysctl tree */
3131 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3132 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3133 
3134 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3135 		int error;
3136 
3137 		error = hn_tx_ring_create(sc, i);
3138 		if (error)
3139 			return error;
3140 	}
3141 
3142 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3143 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3144 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3145 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3146 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3147 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3148 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3149 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3150 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3151 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3152 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3153 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3154 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3155 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3156 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3157 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
3158 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3159 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3160 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3161 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3162 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3163 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3164 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3165 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3166 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3167 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3168 	    "# of total TX descs");
3169 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3170 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3171 	    "Chimney send packet size upper boundary");
3172 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3173 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3174 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3175 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3176 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3177 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3178 	    hn_tx_conf_int_sysctl, "I",
3179 	    "Size of the packet for direct transmission");
3180 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3181 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3182 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3183 	    hn_tx_conf_int_sysctl, "I",
3184 	    "Always schedule transmission "
3185 	    "instead of doing direct transmission");
3186 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3187 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3188 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3189 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3190 
3191 	return 0;
3192 }
3193 
3194 static void
3195 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3196 {
3197 	int i;
3198 
3199 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3200 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
3201 }
3202 
3203 static void
3204 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3205 {
3206 	struct ifnet *ifp = sc->hn_ifp;
3207 	int tso_minlen;
3208 
3209 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3210 		return;
3211 
3212 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3213 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3214 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3215 
3216 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3217 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3218 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3219 
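	/*
	 * Clamp the requested TSO length into [tso_minlen, IP_MAXPACKET]
	 * and further cap it at the NDIS advertised maximum; the Ethernet
	 * and VLAN header lengths are then subtracted to leave room for
	 * the L2 header, following the usual driver convention.
	 */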
3220 	if (tso_maxlen < tso_minlen)
3221 		tso_maxlen = tso_minlen;
3222 	else if (tso_maxlen > IP_MAXPACKET)
3223 		tso_maxlen = IP_MAXPACKET;
3224 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3225 		tso_maxlen = sc->hn_ndis_tso_szmax;
3226 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3227 	if (bootverbose)
3228 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3229 }
3230 
3231 static void
3232 hn_fixup_tx_data(struct hn_softc *sc)
3233 {
3234 	uint64_t csum_assist;
3235 	int i;
3236 
3237 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3238 	if (hn_tx_chimney_size > 0 &&
3239 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3240 		hn_set_chim_size(sc, hn_tx_chimney_size);
3241 
3242 	csum_assist = 0;
3243 	if (sc->hn_caps & HN_CAP_IPCS)
3244 		csum_assist |= CSUM_IP;
3245 	if (sc->hn_caps & HN_CAP_TCP4CS)
3246 		csum_assist |= CSUM_IP_TCP;
3247 	if (sc->hn_caps & HN_CAP_UDP4CS)
3248 		csum_assist |= CSUM_IP_UDP;
3249 #ifdef notyet
3250 	if (sc->hn_caps & HN_CAP_TCP6CS)
3251 		csum_assist |= CSUM_IP6_TCP;
3252 	if (sc->hn_caps & HN_CAP_UDP6CS)
3253 		csum_assist |= CSUM_IP6_UDP;
3254 #endif
3255 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3256 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3257 
3258 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3259 		/*
3260 		 * Support HASHVAL pktinfo on TX path.
3261 		 */
3262 		if (bootverbose)
3263 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3264 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3265 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3266 	}
3267 }
3268 
3269 static void
3270 hn_destroy_tx_data(struct hn_softc *sc)
3271 {
3272 	int i;
3273 
3274 	if (sc->hn_chim != NULL) {
3275 		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3276 		sc->hn_chim = NULL;
3277 	}
3278 
3279 	if (sc->hn_tx_ring_cnt == 0)
3280 		return;
3281 
3282 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3283 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3284 
3285 	free(sc->hn_tx_ring, M_DEVBUF);
3286 	sc->hn_tx_ring = NULL;
3287 
3288 	sc->hn_tx_ring_cnt = 0;
3289 	sc->hn_tx_ring_inuse = 0;
3290 }
3291 
3292 #ifdef HN_IFSTART_SUPPORT
3293 
3294 static void
3295 hn_start_taskfunc(void *xtxr, int pending __unused)
3296 {
3297 	struct hn_tx_ring *txr = xtxr;
3298 
3299 	mtx_lock(&txr->hn_tx_lock);
3300 	hn_start_locked(txr, 0);
3301 	mtx_unlock(&txr->hn_tx_lock);
3302 }
3303 
3304 static int
3305 hn_start_locked(struct hn_tx_ring *txr, int len)
3306 {
3307 	struct hn_softc *sc = txr->hn_sc;
3308 	struct ifnet *ifp = sc->hn_ifp;
3309 
3310 	KASSERT(hn_use_if_start,
3311 	    ("hn_start_locked is called when if_start is disabled"));
3312 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3313 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3314 
3315 	if (__predict_false(txr->hn_suspended))
3316 		return 0;
3317 
3318 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3319 	    IFF_DRV_RUNNING)
3320 		return 0;
3321 
3322 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3323 		struct hn_txdesc *txd;
3324 		struct mbuf *m_head;
3325 		int error;
3326 
3327 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3328 		if (m_head == NULL)
3329 			break;
3330 
3331 		if (len > 0 && m_head->m_pkthdr.len > len) {
3332 			/*
3333 			 * This sending could be time consuming; let callers
3334 			 * dispatch this packet sending (and sending of any
3335 			 * following packets) to the tx taskqueue.
3336 			 */
3337 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3338 			return 1;
3339 		}
3340 
3341 #if defined(INET6) || defined(INET)
3342 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3343 			m_head = hn_tso_fixup(m_head);
3344 			if (__predict_false(m_head == NULL)) {
3345 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3346 				continue;
3347 			}
3348 		}
3349 #endif
3350 
3351 		txd = hn_txdesc_get(txr);
3352 		if (txd == NULL) {
3353 			txr->hn_no_txdescs++;
3354 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3355 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3356 			break;
3357 		}
3358 
3359 		error = hn_encap(txr, txd, &m_head);
3360 		if (error) {
3361 			/* Both txd and m_head are freed */
3362 			continue;
3363 		}
3364 
3365 		error = hn_txpkt(ifp, txr, txd);
3366 		if (__predict_false(error)) {
3367 			/* txd is freed, but m_head is not */
3368 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3369 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3370 			break;
3371 		}
3372 	}
3373 	return 0;
3374 }
3375 
3376 static void
3377 hn_start(struct ifnet *ifp)
3378 {
3379 	struct hn_softc *sc = ifp->if_softc;
3380 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3381 
3382 	if (txr->hn_sched_tx)
3383 		goto do_sched;
3384 
3385 	if (mtx_trylock(&txr->hn_tx_lock)) {
3386 		int sched;
3387 
3388 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3389 		mtx_unlock(&txr->hn_tx_lock);
3390 		if (!sched)
3391 			return;
3392 	}
3393 do_sched:
3394 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3395 }
3396 
3397 static void
3398 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3399 {
3400 	struct hn_tx_ring *txr = xtxr;
3401 
3402 	mtx_lock(&txr->hn_tx_lock);
3403 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3404 	hn_start_locked(txr, 0);
3405 	mtx_unlock(&txr->hn_tx_lock);
3406 }
3407 
3408 static void
3409 hn_start_txeof(struct hn_tx_ring *txr)
3410 {
3411 	struct hn_softc *sc = txr->hn_sc;
3412 	struct ifnet *ifp = sc->hn_ifp;
3413 
3414 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3415 
3416 	if (txr->hn_sched_tx)
3417 		goto do_sched;
3418 
3419 	if (mtx_trylock(&txr->hn_tx_lock)) {
3420 		int sched;
3421 
3422 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3423 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3424 		mtx_unlock(&txr->hn_tx_lock);
3425 		if (sched) {
3426 			taskqueue_enqueue(txr->hn_tx_taskq,
3427 			    &txr->hn_tx_task);
3428 		}
3429 	} else {
3430 do_sched:
3431 		/*
3432 		 * Release the OACTIVE bit earlier, in the hope that
3433 		 * others could catch up.  The task will clear the
3434 		 * flag again with the hn_tx_lock to avoid possible
3435 		 * races.
3436 		 */
3437 		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3438 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3439 	}
3440 }
3441 
3442 #endif	/* HN_IFSTART_SUPPORT */
3443 
3444 static int
3445 hn_xmit(struct hn_tx_ring *txr, int len)
3446 {
3447 	struct hn_softc *sc = txr->hn_sc;
3448 	struct ifnet *ifp = sc->hn_ifp;
3449 	struct mbuf *m_head;
3450 
3451 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3452 #ifdef HN_IFSTART_SUPPORT
3453 	KASSERT(hn_use_if_start == 0,
3454 	    ("hn_xmit is called when if_start is enabled"));
3455 #endif
3456 
3457 	if (__predict_false(txr->hn_suspended))
3458 		return 0;
3459 
3460 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3461 		return 0;
3462 
3463 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3464 		struct hn_txdesc *txd;
3465 		int error;
3466 
3467 		if (len > 0 && m_head->m_pkthdr.len > len) {
3468 			/*
3469 			 * This sending could be time consuming; let callers
3470 			 * dispatch this packet sending (and sending of any
3471 			 * following packets) to the tx taskqueue.
3472 			 */
3473 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3474 			return 1;
3475 		}
3476 
3477 		txd = hn_txdesc_get(txr);
3478 		if (txd == NULL) {
3479 			txr->hn_no_txdescs++;
3480 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3481 			txr->hn_oactive = 1;
3482 			break;
3483 		}
3484 
3485 		error = hn_encap(txr, txd, &m_head);
3486 		if (error) {
3487 			/* Both txd and m_head are freed; discard */
3488 			drbr_advance(ifp, txr->hn_mbuf_br);
3489 			continue;
3490 		}
3491 
3492 		error = hn_txpkt(ifp, txr, txd);
3493 		if (__predict_false(error)) {
3494 			/* txd is freed, but m_head is not */
3495 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3496 			txr->hn_oactive = 1;
3497 			break;
3498 		}
3499 
3500 		/* Sent */
3501 		drbr_advance(ifp, txr->hn_mbuf_br);
3502 	}
3503 	return 0;
3504 }
3505 
3506 static int
3507 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3508 {
3509 	struct hn_softc *sc = ifp->if_softc;
3510 	struct hn_tx_ring *txr;
3511 	int error, idx = 0;
3512 
3513 #if defined(INET6) || defined(INET)
3514 	/*
3515 	 * Perform TSO packet header fixup now, since the TSO
3516 	 * packet header should be cache-hot.
3517 	 */
3518 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3519 		m = hn_tso_fixup(m);
3520 		if (__predict_false(m == NULL)) {
3521 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3522 			return EIO;
3523 		}
3524 	}
3525 #endif
3526 
3527 	/*
3528 	 * Select the TX ring based on the flowid; unhashed packets use ring 0.
3529 	 */
3530 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
3531 		idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
3532 	txr = &sc->hn_tx_ring[idx];
3533 
3534 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
3535 	if (error) {
3536 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
3537 		return error;
3538 	}
3539 
3540 	if (txr->hn_oactive)
3541 		return 0;
3542 
3543 	if (txr->hn_sched_tx)
3544 		goto do_sched;
3545 
3546 	if (mtx_trylock(&txr->hn_tx_lock)) {
3547 		int sched;
3548 
3549 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
3550 		mtx_unlock(&txr->hn_tx_lock);
3551 		if (!sched)
3552 			return 0;
3553 	}
3554 do_sched:
3555 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3556 	return 0;
3557 }
3558 
3559 static void
3560 hn_tx_ring_qflush(struct hn_tx_ring *txr)
3561 {
3562 	struct mbuf *m;
3563 
3564 	mtx_lock(&txr->hn_tx_lock);
3565 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
3566 		m_freem(m);
3567 	mtx_unlock(&txr->hn_tx_lock);
3568 }
3569 
3570 static void
3571 hn_xmit_qflush(struct ifnet *ifp)
3572 {
3573 	struct hn_softc *sc = ifp->if_softc;
3574 	int i;
3575 
3576 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3577 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
3578 	if_qflush(ifp);
3579 }
3580 
3581 static void
3582 hn_xmit_txeof(struct hn_tx_ring *txr)
3583 {
3584 
3585 	if (txr->hn_sched_tx)
3586 		goto do_sched;
3587 
3588 	if (mtx_trylock(&txr->hn_tx_lock)) {
3589 		int sched;
3590 
3591 		txr->hn_oactive = 0;
3592 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
3593 		mtx_unlock(&txr->hn_tx_lock);
3594 		if (sched) {
3595 			taskqueue_enqueue(txr->hn_tx_taskq,
3596 			    &txr->hn_tx_task);
3597 		}
3598 	} else {
3599 do_sched:
3600 		/*
3601 		 * Release the oactive flag earlier, in the hope that
3602 		 * others could catch up.  The task will clear the
3603 		 * oactive again with the hn_tx_lock to avoid possible
3604 		 * races.
3605 		 */
3606 		txr->hn_oactive = 0;
3607 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3608 	}
3609 }
3610 
3611 static void
3612 hn_xmit_taskfunc(void *xtxr, int pending __unused)
3613 {
3614 	struct hn_tx_ring *txr = xtxr;
3615 
3616 	mtx_lock(&txr->hn_tx_lock);
3617 	hn_xmit(txr, 0);
3618 	mtx_unlock(&txr->hn_tx_lock);
3619 }
3620 
3621 static void
3622 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
3623 {
3624 	struct hn_tx_ring *txr = xtxr;
3625 
3626 	mtx_lock(&txr->hn_tx_lock);
3627 	txr->hn_oactive = 0;
3628 	hn_xmit(txr, 0);
3629 	mtx_unlock(&txr->hn_tx_lock);
3630 }
3631 
3632 static int
3633 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
3634 {
3635 	struct vmbus_chan_br cbr;
3636 	struct hn_rx_ring *rxr;
3637 	struct hn_tx_ring *txr = NULL;
3638 	int idx, error;
3639 
3640 	idx = vmbus_chan_subidx(chan);
3641 
3642 	/*
3643 	 * Link this channel to RX/TX ring.
3644 	 */
3645 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3646 	    ("invalid channel index %d, should be >= 0 && < %d",
3647 	     idx, sc->hn_rx_ring_inuse));
3648 	rxr = &sc->hn_rx_ring[idx];
3649 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
3650 	    ("RX ring %d already attached", idx));
3651 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
3652 
3653 	if (bootverbose) {
3654 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
3655 		    idx, vmbus_chan_id(chan));
3656 	}
3657 
3658 	if (idx < sc->hn_tx_ring_inuse) {
3659 		txr = &sc->hn_tx_ring[idx];
3660 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
3661 		    ("TX ring %d already attached", idx));
3662 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
3663 
3664 		txr->hn_chan = chan;
3665 		if (bootverbose) {
3666 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
3667 			    idx, vmbus_chan_id(chan));
3668 		}
3669 	}
3670 
3671 	/* Bind this channel to a proper CPU. */
3672 	vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
3673 
3674 	/*
3675 	 * Open this channel
3676 	 */
3677 	cbr.cbr = rxr->hn_br;
3678 	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
3679 	cbr.cbr_txsz = HN_TXBR_SIZE;
3680 	cbr.cbr_rxsz = HN_RXBR_SIZE;
3681 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
3682 	if (error) {
3683 		if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
3684 		    vmbus_chan_id(chan), error);
3685 		rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3686 		if (txr != NULL)
3687 			txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3688 	}
3689 	return (error);
3690 }
3691 
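/*
 * Unlink a VMBus channel from its RX/TX ring and close the channel;
 * closing does not destroy the channel itself.
 */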
3692 static void
3693 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
3694 {
3695 	struct hn_rx_ring *rxr;
3696 	int idx;
3697 
3698 	idx = vmbus_chan_subidx(chan);
3699 
3700 	/*
3701 	 * Link this channel to RX/TX ring.
3702 	 */
3703 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
3704 	    ("invalid channel index %d, should be >= 0 and < %d",
3705 	     idx, sc->hn_rx_ring_inuse));
3706 	rxr = &sc->hn_rx_ring[idx];
3707 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
3708 	    ("RX ring %d is not attached", idx));
3709 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
3710 
3711 	if (idx < sc->hn_tx_ring_inuse) {
3712 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
3713 
3714 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
3715 		    ("TX ring %d is not attached", idx));
3716 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
3717 	}
3718 
3719 	/*
3720 	 * Close this channel.
3721 	 *
3722 	 * NOTE:
3723 	 * Channel closing does _not_ destroy the target channel.
3724 	 */
3725 	vmbus_chan_close(chan);
3726 }
3727 
3728 static int
3729 hn_attach_subchans(struct hn_softc *sc)
3730 {
3731 	struct vmbus_channel **subchans;
3732 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3733 	int i, error = 0;
3734 
3735 	if (subchan_cnt == 0)
3736 		return (0);
3737 
3738 	/* Attach the sub-channels. */
3739 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3740 	for (i = 0; i < subchan_cnt; ++i) {
3741 		error = hn_chan_attach(sc, subchans[i]);
3742 		if (error)
3743 			break;
3744 	}
3745 	vmbus_subchan_rel(subchans, subchan_cnt);
3746 
3747 	if (error) {
3748 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3749 	} else {
3750 		if (bootverbose) {
3751 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3752 			    subchan_cnt);
3753 		}
3754 	}
3755 	return (error);
3756 }
3757 
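/*
 * Detach all sub-channels, then the primary channel, and wait until
 * the sub-channels are destroyed.
 */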
3758 static void
3759 hn_detach_allchans(struct hn_softc *sc)
3760 {
3761 	struct vmbus_channel **subchans;
3762 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3763 	int i;
3764 
3765 	if (subchan_cnt == 0)
3766 		goto back;
3767 
3768 	/* Detach the sub-channels. */
3769 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3770 	for (i = 0; i < subchan_cnt; ++i)
3771 		hn_chan_detach(sc, subchans[i]);
3772 	vmbus_subchan_rel(subchans, subchan_cnt);
3773 
3774 back:
3775 	/*
3776 	 * Detach the primary channel, _after_ all sub-channels
3777 	 * are detached.
3778 	 */
3779 	hn_chan_detach(sc, sc->hn_prichan);
3780 
3781 	/* Wait for sub-channels to be destroyed, if any. */
3782 	vmbus_subchan_drain(sc->hn_prichan);
3783 
3784 #ifdef INVARIANTS
3785 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3786 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3787 		    HN_RX_FLAG_ATTACHED) == 0,
3788 		    ("%dth RX ring is still attached", i));
3789 	}
3790 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3791 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3792 		    HN_TX_FLAG_ATTACHED) == 0,
3793 		    ("%dth TX ring is still attached", i));
3794 	}
3795 #endif
3796 }
3797 
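/*
 * Allocate sub-channels from NVS for multiple RX/TX rings.  *nsubch
 * holds the requested # of sub-channels on entry and the # actually
 * allocated on return; it is set to 0 whenever vRSS cannot be used.
 * This function itself always returns 0.
 */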
3798 static int
3799 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3800 {
3801 	struct vmbus_channel **subchans;
3802 	int nchan, rxr_cnt, error;
3803 
3804 	nchan = *nsubch + 1;
3805 	if (nchan == 1) {
3806 		/*
3807 		 * Multiple RX/TX rings are not requested.
3808 		 */
3809 		*nsubch = 0;
3810 		return (0);
3811 	}
3812 
3813 	/*
3814 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
3815 	 * table entries.
3816 	 */
3817 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
3818 	if (error) {
3819 		/* No RSS; this is benign. */
3820 		*nsubch = 0;
3821 		return (0);
3822 	}
3823 	if (bootverbose) {
3824 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3825 		    rxr_cnt, nchan);
3826 	}
3827 
3828 	if (nchan > rxr_cnt)
3829 		nchan = rxr_cnt;
3830 	if (nchan == 1) {
3831 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3832 		*nsubch = 0;
3833 		return (0);
3834 	}
3835 
3836 	/*
3837 	 * Allocate sub-channels from NVS.
3838 	 */
3839 	*nsubch = nchan - 1;
3840 	error = hn_nvs_alloc_subchans(sc, nsubch);
3841 	if (error || *nsubch == 0) {
3842 		/* Failed to allocate sub-channels. */
3843 		*nsubch = 0;
3844 		return (0);
3845 	}
3846 
3847 	/*
3848 	 * Wait for all sub-channels to become ready before moving on.
3849 	 */
3850 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3851 	vmbus_subchan_rel(subchans, *nsubch);
3852 	return (0);
3853 }
3854 
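/*
 * Attach the synthetic parts, i.e. the primary channel, NVS and
 * RNDIS; then allocate and attach the sub-channels and configure
 * RSS, if more than one channel can be used.
 */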
3855 static int
3856 hn_synth_attach(struct hn_softc *sc, int mtu)
3857 {
3858 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3859 	int error, nsubch, nchan, i;
3860 	uint32_t old_caps;
3861 
3862 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
3863 	    ("synthetic parts were attached"));
3864 
3865 	/* Save capabilities for later verification. */
3866 	old_caps = sc->hn_caps;
3867 	sc->hn_caps = 0;
3868 
3869 	/* Clear RSS stuffs. */
3870 	sc->hn_rss_ind_size = 0;
3871 	sc->hn_rss_hash = 0;
3872 
3873 	/*
3874 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
3875 	 */
3876 	error = hn_chan_attach(sc, sc->hn_prichan);
3877 	if (error)
3878 		return (error);
3879 
3880 	/*
3881 	 * Attach NVS.
3882 	 */
3883 	error = hn_nvs_attach(sc, mtu);
3884 	if (error)
3885 		return (error);
3886 
3887 	/*
3888 	 * Attach RNDIS _after_ NVS is attached.
3889 	 */
3890 	error = hn_rndis_attach(sc, mtu);
3891 	if (error)
3892 		return (error);
3893 
3894 	/*
3895 	 * Make sure capabilities are not changed.
3896 	 */
3897 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
3898 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
3899 		    old_caps, sc->hn_caps);
3900 		/* Restore old capabilities and abort. */
3901 		sc->hn_caps = old_caps;
3902 		return ENXIO;
3903 	}
3904 
3905 	/*
3906 	 * Allocate sub-channels for multi-TX/RX rings.
3907 	 *
3908 	 * NOTE:
3909 	 * The # of RX rings that can be used is equal to the # of
3910 	 * channels to be requested.
3911 	 */
3912 	nsubch = sc->hn_rx_ring_cnt - 1;
3913 	error = hn_synth_alloc_subchans(sc, &nsubch);
3914 	if (error)
3915 		return (error);
3916 
3917 	nchan = nsubch + 1;
3918 	if (nchan == 1) {
3919 		/* Only the primary channel can be used; done */
3920 		goto back;
3921 	}
3922 
3923 	/*
3924 	 * Configure RSS key and indirect table _after_ all sub-channels
3925 	 * are allocated.
3926 	 */
3927 
3928 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
3929 		/*
3930 		 * RSS key is not set yet; set it to the default RSS key.
3931 		 */
3932 		if (bootverbose)
3933 			if_printf(sc->hn_ifp, "setup default RSS key\n");
3934 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3935 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3936 	}
3937 
3938 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
3939 		/*
3940 		 * RSS indirect table is not set yet; set it up in round-
3941 		 * robin fashion.
3942 		 */
3943 		if (bootverbose) {
3944 			if_printf(sc->hn_ifp, "setup default RSS indirect "
3945 			    "table\n");
3946 		}
3947 		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3948 			rss->rss_ind[i] = i % nchan;
3949 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3950 	} else {
3951 		/*
3952 		 * The # of usable channels may have changed, so we have
3953 		 * to make sure that all entries in the RSS indirect
3954 		 * table are valid.
3955 		 */
3956 		hn_rss_ind_fixup(sc, nchan);
3957 	}
3958 
3959 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
3960 	if (error) {
3961 		/*
3962 		 * Failed to configure RSS key or indirect table; only
3963 		 * the primary channel can be used.
3964 		 */
3965 		nchan = 1;
3966 	}
3967 back:
3968 	/*
3969 	 * Set the # of TX/RX rings that could be used according to
3970 	 * the # of channels that NVS offered.
3971 	 */
3972 	hn_set_ring_inuse(sc, nchan);
3973 
3974 	/*
3975 	 * Attach the sub-channels, if any.
3976 	 */
3977 	error = hn_attach_subchans(sc);
3978 	if (error)
3979 		return (error);
3980 
3981 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
3982 	return (0);
3983 }
3984 
3985 /*
3986  * NOTE:
3987  * The interface must have been suspended through hn_suspend(), before
3988  * this function gets called.
3989  */
3990 static void
3991 hn_synth_detach(struct hn_softc *sc)
3992 {
3993 	HN_LOCK_ASSERT(sc);
3994 
3995 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3996 	    ("synthetic parts were not attached"));
3997 
3998 	/* Detach the RNDIS first. */
3999 	hn_rndis_detach(sc);
4000 
4001 	/* Detach NVS. */
4002 	hn_nvs_detach(sc);
4003 
4004 	/* Detach all of the channels. */
4005 	hn_detach_allchans(sc);
4006 
4007 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4008 }
4009 
4010 static void
4011 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4012 {
4013 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4014 	    ("invalid ring count %d", ring_cnt));
4015 
4016 	if (sc->hn_tx_ring_cnt > ring_cnt)
4017 		sc->hn_tx_ring_inuse = ring_cnt;
4018 	else
4019 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4020 	sc->hn_rx_ring_inuse = ring_cnt;
4021 
4022 	if (bootverbose) {
4023 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4024 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4025 	}
4026 }
4027 
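/*
 * Wait until the channel's RX and TX bufrings are empty, then drain
 * the channel's pending interrupts.
 */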
4028 static void
4029 hn_chan_drain(struct vmbus_channel *chan)
4030 {
4031 
4032 	while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
4033 		pause("waitch", 1);
4034 	vmbus_chan_intr_drain(chan);
4035 }
4036 
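/*
 * Suspend the data path: stop TX, wait for pending sends to complete,
 * clear the RX filter, and drain the bufrings and interrupts of all
 * channels.
 */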
4037 static void
4038 hn_suspend_data(struct hn_softc *sc)
4039 {
4040 	struct vmbus_channel **subch = NULL;
4041 	int i, nsubch;
4042 
4043 	HN_LOCK_ASSERT(sc);
4044 
4045 	/*
4046 	 * Suspend TX.
4047 	 */
4048 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4049 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4050 
4051 		mtx_lock(&txr->hn_tx_lock);
4052 		txr->hn_suspended = 1;
4053 		mtx_unlock(&txr->hn_tx_lock);
4054 		/* No one is able to send more packets now. */
4055 
4056 		/* Wait for all pending sends to finish. */
4057 		while (hn_tx_ring_pending(txr))
4058 			pause("hnwtx", 1 /* 1 tick */);
4059 
4060 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4061 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4062 	}
4063 
4064 	/*
4065 	 * Disable RX by clearing RX filter.
4066 	 */
4067 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4068 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4069 
4070 	/*
4071 	 * Give RNDIS enough time to flush all pending data packets.
4072 	 */
4073 	pause("waitrx", (200 * hz) / 1000);
4074 
4075 	/*
4076 	 * Drain RX/TX bufrings and interrupts.
4077 	 */
4078 	nsubch = sc->hn_rx_ring_inuse - 1;
4079 	if (nsubch > 0)
4080 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4081 
4082 	if (subch != NULL) {
4083 		for (i = 0; i < nsubch; ++i)
4084 			hn_chan_drain(subch[i]);
4085 	}
4086 	hn_chan_drain(sc->hn_prichan);
4087 
4088 	if (subch != NULL)
4089 		vmbus_subchan_rel(subch, nsubch);
4090 }
4091 
4092 static void
4093 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4094 {
4095 
4096 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4097 }
4098 
4099 static void
4100 hn_suspend_mgmt(struct hn_softc *sc)
4101 {
4102 	struct task task;
4103 
4104 	HN_LOCK_ASSERT(sc);
4105 
4106 	/*
4107 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4108 	 * through hn_mgmt_taskq.
4109 	 */
4110 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4111 	vmbus_chan_run_task(sc->hn_prichan, &task);
4112 
4113 	/*
4114 	 * Make sure that all pending management tasks are completed.
4115 	 */
4116 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4117 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4118 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
4119 }
4120 
4121 static void
4122 hn_suspend(struct hn_softc *sc)
4123 {
4124 
4125 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4126 		hn_suspend_data(sc);
4127 	hn_suspend_mgmt(sc);
4128 }
4129 
4130 static void
4131 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4132 {
4133 	int i;
4134 
4135 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4136 	    ("invalid TX ring count %d", tx_ring_cnt));
4137 
4138 	for (i = 0; i < tx_ring_cnt; ++i) {
4139 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4140 
4141 		mtx_lock(&txr->hn_tx_lock);
4142 		txr->hn_suspended = 0;
4143 		mtx_unlock(&txr->hn_tx_lock);
4144 	}
4145 }
4146 
4147 static void
4148 hn_resume_data(struct hn_softc *sc)
4149 {
4150 	int i;
4151 
4152 	HN_LOCK_ASSERT(sc);
4153 
4154 	/*
4155 	 * Re-enable RX.
4156 	 */
4157 	hn_set_rxfilter(sc);
4158 
4159 	/*
4160 	 * Make sure to clear suspend status on "all" TX rings,
4161 	 * since hn_tx_ring_inuse can be changed after
4162 	 * hn_suspend_data().
4163 	 */
4164 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4165 
4166 #ifdef HN_IFSTART_SUPPORT
4167 	if (!hn_use_if_start)
4168 #endif
4169 	{
4170 		/*
4171 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
4172 		 * reduced.
4173 		 */
4174 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4175 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4176 	}
4177 
4178 	/*
4179 	 * Kick start TX.
4180 	 */
4181 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4182 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4183 
4184 		/*
4185 		 * Use txeof task, so that any pending oactive can be
4186 		 * cleared properly.
4187 		 */
4188 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4189 	}
4190 }
4191 
4192 static void
4193 hn_resume_mgmt(struct hn_softc *sc)
4194 {
4195 
4196 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4197 
4198 	/*
4199 	 * Kick off network change detection, if it was pending.
4200 	 * If no network change was pending, start a link status
4201 	 * check, which is more lightweight than network change
4202 	 * detection.
4203 	 */
4204 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4205 		hn_change_network(sc);
4206 	else
4207 		hn_update_link_status(sc);
4208 }
4209 
4210 static void
4211 hn_resume(struct hn_softc *sc)
4212 {
4213 
4214 	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4215 		hn_resume_data(sc);
4216 	hn_resume_mgmt(sc);
4217 }
4218 
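/* Handle an RNDIS status message, e.g. media and network changes. */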
4219 static void
4220 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4221 {
4222 	const struct rndis_status_msg *msg;
4223 	int ofs;
4224 
4225 	if (dlen < sizeof(*msg)) {
4226 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4227 		return;
4228 	}
4229 	msg = data;
4230 
4231 	switch (msg->rm_status) {
4232 	case RNDIS_STATUS_MEDIA_CONNECT:
4233 	case RNDIS_STATUS_MEDIA_DISCONNECT:
4234 		hn_update_link_status(sc);
4235 		break;
4236 
4237 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4238 		/* Not really useful; ignore. */
4239 		break;
4240 
4241 	case RNDIS_STATUS_NETWORK_CHANGE:
4242 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4243 		if (dlen < ofs + msg->rm_stbuflen ||
4244 		    msg->rm_stbuflen < sizeof(uint32_t)) {
4245 			if_printf(sc->hn_ifp, "network changed\n");
4246 		} else {
4247 			uint32_t change;
4248 
4249 			memcpy(&change, ((const uint8_t *)msg) + ofs,
4250 			    sizeof(change));
4251 			if_printf(sc->hn_ifp, "network changed, change %u\n",
4252 			    change);
4253 		}
4254 		hn_change_network(sc);
4255 		break;
4256 
4257 	default:
4258 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
4259 		    msg->rm_status);
4260 		break;
4261 	}
4262 }
4263 
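/*
 * Walk the RNDIS per-packet-info elements and extract the VLAN,
 * checksum and hash information into 'info'.
 */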
4264 static int
4265 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4266 {
4267 	const struct rndis_pktinfo *pi = info_data;
4268 	uint32_t mask = 0;
4269 
4270 	while (info_dlen != 0) {
4271 		const void *data;
4272 		uint32_t dlen;
4273 
4274 		if (__predict_false(info_dlen < sizeof(*pi)))
4275 			return (EINVAL);
4276 		if (__predict_false(info_dlen < pi->rm_size))
4277 			return (EINVAL);
4278 		info_dlen -= pi->rm_size;
4279 
4280 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4281 			return (EINVAL);
4282 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4283 			return (EINVAL);
4284 		dlen = pi->rm_size - pi->rm_pktinfooffset;
4285 		data = pi->rm_data;
4286 
4287 		switch (pi->rm_type) {
4288 		case NDIS_PKTINFO_TYPE_VLAN:
4289 			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4290 				return (EINVAL);
4291 			info->vlan_info = *((const uint32_t *)data);
4292 			mask |= HN_RXINFO_VLAN;
4293 			break;
4294 
4295 		case NDIS_PKTINFO_TYPE_CSUM:
4296 			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4297 				return (EINVAL);
4298 			info->csum_info = *((const uint32_t *)data);
4299 			mask |= HN_RXINFO_CSUM;
4300 			break;
4301 
4302 		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4303 			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4304 				return (EINVAL);
4305 			info->hash_value = *((const uint32_t *)data);
4306 			mask |= HN_RXINFO_HASHVAL;
4307 			break;
4308 
4309 		case HN_NDIS_PKTINFO_TYPE_HASHINF:
4310 			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4311 				return (EINVAL);
4312 			info->hash_info = *((const uint32_t *)data);
4313 			mask |= HN_RXINFO_HASHINF;
4314 			break;
4315 
4316 		default:
4317 			goto next;
4318 		}
4319 
4320 		if (mask == HN_RXINFO_ALL) {
4321 			/* All found; done */
4322 			break;
4323 		}
4324 next:
4325 		pi = (const struct rndis_pktinfo *)
4326 		    ((const uint8_t *)pi + pi->rm_size);
4327 	}
4328 
4329 	/*
4330 	 * Final fixup.
4331 	 * - If there is no hash value, invalidate the hash info.
4332 	 */
4333 	if ((mask & HN_RXINFO_HASHVAL) == 0)
4334 		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
4335 	return (0);
4336 }
4337 
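/*
 * Return true if [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */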
4338 static __inline bool
4339 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4340 {
4341 
4342 	if (off < check_off) {
4343 		if (__predict_true(off + len <= check_off))
4344 			return (false);
4345 	} else if (off > check_off) {
4346 		if (__predict_true(check_off + check_len <= off))
4347 			return (false);
4348 	}
4349 	return (true);
4350 }
4351 
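/*
 * Validate an RNDIS data message, i.e. its length, offsets and the
 * OOB/pktinfo coverage, then pass the data and the extracted
 * per-packet-info on to hn_rxpkt().
 */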
4352 static void
4353 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
4354 {
4355 	const struct rndis_packet_msg *pkt;
4356 	struct hn_rxinfo info;
4357 	int data_off, pktinfo_off, data_len, pktinfo_len;
4358 
4359 	/*
4360 	 * Check length.
4361 	 */
4362 	if (__predict_false(dlen < sizeof(*pkt))) {
4363 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
4364 		return;
4365 	}
4366 	pkt = data;
4367 
4368 	if (__predict_false(dlen < pkt->rm_len)) {
4369 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
4370 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
4371 		return;
4372 	}
4373 	if (__predict_false(pkt->rm_len <
4374 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
4375 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
4376 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
4377 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
4378 		    pkt->rm_pktinfolen);
4379 		return;
4380 	}
4381 	if (__predict_false(pkt->rm_datalen == 0)) {
4382 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
4383 		return;
4384 	}
4385 
4386 	/*
4387 	 * Check offsets.
4388 	 */
4389 #define IS_OFFSET_INVALID(ofs)			\
4390 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
4391 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
4392 
4393 	/* XXX Hyper-V does not meet data offset alignment requirement */
4394 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
4395 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4396 		    "data offset %u\n", pkt->rm_dataoffset);
4397 		return;
4398 	}
4399 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
4400 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
4401 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4402 		    "oob offset %u\n", pkt->rm_oobdataoffset);
4403 		return;
4404 	}
4405 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
4406 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
4407 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4408 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
4409 		return;
4410 	}
4411 
4412 #undef IS_OFFSET_INVALID
4413 
4414 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
4415 	data_len = pkt->rm_datalen;
4416 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
4417 	pktinfo_len = pkt->rm_pktinfolen;
4418 
4419 	/*
4420 	 * Check OOB coverage.
4421 	 */
4422 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
4423 		int oob_off, oob_len;
4424 
4425 		if_printf(rxr->hn_ifp, "got oobdata\n");
4426 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
4427 		oob_len = pkt->rm_oobdatalen;
4428 
4429 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
4430 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4431 			    "oob overflow, msglen %u, oob abs %d len %d\n",
4432 			    pkt->rm_len, oob_off, oob_len);
4433 			return;
4434 		}
4435 
4436 		/*
4437 		 * Check against data.
4438 		 */
4439 		if (hn_rndis_check_overlap(oob_off, oob_len,
4440 		    data_off, data_len)) {
4441 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4442 			    "oob overlaps data, oob abs %d len %d, "
4443 			    "data abs %d len %d\n",
4444 			    oob_off, oob_len, data_off, data_len);
4445 			return;
4446 		}
4447 
4448 		/*
4449 		 * Check against pktinfo.
4450 		 */
4451 		if (pktinfo_len != 0 &&
4452 		    hn_rndis_check_overlap(oob_off, oob_len,
4453 		    pktinfo_off, pktinfo_len)) {
4454 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4455 			    "oob overlaps pktinfo, oob abs %d len %d, "
4456 			    "pktinfo abs %d len %d\n",
4457 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
4458 			return;
4459 		}
4460 	}
4461 
4462 	/*
4463 	 * Check per-packet-info coverage and find useful per-packet-info.
4464 	 */
4465 	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
4466 	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
4467 	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
4468 	if (__predict_true(pktinfo_len != 0)) {
4469 		bool overlap;
4470 		int error;
4471 
4472 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
4473 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4474 			    "pktinfo overflow, msglen %u, "
4475 			    "pktinfo abs %d len %d\n",
4476 			    pkt->rm_len, pktinfo_off, pktinfo_len);
4477 			return;
4478 		}
4479 
4480 		/*
4481 		 * Check packet info coverage.
4482 		 */
4483 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
4484 		    data_off, data_len);
4485 		if (__predict_false(overlap)) {
4486 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4487 			    "pktinfo overlaps data, pktinfo abs %d len %d, "
4488 			    "data abs %d len %d\n",
4489 			    pktinfo_off, pktinfo_len, data_off, data_len);
4490 			return;
4491 		}
4492 
4493 		/*
4494 		 * Find useful per-packet-info.
4495 		 */
4496 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
4497 		    pktinfo_len, &info);
4498 		if (__predict_false(error)) {
4499 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
4500 			    "pktinfo\n");
4501 			return;
4502 		}
4503 	}
4504 
4505 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
4506 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4507 		    "data overflow, msglen %u, data abs %d len %d\n",
4508 		    pkt->rm_len, data_off, data_len);
4509 		return;
4510 	}
4511 	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
4512 }
4513 
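/* Dispatch an inbound RNDIS message according to its type. */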
4514 static __inline void
4515 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
4516 {
4517 	const struct rndis_msghdr *hdr;
4518 
4519 	if (__predict_false(dlen < sizeof(*hdr))) {
4520 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
4521 		return;
4522 	}
4523 	hdr = data;
4524 
4525 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
4526 		/* Hot data path. */
4527 		hn_rndis_rx_data(rxr, data, dlen);
4528 		/* Done! */
4529 		return;
4530 	}
4531 
4532 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
4533 		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
4534 	else
4535 		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
4536 }
4537 
4538 static void
4539 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
4540 {
4541 	const struct hn_nvs_hdr *hdr;
4542 
4543 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
4544 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
4545 		return;
4546 	}
4547 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
4548 
4549 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
4550 		/* Useless; ignore */
4551 		return;
4552 	}
4553 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
4554 }
4555 
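/*
 * Handle an NVS completion packet by invoking the callback of the
 * send context recorded in the transaction id.
 */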
4556 static void
4557 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
4558     const struct vmbus_chanpkt_hdr *pkt)
4559 {
4560 	struct hn_nvs_sendctx *sndc;
4561 
4562 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
4563 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
4564 	    VMBUS_CHANPKT_DATALEN(pkt));
4565 	/*
4566 	 * NOTE:
4567 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
4568 	 * its callback.
4569 	 */
4570 }
4571 
4572 static void
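/*
 * Handle an NVS RNDIS message delivered through the RX buffer:
 * validate the channel packet, process each RNDIS packet it points
 * to, then ack the consumed RX buffer region.
 */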
4573 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4574     const struct vmbus_chanpkt_hdr *pkthdr)
4575 {
4576 	const struct vmbus_chanpkt_rxbuf *pkt;
4577 	const struct hn_nvs_hdr *nvs_hdr;
4578 	int count, i, hlen;
4579 
4580 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
4581 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
4582 		return;
4583 	}
4584 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
4585 
4586 	/* Make sure that this is a RNDIS message. */
4587 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
4588 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
4589 		    nvs_hdr->nvs_type);
4590 		return;
4591 	}
4592 
4593 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
4594 	if (__predict_false(hlen < sizeof(*pkt))) {
4595 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
4596 		return;
4597 	}
4598 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
4599 
4600 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
4601 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
4602 		    pkt->cp_rxbuf_id);
4603 		return;
4604 	}
4605 
4606 	count = pkt->cp_rxbuf_cnt;
4607 	if (__predict_false(hlen <
4608 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
4609 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
4610 		return;
4611 	}
4612 
4613 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
4614 	for (i = 0; i < count; ++i) {
4615 		int ofs, len;
4616 
4617 		ofs = pkt->cp_rxbuf[i].rb_ofs;
4618 		len = pkt->cp_rxbuf[i].rb_len;
4619 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
4620 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
4621 			    "ofs %d, len %d\n", i, ofs, len);
4622 			continue;
4623 		}
4624 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
4625 	}
4626 
4627 	/*
4628 	 * Ack the consumed RXBUF associated w/ this channel packet,
4629 	 * so that this RXBUF can be recycled by the hypervisor.
4630 	 */
4631 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
4632 }
4633 
4634 static void
4635 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
4636     uint64_t tid)
4637 {
4638 	struct hn_nvs_rndis_ack ack;
4639 	int retries, error;
4640 
4641 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
4642 	ack.nvs_status = HN_NVS_STATUS_OK;
4643 
4644 	retries = 0;
4645 again:
4646 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
4647 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
4648 	if (__predict_false(error == EAGAIN)) {
4649 		/*
4650 		 * NOTE:
4651 		 * This should _not_ happen in real world, since the
4652 		 * consumption of the TX bufring from the TX path is
4653 		 * controlled.
4654 		 */
4655 		if (rxr->hn_ack_failed == 0)
4656 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
4657 		rxr->hn_ack_failed++;
4658 		retries++;
4659 		if (retries < 10) {
4660 			DELAY(100);
4661 			goto again;
4662 		}
4663 		/* RXBUF leaks! */
4664 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
4665 	}
4666 }
4667 
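/*
 * Channel callback: receive channel packets, expanding the packet
 * buffer as needed, and dispatch them according to their type.
 */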
4668 static void
4669 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
4670 {
4671 	struct hn_rx_ring *rxr = xrxr;
4672 	struct hn_softc *sc = rxr->hn_ifp->if_softc;
4673 
4674 	for (;;) {
4675 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
4676 		int error, pktlen;
4677 
4678 		pktlen = rxr->hn_pktbuf_len;
4679 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
4680 		if (__predict_false(error == ENOBUFS)) {
4681 			void *nbuf;
4682 			int nlen;
4683 
4684 			/*
4685 			 * Expand channel packet buffer.
4686 			 *
4687 			 * XXX
4688 			 * Use M_WAITOK here, since allocation failure
4689 			 * is fatal.
4690 			 */
4691 			nlen = rxr->hn_pktbuf_len * 2;
4692 			while (nlen < pktlen)
4693 				nlen *= 2;
4694 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
4695 
4696 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
4697 			    rxr->hn_pktbuf_len, nlen);
4698 
4699 			free(rxr->hn_pktbuf, M_DEVBUF);
4700 			rxr->hn_pktbuf = nbuf;
4701 			rxr->hn_pktbuf_len = nlen;
4702 			/* Retry! */
4703 			continue;
4704 		} else if (__predict_false(error == EAGAIN)) {
4705 			/* No more channel packets; done! */
4706 			break;
4707 		}
4708 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
4709 
4710 		switch (pkt->cph_type) {
4711 		case VMBUS_CHANPKT_TYPE_COMP:
4712 			hn_nvs_handle_comp(sc, chan, pkt);
4713 			break;
4714 
4715 		case VMBUS_CHANPKT_TYPE_RXBUF:
4716 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
4717 			break;
4718 
4719 		case VMBUS_CHANPKT_TYPE_INBAND:
4720 			hn_nvs_handle_notify(sc, pkt);
4721 			break;
4722 
4723 		default:
4724 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
4725 			    pkt->cph_type);
4726 			break;
4727 		}
4728 	}
4729 	hn_chan_rollup(rxr, rxr->hn_txr);
4730 }
4731 
4732 static void
4733 hn_tx_taskq_create(void *arg __unused)
4734 {
4735 
4736 	if (vm_guest != VM_GUEST_HV)
4737 		return;
4738 
4739 	if (!hn_share_tx_taskq)
4740 		return;
4741 
4742 	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
4743 	    taskqueue_thread_enqueue, &hn_tx_taskq);
4744 	if (hn_bind_tx_taskq >= 0) {
4745 		int cpu = hn_bind_tx_taskq;
4746 		cpuset_t cpu_set;
4747 
4748 		if (cpu > mp_ncpus - 1)
4749 			cpu = mp_ncpus - 1;
4750 		CPU_SETOF(cpu, &cpu_set);
4751 		taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET,
4752 		    &cpu_set, "hn tx");
4753 	} else {
4754 		taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
4755 	}
4756 }
4757 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4758     hn_tx_taskq_create, NULL);
4759 
4760 static void
4761 hn_tx_taskq_destroy(void *arg __unused)
4762 {
4763 
4764 	if (hn_tx_taskq != NULL)
4765 		taskqueue_free(hn_tx_taskq);
4766 }
4767 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
4768     hn_tx_taskq_destroy, NULL);
4769