/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
#ifndef RSS
static int			hn_rss_reconfig(struct hn_softc *);
#endif
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;	/* shared TX taskqueues */

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
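
/*
 * Chimney (send buffer) slot management.
 *
 * hn_chim_alloc() scans the per-softc bitmap (hn_chim_bmap) one long at
 * a time: ffsl() locates a clear bit in the scanned word and the slot is
 * claimed with an atomic test-and-set, so no TX lock is required.  It
 * returns HN_NVS_CHIM_IDX_INVALID when all slots are in use, in which
 * case the caller falls back to the SG-list TX path.  hn_chim_free()
 * simply clears the corresponding bit.
 */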
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, the m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
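
/*
 * Recompute the per-TX-ring aggregation limits (hn_agg_szmax,
 * hn_agg_pktmax and hn_agg_align) from the administrative tunables
 * (hn_agg_size/hn_agg_pkts, -1 means "no administrative limit") and
 * the limits offered by RNDIS.  Aggregation only uses chimney sending
 * buffers, so the size is also clamped to hn_chim_szmax; aggregation is
 * disabled altogether when the resulting limits are too small to be
 * useful or when the alignment does not fit the per-ring 'short' field.
 */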
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
#endif	/* !RSS */
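
/*
 * Make sure every entry in the RSS indirection table references a
 * channel that is actually in use; any entry that points past
 * hn_rx_ring_inuse is clamped to the last usable channel.
 */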
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */
	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuffs have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
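
/*
 * Drop one reference on a TX descriptor.  Only the last reference
 * actually releases the descriptor: any aggregated descriptors are
 * freed first, then the chimney slot or the loaded DMA map and the
 * attached mbuf are released, and finally the descriptor is returned
 * to the per-ring free list (or buf_ring).  Returns 1 if the
 * descriptor was freed, 0 if references remain.
 */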
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
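
/*
 * NVS send-completion callback: drop the send path's reference on the
 * descriptor and, once HN_EARLY_TXEOF_THRESH completions have piled up
 * while the ring is marked oactive, run TX-eof processing early instead
 * of waiting for the channel rollup.
 */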
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	tcp_lro_flush_all(&rxr->hn_lro);
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
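
/*
 * Try to transmit this packet through a chimney sending buffer.  If an
 * aggregation is in progress and the packet still fits, the new RNDIS
 * packet message is placed right after the previous one and the txdesc
 * is linked to the aggregating txdesc; otherwise the pending aggregation
 * is flushed and a fresh chimney slot is claimed (possibly starting a
 * new aggregation).  Returns a pointer into the chimney buffer where the
 * RNDIS packet message should be built, or NULL if no chimney slot is
 * available and the caller must fall back to the SG-list path.
 */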
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}

/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
#ifdef INVARIANTS
			*m_head0 = NULL;
#endif
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
1908 */ 1909 m_freem(m_head); 1910 *m_head0 = NULL; 1911 1912 freed = hn_txdesc_put(txr, txd); 1913 KASSERT(freed != 0, 1914 ("fail to free txd upon txdma error")); 1915 1916 txr->hn_txdma_failed++; 1917 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 1918 return error; 1919 } 1920 *m_head0 = m_head; 1921 1922 /* +1 RNDIS packet message */ 1923 txr->hn_gpa_cnt = nsegs + 1; 1924 1925 /* send packet with page buffer */ 1926 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 1927 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 1928 txr->hn_gpa[0].gpa_len = pkt_hlen; 1929 1930 /* 1931 * Fill the page buffers with mbuf info after the page 1932 * buffer for RNDIS packet message. 1933 */ 1934 for (i = 0; i < nsegs; ++i) { 1935 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 1936 1937 gpa->gpa_page = atop(segs[i].ds_addr); 1938 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 1939 gpa->gpa_len = segs[i].ds_len; 1940 } 1941 1942 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 1943 txd->chim_size = 0; 1944 txr->hn_sendpkt = hn_txpkt_sglist; 1945 done: 1946 txd->m = m_head; 1947 1948 /* Set the completion routine */ 1949 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 1950 1951 /* Update temporary stats for later use. */ 1952 txr->hn_stat_pkts++; 1953 txr->hn_stat_size += m_head->m_pkthdr.len; 1954 if (m_head->m_flags & M_MCAST) 1955 txr->hn_stat_mcasts++; 1956 1957 return 0; 1958 } 1959 1960 /* 1961 * NOTE: 1962 * If this function fails, then txd will be freed, but the mbuf 1963 * associated w/ the txd will _not_ be freed. 1964 */ 1965 static int 1966 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 1967 { 1968 int error, send_failed = 0, has_bpf; 1969 1970 again: 1971 has_bpf = bpf_peers_present(ifp->if_bpf); 1972 if (has_bpf) { 1973 /* 1974 * Make sure that this txd and any aggregated txds are not 1975 * freed before ETHER_BPF_MTAP. 1976 */ 1977 hn_txdesc_hold(txd); 1978 } 1979 error = txr->hn_sendpkt(txr, txd); 1980 if (!error) { 1981 if (has_bpf) { 1982 const struct hn_txdesc *tmp_txd; 1983 1984 ETHER_BPF_MTAP(ifp, txd->m); 1985 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 1986 ETHER_BPF_MTAP(ifp, tmp_txd->m); 1987 } 1988 1989 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 1990 #ifdef HN_IFSTART_SUPPORT 1991 if (!hn_use_if_start) 1992 #endif 1993 { 1994 if_inc_counter(ifp, IFCOUNTER_OBYTES, 1995 txr->hn_stat_size); 1996 if (txr->hn_stat_mcasts != 0) { 1997 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1998 txr->hn_stat_mcasts); 1999 } 2000 } 2001 txr->hn_pkts += txr->hn_stat_pkts; 2002 txr->hn_sends++; 2003 } 2004 if (has_bpf) 2005 hn_txdesc_put(txr, txd); 2006 2007 if (__predict_false(error)) { 2008 int freed; 2009 2010 /* 2011 * This should "really rarely" happen. 2012 * 2013 * XXX Too many RX to be acked or too many sideband 2014 * commands to run? Ask netvsc_channel_rollup() 2015 * to kick start later. 2016 */ 2017 txr->hn_has_txeof = 1; 2018 if (!send_failed) { 2019 txr->hn_send_failed++; 2020 send_failed = 1; 2021 /* 2022 * Try sending again after set hn_has_txeof; 2023 * in case that we missed the last 2024 * netvsc_channel_rollup(). 2025 */ 2026 goto again; 2027 } 2028 if_printf(ifp, "send failed\n"); 2029 2030 /* 2031 * Caller will perform further processing on the 2032 * associated mbuf, so don't free it in hn_txdesc_put(); 2033 * only unload it from the DMA map in hn_txdesc_put(), 2034 * if it was loaded. 
2035 */ 2036 txd->m = NULL; 2037 freed = hn_txdesc_put(txr, txd); 2038 KASSERT(freed != 0, 2039 ("fail to free txd upon send error")); 2040 2041 txr->hn_send_failed++; 2042 } 2043 2044 /* Reset temporary stats, after this sending is done. */ 2045 txr->hn_stat_size = 0; 2046 txr->hn_stat_pkts = 0; 2047 txr->hn_stat_mcasts = 0; 2048 2049 return (error); 2050 } 2051 2052 /* 2053 * Append the specified data to the indicated mbuf chain, 2054 * Extend the mbuf chain if the new data does not fit in 2055 * existing space. 2056 * 2057 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2058 * There should be an equivalent in the kernel mbuf code, 2059 * but there does not appear to be one yet. 2060 * 2061 * Differs from m_append() in that additional mbufs are 2062 * allocated with cluster size MJUMPAGESIZE, and filled 2063 * accordingly. 2064 * 2065 * Return 1 if able to complete the job; otherwise 0. 2066 */ 2067 static int 2068 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2069 { 2070 struct mbuf *m, *n; 2071 int remainder, space; 2072 2073 for (m = m0; m->m_next != NULL; m = m->m_next) 2074 ; 2075 remainder = len; 2076 space = M_TRAILINGSPACE(m); 2077 if (space > 0) { 2078 /* 2079 * Copy into available space. 2080 */ 2081 if (space > remainder) 2082 space = remainder; 2083 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2084 m->m_len += space; 2085 cp += space; 2086 remainder -= space; 2087 } 2088 while (remainder > 0) { 2089 /* 2090 * Allocate a new mbuf; could check space 2091 * and allocate a cluster instead. 2092 */ 2093 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2094 if (n == NULL) 2095 break; 2096 n->m_len = min(MJUMPAGESIZE, remainder); 2097 bcopy(cp, mtod(n, caddr_t), n->m_len); 2098 cp += n->m_len; 2099 remainder -= n->m_len; 2100 m->m_next = n; 2101 m = n; 2102 } 2103 if (m0->m_flags & M_PKTHDR) 2104 m0->m_pkthdr.len += len - remainder; 2105 2106 return (remainder == 0); 2107 } 2108 2109 #if defined(INET) || defined(INET6) 2110 static __inline int 2111 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2112 { 2113 #if __FreeBSD_version >= 1100095 2114 if (hn_lro_mbufq_depth) { 2115 tcp_lro_queue_mbuf(lc, m); 2116 return 0; 2117 } 2118 #endif 2119 return tcp_lro_rx(lc, m, 0); 2120 } 2121 #endif 2122 2123 static int 2124 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2125 const struct hn_rxinfo *info) 2126 { 2127 struct ifnet *ifp = rxr->hn_ifp; 2128 struct mbuf *m_new; 2129 int size, do_lro = 0, do_csum = 1; 2130 int hash_type; 2131 2132 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) 2133 return (0); 2134 2135 /* 2136 * Bail out if packet contains more data than configured MTU. 2137 */ 2138 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) { 2139 return (0); 2140 } else if (dlen <= MHLEN) { 2141 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2142 if (m_new == NULL) { 2143 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2144 return (0); 2145 } 2146 memcpy(mtod(m_new, void *), data, dlen); 2147 m_new->m_pkthdr.len = m_new->m_len = dlen; 2148 rxr->hn_small_pkts++; 2149 } else { 2150 /* 2151 * Get an mbuf with a cluster. For packets 2K or less, 2152 * get a standard 2K cluster. For anything larger, get a 2153 * 4K cluster. Any buffers larger than 4K can cause problems 2154 * if looped around to the Hyper-V TX channel, so avoid them. 
2155 */ 2156 size = MCLBYTES; 2157 if (dlen > MCLBYTES) { 2158 /* 4096 */ 2159 size = MJUMPAGESIZE; 2160 } 2161 2162 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 2163 if (m_new == NULL) { 2164 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 2165 return (0); 2166 } 2167 2168 hv_m_append(m_new, dlen, data); 2169 } 2170 m_new->m_pkthdr.rcvif = ifp; 2171 2172 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 2173 do_csum = 0; 2174 2175 /* receive side checksum offload */ 2176 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 2177 /* IP csum offload */ 2178 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 2179 m_new->m_pkthdr.csum_flags |= 2180 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2181 rxr->hn_csum_ip++; 2182 } 2183 2184 /* TCP/UDP csum offload */ 2185 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 2186 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 2187 m_new->m_pkthdr.csum_flags |= 2188 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2189 m_new->m_pkthdr.csum_data = 0xffff; 2190 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 2191 rxr->hn_csum_tcp++; 2192 else 2193 rxr->hn_csum_udp++; 2194 } 2195 2196 /* 2197 * XXX 2198 * As of this write (Oct 28th, 2016), host side will turn 2199 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 2200 * the do_lro setting here is actually _not_ accurate. We 2201 * depend on the RSS hash type check to reset do_lro. 2202 */ 2203 if ((info->csum_info & 2204 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 2205 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 2206 do_lro = 1; 2207 } else { 2208 const struct ether_header *eh; 2209 uint16_t etype; 2210 int hoff; 2211 2212 hoff = sizeof(*eh); 2213 if (m_new->m_len < hoff) 2214 goto skip; 2215 eh = mtod(m_new, struct ether_header *); 2216 etype = ntohs(eh->ether_type); 2217 if (etype == ETHERTYPE_VLAN) { 2218 const struct ether_vlan_header *evl; 2219 2220 hoff = sizeof(*evl); 2221 if (m_new->m_len < hoff) 2222 goto skip; 2223 evl = mtod(m_new, struct ether_vlan_header *); 2224 etype = ntohs(evl->evl_proto); 2225 } 2226 2227 if (etype == ETHERTYPE_IP) { 2228 int pr; 2229 2230 pr = hn_check_iplen(m_new, hoff); 2231 if (pr == IPPROTO_TCP) { 2232 if (do_csum && 2233 (rxr->hn_trust_hcsum & 2234 HN_TRUST_HCSUM_TCP)) { 2235 rxr->hn_csum_trusted++; 2236 m_new->m_pkthdr.csum_flags |= 2237 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2238 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2239 m_new->m_pkthdr.csum_data = 0xffff; 2240 } 2241 do_lro = 1; 2242 } else if (pr == IPPROTO_UDP) { 2243 if (do_csum && 2244 (rxr->hn_trust_hcsum & 2245 HN_TRUST_HCSUM_UDP)) { 2246 rxr->hn_csum_trusted++; 2247 m_new->m_pkthdr.csum_flags |= 2248 (CSUM_IP_CHECKED | CSUM_IP_VALID | 2249 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 2250 m_new->m_pkthdr.csum_data = 0xffff; 2251 } 2252 } else if (pr != IPPROTO_DONE && do_csum && 2253 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 2254 rxr->hn_csum_trusted++; 2255 m_new->m_pkthdr.csum_flags |= 2256 (CSUM_IP_CHECKED | CSUM_IP_VALID); 2257 } 2258 } 2259 } 2260 skip: 2261 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 2262 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 2263 NDIS_VLAN_INFO_ID(info->vlan_info), 2264 NDIS_VLAN_INFO_PRI(info->vlan_info), 2265 NDIS_VLAN_INFO_CFI(info->vlan_info)); 2266 m_new->m_flags |= M_VLANTAG; 2267 } 2268 2269 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 2270 rxr->hn_rss_pkts++; 2271 m_new->m_pkthdr.flowid = info->hash_value; 2272 hash_type = M_HASHTYPE_OPAQUE_HASH; 2273 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 2274 
NDIS_HASH_FUNCTION_TOEPLITZ) { 2275 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 2276 2277 /* 2278 * NOTE: 2279 * do_lro is resetted, if the hash types are not TCP 2280 * related. See the comment in the above csum_flags 2281 * setup section. 2282 */ 2283 switch (type) { 2284 case NDIS_HASH_IPV4: 2285 hash_type = M_HASHTYPE_RSS_IPV4; 2286 do_lro = 0; 2287 break; 2288 2289 case NDIS_HASH_TCP_IPV4: 2290 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 2291 break; 2292 2293 case NDIS_HASH_IPV6: 2294 hash_type = M_HASHTYPE_RSS_IPV6; 2295 do_lro = 0; 2296 break; 2297 2298 case NDIS_HASH_IPV6_EX: 2299 hash_type = M_HASHTYPE_RSS_IPV6_EX; 2300 do_lro = 0; 2301 break; 2302 2303 case NDIS_HASH_TCP_IPV6: 2304 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 2305 break; 2306 2307 case NDIS_HASH_TCP_IPV6_EX: 2308 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 2309 break; 2310 } 2311 } 2312 } else { 2313 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 2314 hash_type = M_HASHTYPE_OPAQUE; 2315 } 2316 M_HASHTYPE_SET(m_new, hash_type); 2317 2318 /* 2319 * Note: Moved RX completion back to hv_nv_on_receive() so all 2320 * messages (not just data messages) will trigger a response. 2321 */ 2322 2323 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 2324 rxr->hn_pkts++; 2325 2326 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 2327 #if defined(INET) || defined(INET6) 2328 struct lro_ctrl *lro = &rxr->hn_lro; 2329 2330 if (lro->lro_cnt) { 2331 rxr->hn_lro_tried++; 2332 if (hn_lro_rx(lro, m_new) == 0) { 2333 /* DONE! */ 2334 return 0; 2335 } 2336 } 2337 #endif 2338 } 2339 2340 /* We're not holding the lock here, so don't release it */ 2341 (*ifp->if_input)(ifp, m_new); 2342 2343 return (0); 2344 } 2345 2346 static int 2347 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 2348 { 2349 struct hn_softc *sc = ifp->if_softc; 2350 struct ifreq *ifr = (struct ifreq *)data; 2351 int mask, error = 0; 2352 2353 switch (cmd) { 2354 case SIOCSIFMTU: 2355 if (ifr->ifr_mtu > HN_MTU_MAX) { 2356 error = EINVAL; 2357 break; 2358 } 2359 2360 HN_LOCK(sc); 2361 2362 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2363 HN_UNLOCK(sc); 2364 break; 2365 } 2366 2367 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 2368 /* Can't change MTU */ 2369 HN_UNLOCK(sc); 2370 error = EOPNOTSUPP; 2371 break; 2372 } 2373 2374 if (ifp->if_mtu == ifr->ifr_mtu) { 2375 HN_UNLOCK(sc); 2376 break; 2377 } 2378 2379 /* 2380 * Suspend this interface before the synthetic parts 2381 * are ripped. 2382 */ 2383 hn_suspend(sc); 2384 2385 /* 2386 * Detach the synthetics parts, i.e. NVS and RNDIS. 2387 */ 2388 hn_synth_detach(sc); 2389 2390 /* 2391 * Reattach the synthetic parts, i.e. NVS and RNDIS, 2392 * with the new MTU setting. 2393 */ 2394 error = hn_synth_attach(sc, ifr->ifr_mtu); 2395 if (error) { 2396 HN_UNLOCK(sc); 2397 break; 2398 } 2399 2400 /* 2401 * Commit the requested MTU, after the synthetic parts 2402 * have been successfully attached. 2403 */ 2404 ifp->if_mtu = ifr->ifr_mtu; 2405 2406 /* 2407 * Make sure that various parameters based on MTU are 2408 * still valid, after the MTU change. 2409 */ 2410 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 2411 hn_set_chim_size(sc, sc->hn_chim_szmax); 2412 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 2413 #if __FreeBSD_version >= 1100099 2414 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < 2415 HN_LRO_LENLIM_MIN(ifp)) 2416 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 2417 #endif 2418 2419 /* 2420 * All done! Resume the interface now. 
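		 * This undoes the hn_suspend() done before the synthetic
		 * parts were detached above.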
2421 */ 2422 hn_resume(sc); 2423 2424 HN_UNLOCK(sc); 2425 break; 2426 2427 case SIOCSIFFLAGS: 2428 HN_LOCK(sc); 2429 2430 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2431 HN_UNLOCK(sc); 2432 break; 2433 } 2434 2435 if (ifp->if_flags & IFF_UP) { 2436 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2437 /* 2438 * Caller meight hold mutex, e.g. 2439 * bpf; use busy-wait for the RNDIS 2440 * reply. 2441 */ 2442 HN_NO_SLEEPING(sc); 2443 hn_rxfilter_config(sc); 2444 HN_SLEEPING_OK(sc); 2445 } else { 2446 hn_init_locked(sc); 2447 } 2448 } else { 2449 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2450 hn_stop(sc); 2451 } 2452 sc->hn_if_flags = ifp->if_flags; 2453 2454 HN_UNLOCK(sc); 2455 break; 2456 2457 case SIOCSIFCAP: 2458 HN_LOCK(sc); 2459 mask = ifr->ifr_reqcap ^ ifp->if_capenable; 2460 2461 if (mask & IFCAP_TXCSUM) { 2462 ifp->if_capenable ^= IFCAP_TXCSUM; 2463 if (ifp->if_capenable & IFCAP_TXCSUM) 2464 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 2465 else 2466 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 2467 } 2468 if (mask & IFCAP_TXCSUM_IPV6) { 2469 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 2470 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 2471 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 2472 else 2473 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 2474 } 2475 2476 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 2477 if (mask & IFCAP_RXCSUM) 2478 ifp->if_capenable ^= IFCAP_RXCSUM; 2479 #ifdef foo 2480 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2481 if (mask & IFCAP_RXCSUM_IPV6) 2482 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 2483 #endif 2484 2485 if (mask & IFCAP_LRO) 2486 ifp->if_capenable ^= IFCAP_LRO; 2487 2488 if (mask & IFCAP_TSO4) { 2489 ifp->if_capenable ^= IFCAP_TSO4; 2490 if (ifp->if_capenable & IFCAP_TSO4) 2491 ifp->if_hwassist |= CSUM_IP_TSO; 2492 else 2493 ifp->if_hwassist &= ~CSUM_IP_TSO; 2494 } 2495 if (mask & IFCAP_TSO6) { 2496 ifp->if_capenable ^= IFCAP_TSO6; 2497 if (ifp->if_capenable & IFCAP_TSO6) 2498 ifp->if_hwassist |= CSUM_IP6_TSO; 2499 else 2500 ifp->if_hwassist &= ~CSUM_IP6_TSO; 2501 } 2502 2503 HN_UNLOCK(sc); 2504 break; 2505 2506 case SIOCADDMULTI: 2507 case SIOCDELMULTI: 2508 HN_LOCK(sc); 2509 2510 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 2511 HN_UNLOCK(sc); 2512 break; 2513 } 2514 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 2515 /* 2516 * Multicast uses mutex; use busy-wait for 2517 * the RNDIS reply. 2518 */ 2519 HN_NO_SLEEPING(sc); 2520 hn_rxfilter_config(sc); 2521 HN_SLEEPING_OK(sc); 2522 } 2523 2524 HN_UNLOCK(sc); 2525 break; 2526 2527 case SIOCSIFMEDIA: 2528 case SIOCGIFMEDIA: 2529 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 2530 break; 2531 2532 default: 2533 error = ether_ioctl(ifp, cmd, data); 2534 break; 2535 } 2536 return (error); 2537 } 2538 2539 static void 2540 hn_stop(struct hn_softc *sc) 2541 { 2542 struct ifnet *ifp = sc->hn_ifp; 2543 int i; 2544 2545 HN_LOCK_ASSERT(sc); 2546 2547 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 2548 ("synthetic parts were not attached")); 2549 2550 /* Disable polling. */ 2551 hn_polling(sc, 0); 2552 2553 /* Clear RUNNING bit _before_ hn_suspend_data() */ 2554 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2555 hn_suspend_data(sc); 2556 2557 /* Clear OACTIVE bit. 
*/ 2558 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2559 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2560 sc->hn_tx_ring[i].hn_oactive = 0; 2561 } 2562 2563 static void 2564 hn_init_locked(struct hn_softc *sc) 2565 { 2566 struct ifnet *ifp = sc->hn_ifp; 2567 int i; 2568 2569 HN_LOCK_ASSERT(sc); 2570 2571 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 2572 return; 2573 2574 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2575 return; 2576 2577 /* Configure RX filter */ 2578 hn_rxfilter_config(sc); 2579 2580 /* Clear OACTIVE bit. */ 2581 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 2582 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 2583 sc->hn_tx_ring[i].hn_oactive = 0; 2584 2585 /* Clear TX 'suspended' bit. */ 2586 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 2587 2588 /* Everything is ready; unleash! */ 2589 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 2590 2591 /* Re-enable polling if requested. */ 2592 if (sc->hn_pollhz > 0) 2593 hn_polling(sc, sc->hn_pollhz); 2594 } 2595 2596 static void 2597 hn_init(void *xsc) 2598 { 2599 struct hn_softc *sc = xsc; 2600 2601 HN_LOCK(sc); 2602 hn_init_locked(sc); 2603 HN_UNLOCK(sc); 2604 } 2605 2606 #if __FreeBSD_version >= 1100099 2607 2608 static int 2609 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 2610 { 2611 struct hn_softc *sc = arg1; 2612 unsigned int lenlim; 2613 int error; 2614 2615 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 2616 error = sysctl_handle_int(oidp, &lenlim, 0, req); 2617 if (error || req->newptr == NULL) 2618 return error; 2619 2620 HN_LOCK(sc); 2621 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 2622 lenlim > TCP_LRO_LENGTH_MAX) { 2623 HN_UNLOCK(sc); 2624 return EINVAL; 2625 } 2626 hn_set_lro_lenlim(sc, lenlim); 2627 HN_UNLOCK(sc); 2628 2629 return 0; 2630 } 2631 2632 static int 2633 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 2634 { 2635 struct hn_softc *sc = arg1; 2636 int ackcnt, error, i; 2637 2638 /* 2639 * lro_ackcnt_lim is append count limit, 2640 * +1 to turn it into aggregation limit. 2641 */ 2642 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 2643 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 2644 if (error || req->newptr == NULL) 2645 return error; 2646 2647 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 2648 return EINVAL; 2649 2650 /* 2651 * Convert aggregation limit back to append 2652 * count limit. 
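	 * (lro_ackcnt_lim counts appends, i.e. one less than the number
	 * of ACKs aggregated per LRO entry.)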
2653 */ 2654 --ackcnt; 2655 HN_LOCK(sc); 2656 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 2657 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 2658 HN_UNLOCK(sc); 2659 return 0; 2660 } 2661 2662 #endif 2663 2664 static int 2665 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 2666 { 2667 struct hn_softc *sc = arg1; 2668 int hcsum = arg2; 2669 int on, error, i; 2670 2671 on = 0; 2672 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 2673 on = 1; 2674 2675 error = sysctl_handle_int(oidp, &on, 0, req); 2676 if (error || req->newptr == NULL) 2677 return error; 2678 2679 HN_LOCK(sc); 2680 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2681 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 2682 2683 if (on) 2684 rxr->hn_trust_hcsum |= hcsum; 2685 else 2686 rxr->hn_trust_hcsum &= ~hcsum; 2687 } 2688 HN_UNLOCK(sc); 2689 return 0; 2690 } 2691 2692 static int 2693 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 2694 { 2695 struct hn_softc *sc = arg1; 2696 int chim_size, error; 2697 2698 chim_size = sc->hn_tx_ring[0].hn_chim_size; 2699 error = sysctl_handle_int(oidp, &chim_size, 0, req); 2700 if (error || req->newptr == NULL) 2701 return error; 2702 2703 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 2704 return EINVAL; 2705 2706 HN_LOCK(sc); 2707 hn_set_chim_size(sc, chim_size); 2708 HN_UNLOCK(sc); 2709 return 0; 2710 } 2711 2712 #if __FreeBSD_version < 1100095 2713 static int 2714 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 2715 { 2716 struct hn_softc *sc = arg1; 2717 int ofs = arg2, i, error; 2718 struct hn_rx_ring *rxr; 2719 uint64_t stat; 2720 2721 stat = 0; 2722 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2723 rxr = &sc->hn_rx_ring[i]; 2724 stat += *((int *)((uint8_t *)rxr + ofs)); 2725 } 2726 2727 error = sysctl_handle_64(oidp, &stat, 0, req); 2728 if (error || req->newptr == NULL) 2729 return error; 2730 2731 /* Zero out this stat. */ 2732 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2733 rxr = &sc->hn_rx_ring[i]; 2734 *((int *)((uint8_t *)rxr + ofs)) = 0; 2735 } 2736 return 0; 2737 } 2738 #else 2739 static int 2740 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 2741 { 2742 struct hn_softc *sc = arg1; 2743 int ofs = arg2, i, error; 2744 struct hn_rx_ring *rxr; 2745 uint64_t stat; 2746 2747 stat = 0; 2748 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2749 rxr = &sc->hn_rx_ring[i]; 2750 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 2751 } 2752 2753 error = sysctl_handle_64(oidp, &stat, 0, req); 2754 if (error || req->newptr == NULL) 2755 return error; 2756 2757 /* Zero out this stat. */ 2758 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2759 rxr = &sc->hn_rx_ring[i]; 2760 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 2761 } 2762 return 0; 2763 } 2764 2765 #endif 2766 2767 static int 2768 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2769 { 2770 struct hn_softc *sc = arg1; 2771 int ofs = arg2, i, error; 2772 struct hn_rx_ring *rxr; 2773 u_long stat; 2774 2775 stat = 0; 2776 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2777 rxr = &sc->hn_rx_ring[i]; 2778 stat += *((u_long *)((uint8_t *)rxr + ofs)); 2779 } 2780 2781 error = sysctl_handle_long(oidp, &stat, 0, req); 2782 if (error || req->newptr == NULL) 2783 return error; 2784 2785 /* Zero out this stat. 
*/ 2786 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 2787 rxr = &sc->hn_rx_ring[i]; 2788 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 2789 } 2790 return 0; 2791 } 2792 2793 static int 2794 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 2795 { 2796 struct hn_softc *sc = arg1; 2797 int ofs = arg2, i, error; 2798 struct hn_tx_ring *txr; 2799 u_long stat; 2800 2801 stat = 0; 2802 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2803 txr = &sc->hn_tx_ring[i]; 2804 stat += *((u_long *)((uint8_t *)txr + ofs)); 2805 } 2806 2807 error = sysctl_handle_long(oidp, &stat, 0, req); 2808 if (error || req->newptr == NULL) 2809 return error; 2810 2811 /* Zero out this stat. */ 2812 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2813 txr = &sc->hn_tx_ring[i]; 2814 *((u_long *)((uint8_t *)txr + ofs)) = 0; 2815 } 2816 return 0; 2817 } 2818 2819 static int 2820 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 2821 { 2822 struct hn_softc *sc = arg1; 2823 int ofs = arg2, i, error, conf; 2824 struct hn_tx_ring *txr; 2825 2826 txr = &sc->hn_tx_ring[0]; 2827 conf = *((int *)((uint8_t *)txr + ofs)); 2828 2829 error = sysctl_handle_int(oidp, &conf, 0, req); 2830 if (error || req->newptr == NULL) 2831 return error; 2832 2833 HN_LOCK(sc); 2834 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 2835 txr = &sc->hn_tx_ring[i]; 2836 *((int *)((uint8_t *)txr + ofs)) = conf; 2837 } 2838 HN_UNLOCK(sc); 2839 2840 return 0; 2841 } 2842 2843 static int 2844 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 2845 { 2846 struct hn_softc *sc = arg1; 2847 int error, size; 2848 2849 size = sc->hn_agg_size; 2850 error = sysctl_handle_int(oidp, &size, 0, req); 2851 if (error || req->newptr == NULL) 2852 return (error); 2853 2854 HN_LOCK(sc); 2855 sc->hn_agg_size = size; 2856 hn_set_txagg(sc); 2857 HN_UNLOCK(sc); 2858 2859 return (0); 2860 } 2861 2862 static int 2863 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 2864 { 2865 struct hn_softc *sc = arg1; 2866 int error, pkts; 2867 2868 pkts = sc->hn_agg_pkts; 2869 error = sysctl_handle_int(oidp, &pkts, 0, req); 2870 if (error || req->newptr == NULL) 2871 return (error); 2872 2873 HN_LOCK(sc); 2874 sc->hn_agg_pkts = pkts; 2875 hn_set_txagg(sc); 2876 HN_UNLOCK(sc); 2877 2878 return (0); 2879 } 2880 2881 static int 2882 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 2883 { 2884 struct hn_softc *sc = arg1; 2885 int pkts; 2886 2887 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 2888 return (sysctl_handle_int(oidp, &pkts, 0, req)); 2889 } 2890 2891 static int 2892 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 2893 { 2894 struct hn_softc *sc = arg1; 2895 int align; 2896 2897 align = sc->hn_tx_ring[0].hn_agg_align; 2898 return (sysctl_handle_int(oidp, &align, 0, req)); 2899 } 2900 2901 static void 2902 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 2903 { 2904 if (pollhz == 0) 2905 vmbus_chan_poll_disable(chan); 2906 else 2907 vmbus_chan_poll_enable(chan, pollhz); 2908 } 2909 2910 static void 2911 hn_polling(struct hn_softc *sc, u_int pollhz) 2912 { 2913 int nsubch = sc->hn_rx_ring_inuse - 1; 2914 2915 HN_LOCK_ASSERT(sc); 2916 2917 if (nsubch > 0) { 2918 struct vmbus_channel **subch; 2919 int i; 2920 2921 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 2922 for (i = 0; i < nsubch; ++i) 2923 hn_chan_polling(subch[i], pollhz); 2924 vmbus_subchan_rel(subch, nsubch); 2925 } 2926 hn_chan_polling(sc->hn_prichan, pollhz); 2927 } 2928 2929 static int 2930 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 2931 { 2932 struct hn_softc *sc = arg1; 2933 int pollhz, error; 2934 2935 pollhz = sc->hn_pollhz; 2936 error = sysctl_handle_int(oidp, &pollhz, 0, req); 2937 
if (error || req->newptr == NULL) 2938 return (error); 2939 2940 if (pollhz != 0 && 2941 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 2942 return (EINVAL); 2943 2944 HN_LOCK(sc); 2945 if (sc->hn_pollhz != pollhz) { 2946 sc->hn_pollhz = pollhz; 2947 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 2948 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 2949 hn_polling(sc, sc->hn_pollhz); 2950 } 2951 HN_UNLOCK(sc); 2952 2953 return (0); 2954 } 2955 2956 static int 2957 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 2958 { 2959 struct hn_softc *sc = arg1; 2960 char verstr[16]; 2961 2962 snprintf(verstr, sizeof(verstr), "%u.%u", 2963 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 2964 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 2965 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 2966 } 2967 2968 static int 2969 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 2970 { 2971 struct hn_softc *sc = arg1; 2972 char caps_str[128]; 2973 uint32_t caps; 2974 2975 HN_LOCK(sc); 2976 caps = sc->hn_caps; 2977 HN_UNLOCK(sc); 2978 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 2979 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 2980 } 2981 2982 static int 2983 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 2984 { 2985 struct hn_softc *sc = arg1; 2986 char assist_str[128]; 2987 uint32_t hwassist; 2988 2989 HN_LOCK(sc); 2990 hwassist = sc->hn_ifp->if_hwassist; 2991 HN_UNLOCK(sc); 2992 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 2993 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 2994 } 2995 2996 static int 2997 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 2998 { 2999 struct hn_softc *sc = arg1; 3000 char filter_str[128]; 3001 uint32_t filter; 3002 3003 HN_LOCK(sc); 3004 filter = sc->hn_rx_filter; 3005 HN_UNLOCK(sc); 3006 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3007 NDIS_PACKET_TYPES); 3008 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3009 } 3010 3011 #ifndef RSS 3012 3013 static int 3014 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3015 { 3016 struct hn_softc *sc = arg1; 3017 int error; 3018 3019 HN_LOCK(sc); 3020 3021 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3022 if (error || req->newptr == NULL) 3023 goto back; 3024 3025 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3026 if (error) 3027 goto back; 3028 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 3029 3030 if (sc->hn_rx_ring_inuse > 1) { 3031 error = hn_rss_reconfig(sc); 3032 } else { 3033 /* Not RSS capable, at least for now; just save the RSS key. */ 3034 error = 0; 3035 } 3036 back: 3037 HN_UNLOCK(sc); 3038 return (error); 3039 } 3040 3041 static int 3042 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 3043 { 3044 struct hn_softc *sc = arg1; 3045 int error; 3046 3047 HN_LOCK(sc); 3048 3049 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3050 if (error || req->newptr == NULL) 3051 goto back; 3052 3053 /* 3054 * Don't allow RSS indirect table change, if this interface is not 3055 * RSS capable currently. 
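	 * Only one RX ring is in use when RSS is inactive, hence the
	 * check below.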
3056 */ 3057 if (sc->hn_rx_ring_inuse == 1) { 3058 error = EOPNOTSUPP; 3059 goto back; 3060 } 3061 3062 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 3063 if (error) 3064 goto back; 3065 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 3066 3067 hn_rss_ind_fixup(sc); 3068 error = hn_rss_reconfig(sc); 3069 back: 3070 HN_UNLOCK(sc); 3071 return (error); 3072 } 3073 3074 #endif /* !RSS */ 3075 3076 static int 3077 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 3078 { 3079 struct hn_softc *sc = arg1; 3080 char hash_str[128]; 3081 uint32_t hash; 3082 3083 HN_LOCK(sc); 3084 hash = sc->hn_rss_hash; 3085 HN_UNLOCK(sc); 3086 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 3087 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 3088 } 3089 3090 static int 3091 hn_check_iplen(const struct mbuf *m, int hoff) 3092 { 3093 const struct ip *ip; 3094 int len, iphlen, iplen; 3095 const struct tcphdr *th; 3096 int thoff; /* TCP data offset */ 3097 3098 len = hoff + sizeof(struct ip); 3099 3100 /* The packet must be at least the size of an IP header. */ 3101 if (m->m_pkthdr.len < len) 3102 return IPPROTO_DONE; 3103 3104 /* The fixed IP header must reside completely in the first mbuf. */ 3105 if (m->m_len < len) 3106 return IPPROTO_DONE; 3107 3108 ip = mtodo(m, hoff); 3109 3110 /* Bound check the packet's stated IP header length. */ 3111 iphlen = ip->ip_hl << 2; 3112 if (iphlen < sizeof(struct ip)) /* minimum header length */ 3113 return IPPROTO_DONE; 3114 3115 /* The full IP header must reside completely in the one mbuf. */ 3116 if (m->m_len < hoff + iphlen) 3117 return IPPROTO_DONE; 3118 3119 iplen = ntohs(ip->ip_len); 3120 3121 /* 3122 * Check that the amount of data in the buffers is as 3123 * at least much as the IP header would have us expect. 3124 */ 3125 if (m->m_pkthdr.len < hoff + iplen) 3126 return IPPROTO_DONE; 3127 3128 /* 3129 * Ignore IP fragments. 3130 */ 3131 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 3132 return IPPROTO_DONE; 3133 3134 /* 3135 * The TCP/IP or UDP/IP header must be entirely contained within 3136 * the first fragment of a packet. 3137 */ 3138 switch (ip->ip_p) { 3139 case IPPROTO_TCP: 3140 if (iplen < iphlen + sizeof(struct tcphdr)) 3141 return IPPROTO_DONE; 3142 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 3143 return IPPROTO_DONE; 3144 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 3145 thoff = th->th_off << 2; 3146 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 3147 return IPPROTO_DONE; 3148 if (m->m_len < hoff + iphlen + thoff) 3149 return IPPROTO_DONE; 3150 break; 3151 case IPPROTO_UDP: 3152 if (iplen < iphlen + sizeof(struct udphdr)) 3153 return IPPROTO_DONE; 3154 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 3155 return IPPROTO_DONE; 3156 break; 3157 default: 3158 if (iplen < iphlen) 3159 return IPPROTO_DONE; 3160 break; 3161 } 3162 return ip->ip_p; 3163 } 3164 3165 static int 3166 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 3167 { 3168 struct sysctl_oid_list *child; 3169 struct sysctl_ctx_list *ctx; 3170 device_t dev = sc->hn_dev; 3171 #if defined(INET) || defined(INET6) 3172 #if __FreeBSD_version >= 1100095 3173 int lroent_cnt; 3174 #endif 3175 #endif 3176 int i; 3177 3178 /* 3179 * Create RXBUF for reception. 3180 * 3181 * NOTE: 3182 * - It is shared by all channels. 3183 * - A large enough buffer is allocated, certain version of NVSes 3184 * may further limit the usable space. 
3185 */ 3186 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3187 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 3188 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3189 if (sc->hn_rxbuf == NULL) { 3190 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 3191 return (ENOMEM); 3192 } 3193 3194 sc->hn_rx_ring_cnt = ring_cnt; 3195 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 3196 3197 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 3198 M_DEVBUF, M_WAITOK | M_ZERO); 3199 3200 #if defined(INET) || defined(INET6) 3201 #if __FreeBSD_version >= 1100095 3202 lroent_cnt = hn_lro_entry_count; 3203 if (lroent_cnt < TCP_LRO_ENTRIES) 3204 lroent_cnt = TCP_LRO_ENTRIES; 3205 if (bootverbose) 3206 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 3207 #endif 3208 #endif /* INET || INET6 */ 3209 3210 ctx = device_get_sysctl_ctx(dev); 3211 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 3212 3213 /* Create dev.hn.UNIT.rx sysctl tree */ 3214 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 3215 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3216 3217 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3218 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3219 3220 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 3221 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 3222 &rxr->hn_br_dma, BUS_DMA_WAITOK); 3223 if (rxr->hn_br == NULL) { 3224 device_printf(dev, "allocate bufring failed\n"); 3225 return (ENOMEM); 3226 } 3227 3228 if (hn_trust_hosttcp) 3229 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 3230 if (hn_trust_hostudp) 3231 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 3232 if (hn_trust_hostip) 3233 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 3234 rxr->hn_ifp = sc->hn_ifp; 3235 if (i < sc->hn_tx_ring_cnt) 3236 rxr->hn_txr = &sc->hn_tx_ring[i]; 3237 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 3238 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 3239 rxr->hn_rx_idx = i; 3240 rxr->hn_rxbuf = sc->hn_rxbuf; 3241 3242 /* 3243 * Initialize LRO. 
3244 */ 3245 #if defined(INET) || defined(INET6) 3246 #if __FreeBSD_version >= 1100095 3247 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 3248 hn_lro_mbufq_depth); 3249 #else 3250 tcp_lro_init(&rxr->hn_lro); 3251 rxr->hn_lro.ifp = sc->hn_ifp; 3252 #endif 3253 #if __FreeBSD_version >= 1100099 3254 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 3255 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 3256 #endif 3257 #endif /* INET || INET6 */ 3258 3259 if (sc->hn_rx_sysctl_tree != NULL) { 3260 char name[16]; 3261 3262 /* 3263 * Create per RX ring sysctl tree: 3264 * dev.hn.UNIT.rx.RINGID 3265 */ 3266 snprintf(name, sizeof(name), "%d", i); 3267 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 3268 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 3269 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3270 3271 if (rxr->hn_rx_sysctl_tree != NULL) { 3272 SYSCTL_ADD_ULONG(ctx, 3273 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3274 OID_AUTO, "packets", CTLFLAG_RW, 3275 &rxr->hn_pkts, "# of packets received"); 3276 SYSCTL_ADD_ULONG(ctx, 3277 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3278 OID_AUTO, "rss_pkts", CTLFLAG_RW, 3279 &rxr->hn_rss_pkts, 3280 "# of packets w/ RSS info received"); 3281 SYSCTL_ADD_INT(ctx, 3282 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 3283 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 3284 &rxr->hn_pktbuf_len, 0, 3285 "Temporary channel packet buffer length"); 3286 } 3287 } 3288 } 3289 3290 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 3291 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3292 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 3293 #if __FreeBSD_version < 1100095 3294 hn_rx_stat_int_sysctl, 3295 #else 3296 hn_rx_stat_u64_sysctl, 3297 #endif 3298 "LU", "LRO queued"); 3299 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 3300 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3301 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 3302 #if __FreeBSD_version < 1100095 3303 hn_rx_stat_int_sysctl, 3304 #else 3305 hn_rx_stat_u64_sysctl, 3306 #endif 3307 "LU", "LRO flushed"); 3308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 3309 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3310 __offsetof(struct hn_rx_ring, hn_lro_tried), 3311 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 3312 #if __FreeBSD_version >= 1100099 3313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 3314 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3315 hn_lro_lenlim_sysctl, "IU", 3316 "Max # of data bytes to be aggregated by LRO"); 3317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 3318 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3319 hn_lro_ackcnt_sysctl, "I", 3320 "Max # of ACKs to be aggregated by LRO"); 3321 #endif 3322 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 3323 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 3324 hn_trust_hcsum_sysctl, "I", 3325 "Trust tcp segement verification on host side, " 3326 "when csum info is missing"); 3327 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 3328 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 3329 hn_trust_hcsum_sysctl, "I", 3330 "Trust udp datagram verification on host side, " 3331 "when csum info is missing"); 3332 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 3333 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 3334 hn_trust_hcsum_sysctl, "I", 3335 "Trust ip packet verification on host side, " 3336 "when csum info is missing"); 3337 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 3338 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3339 
__offsetof(struct hn_rx_ring, hn_csum_ip), 3340 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 3341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 3342 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3343 __offsetof(struct hn_rx_ring, hn_csum_tcp), 3344 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 3345 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 3346 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3347 __offsetof(struct hn_rx_ring, hn_csum_udp), 3348 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 3349 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 3350 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3351 __offsetof(struct hn_rx_ring, hn_csum_trusted), 3352 hn_rx_stat_ulong_sysctl, "LU", 3353 "# of packets that we trust host's csum verification"); 3354 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 3355 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3356 __offsetof(struct hn_rx_ring, hn_small_pkts), 3357 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 3358 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 3359 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3360 __offsetof(struct hn_rx_ring, hn_ack_failed), 3361 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 3362 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 3363 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 3364 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 3365 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 3366 3367 return (0); 3368 } 3369 3370 static void 3371 hn_destroy_rx_data(struct hn_softc *sc) 3372 { 3373 int i; 3374 3375 if (sc->hn_rxbuf != NULL) { 3376 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 3377 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 3378 else 3379 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 3380 sc->hn_rxbuf = NULL; 3381 } 3382 3383 if (sc->hn_rx_ring_cnt == 0) 3384 return; 3385 3386 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3387 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3388 3389 if (rxr->hn_br == NULL) 3390 continue; 3391 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 3392 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 3393 } else { 3394 device_printf(sc->hn_dev, 3395 "%dth channel bufring is referenced", i); 3396 } 3397 rxr->hn_br = NULL; 3398 3399 #if defined(INET) || defined(INET6) 3400 tcp_lro_free(&rxr->hn_lro); 3401 #endif 3402 free(rxr->hn_pktbuf, M_DEVBUF); 3403 } 3404 free(sc->hn_rx_ring, M_DEVBUF); 3405 sc->hn_rx_ring = NULL; 3406 3407 sc->hn_rx_ring_cnt = 0; 3408 sc->hn_rx_ring_inuse = 0; 3409 } 3410 3411 static int 3412 hn_tx_ring_create(struct hn_softc *sc, int id) 3413 { 3414 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 3415 device_t dev = sc->hn_dev; 3416 bus_dma_tag_t parent_dtag; 3417 int error, i; 3418 3419 txr->hn_sc = sc; 3420 txr->hn_tx_idx = id; 3421 3422 #ifndef HN_USE_TXDESC_BUFRING 3423 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 3424 #endif 3425 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 3426 3427 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 3428 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 3429 M_DEVBUF, M_WAITOK | M_ZERO); 3430 #ifndef HN_USE_TXDESC_BUFRING 3431 SLIST_INIT(&txr->hn_txlist); 3432 #else 3433 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 3434 M_WAITOK, &txr->hn_tx_lock); 3435 #endif 3436 3437 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 3438 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 3439 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 3440 } else { 3441 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 3442 } 3443 3444 #ifdef HN_IFSTART_SUPPORT 3445 if (hn_use_if_start) { 3446 txr->hn_txeof = hn_start_txeof; 3447 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 3448 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 3449 } else 3450 #endif 3451 { 3452 int br_depth; 3453 3454 txr->hn_txeof = hn_xmit_txeof; 3455 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 3456 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 3457 3458 br_depth = hn_get_txswq_depth(txr); 3459 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 3460 M_WAITOK, &txr->hn_tx_lock); 3461 } 3462 3463 txr->hn_direct_tx_size = hn_direct_tx_size; 3464 3465 /* 3466 * Always schedule transmission instead of trying to do direct 3467 * transmission. This one gives the best performance so far. 3468 */ 3469 txr->hn_sched_tx = 1; 3470 3471 parent_dtag = bus_get_dma_tag(dev); 3472 3473 /* DMA tag for RNDIS packet messages. */ 3474 error = bus_dma_tag_create(parent_dtag, /* parent */ 3475 HN_RNDIS_PKT_ALIGN, /* alignment */ 3476 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 3477 BUS_SPACE_MAXADDR, /* lowaddr */ 3478 BUS_SPACE_MAXADDR, /* highaddr */ 3479 NULL, NULL, /* filter, filterarg */ 3480 HN_RNDIS_PKT_LEN, /* maxsize */ 3481 1, /* nsegments */ 3482 HN_RNDIS_PKT_LEN, /* maxsegsize */ 3483 0, /* flags */ 3484 NULL, /* lockfunc */ 3485 NULL, /* lockfuncarg */ 3486 &txr->hn_tx_rndis_dtag); 3487 if (error) { 3488 device_printf(dev, "failed to create rndis dmatag\n"); 3489 return error; 3490 } 3491 3492 /* DMA tag for data. */ 3493 error = bus_dma_tag_create(parent_dtag, /* parent */ 3494 1, /* alignment */ 3495 HN_TX_DATA_BOUNDARY, /* boundary */ 3496 BUS_SPACE_MAXADDR, /* lowaddr */ 3497 BUS_SPACE_MAXADDR, /* highaddr */ 3498 NULL, NULL, /* filter, filterarg */ 3499 HN_TX_DATA_MAXSIZE, /* maxsize */ 3500 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 3501 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 3502 0, /* flags */ 3503 NULL, /* lockfunc */ 3504 NULL, /* lockfuncarg */ 3505 &txr->hn_tx_data_dtag); 3506 if (error) { 3507 device_printf(dev, "failed to create data dmatag\n"); 3508 return error; 3509 } 3510 3511 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 3512 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 3513 3514 txd->txr = txr; 3515 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3516 STAILQ_INIT(&txd->agg_list); 3517 3518 /* 3519 * Allocate and load RNDIS packet message. 3520 */ 3521 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 3522 (void **)&txd->rndis_pkt, 3523 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 3524 &txd->rndis_pkt_dmap); 3525 if (error) { 3526 device_printf(dev, 3527 "failed to allocate rndis_packet_msg, %d\n", i); 3528 return error; 3529 } 3530 3531 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 3532 txd->rndis_pkt_dmap, 3533 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 3534 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 3535 BUS_DMA_NOWAIT); 3536 if (error) { 3537 device_printf(dev, 3538 "failed to load rndis_packet_msg, %d\n", i); 3539 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3540 txd->rndis_pkt, txd->rndis_pkt_dmap); 3541 return error; 3542 } 3543 3544 /* DMA map for TX data. 
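		 * It is loaded per-packet in hn_encap() and unloaded again
		 * by hn_txdesc_put().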
*/ 3545 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 3546 &txd->data_dmap); 3547 if (error) { 3548 device_printf(dev, 3549 "failed to allocate tx data dmamap\n"); 3550 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 3551 txd->rndis_pkt_dmap); 3552 bus_dmamem_free(txr->hn_tx_rndis_dtag, 3553 txd->rndis_pkt, txd->rndis_pkt_dmap); 3554 return error; 3555 } 3556 3557 /* All set, put it to list */ 3558 txd->flags |= HN_TXD_FLAG_ONLIST; 3559 #ifndef HN_USE_TXDESC_BUFRING 3560 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 3561 #else 3562 buf_ring_enqueue(txr->hn_txdesc_br, txd); 3563 #endif 3564 } 3565 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 3566 3567 if (sc->hn_tx_sysctl_tree != NULL) { 3568 struct sysctl_oid_list *child; 3569 struct sysctl_ctx_list *ctx; 3570 char name[16]; 3571 3572 /* 3573 * Create per TX ring sysctl tree: 3574 * dev.hn.UNIT.tx.RINGID 3575 */ 3576 ctx = device_get_sysctl_ctx(dev); 3577 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 3578 3579 snprintf(name, sizeof(name), "%d", id); 3580 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 3581 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3582 3583 if (txr->hn_tx_sysctl_tree != NULL) { 3584 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 3585 3586 #ifdef HN_DEBUG 3587 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 3588 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 3589 "# of available TX descs"); 3590 #endif 3591 #ifdef HN_IFSTART_SUPPORT 3592 if (!hn_use_if_start) 3593 #endif 3594 { 3595 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 3596 CTLFLAG_RD, &txr->hn_oactive, 0, 3597 "over active"); 3598 } 3599 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 3600 CTLFLAG_RW, &txr->hn_pkts, 3601 "# of packets transmitted"); 3602 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 3603 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 3604 } 3605 } 3606 3607 return 0; 3608 } 3609 3610 static void 3611 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 3612 { 3613 struct hn_tx_ring *txr = txd->txr; 3614 3615 KASSERT(txd->m == NULL, ("still has mbuf installed")); 3616 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 3617 3618 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 3619 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 3620 txd->rndis_pkt_dmap); 3621 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 3622 } 3623 3624 static void 3625 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 3626 { 3627 3628 KASSERT(txd->refs == 0 || txd->refs == 1, 3629 ("invalid txd refs %d", txd->refs)); 3630 3631 /* Aggregated txds will be freed by their aggregating txd. */ 3632 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 3633 int freed; 3634 3635 freed = hn_txdesc_put(txr, txd); 3636 KASSERT(freed, ("can't free txdesc")); 3637 } 3638 } 3639 3640 static void 3641 hn_tx_ring_destroy(struct hn_tx_ring *txr) 3642 { 3643 int i; 3644 3645 if (txr->hn_txdesc == NULL) 3646 return; 3647 3648 /* 3649 * NOTE: 3650 * Because the freeing of aggregated txds will be deferred 3651 * to the aggregating txd, two passes are used here: 3652 * - The first pass GCes any pending txds. This GC is necessary, 3653 * since if the channels are revoked, hypervisor will not 3654 * deliver send-done for all pending txds. 3655 * - The second pass frees the busdma stuffs, i.e. after all txds 3656 * were freed. 
3657 */ 3658 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3659 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 3660 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 3661 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 3662 3663 if (txr->hn_tx_data_dtag != NULL) 3664 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 3665 if (txr->hn_tx_rndis_dtag != NULL) 3666 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 3667 3668 #ifdef HN_USE_TXDESC_BUFRING 3669 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 3670 #endif 3671 3672 free(txr->hn_txdesc, M_DEVBUF); 3673 txr->hn_txdesc = NULL; 3674 3675 if (txr->hn_mbuf_br != NULL) 3676 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 3677 3678 #ifndef HN_USE_TXDESC_BUFRING 3679 mtx_destroy(&txr->hn_txlist_spin); 3680 #endif 3681 mtx_destroy(&txr->hn_tx_lock); 3682 } 3683 3684 static int 3685 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 3686 { 3687 struct sysctl_oid_list *child; 3688 struct sysctl_ctx_list *ctx; 3689 int i; 3690 3691 /* 3692 * Create TXBUF for chimney sending. 3693 * 3694 * NOTE: It is shared by all channels. 3695 */ 3696 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 3697 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 3698 BUS_DMA_WAITOK | BUS_DMA_ZERO); 3699 if (sc->hn_chim == NULL) { 3700 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 3701 return (ENOMEM); 3702 } 3703 3704 sc->hn_tx_ring_cnt = ring_cnt; 3705 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 3706 3707 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 3708 M_DEVBUF, M_WAITOK | M_ZERO); 3709 3710 ctx = device_get_sysctl_ctx(sc->hn_dev); 3711 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 3712 3713 /* Create dev.hn.UNIT.tx sysctl tree */ 3714 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 3715 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 3716 3717 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3718 int error; 3719 3720 error = hn_tx_ring_create(sc, i); 3721 if (error) 3722 return error; 3723 } 3724 3725 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 3726 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3727 __offsetof(struct hn_tx_ring, hn_no_txdescs), 3728 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 3729 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 3730 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3731 __offsetof(struct hn_tx_ring, hn_send_failed), 3732 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 3733 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 3734 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3735 __offsetof(struct hn_tx_ring, hn_txdma_failed), 3736 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 3737 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 3738 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3739 __offsetof(struct hn_tx_ring, hn_flush_failed), 3740 hn_tx_stat_ulong_sysctl, "LU", 3741 "# of packet transmission aggregation flush failure"); 3742 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 3743 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3744 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 3745 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 3746 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 3747 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3748 __offsetof(struct hn_tx_ring, hn_tx_chimney), 3749 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 3750 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 3751 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3752 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 3753 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 3754 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 3755 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 3756 "# of total TX descs"); 3757 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 3758 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 3759 "Chimney send packet size upper boundary"); 3760 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 3761 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 3762 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 3763 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 3764 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3765 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 3766 hn_tx_conf_int_sysctl, "I", 3767 "Size of the packet for direct transmission"); 3768 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 3769 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 3770 __offsetof(struct hn_tx_ring, hn_sched_tx), 3771 hn_tx_conf_int_sysctl, "I", 3772 "Always schedule transmission " 3773 "instead of doing direct transmission"); 3774 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 3775 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 3776 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 3777 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 3778 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 3779 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 3780 "Applied packet transmission aggregation size"); 3781 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 3782 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3783 hn_txagg_pktmax_sysctl, "I", 3784 "Applied packet transmission aggregation packets"); 3785 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 3786 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 3787 hn_txagg_align_sysctl, "I", 3788 "Applied packet transmission aggregation alignment"); 3789 3790 return 0; 3791 } 3792 3793 static void 3794 hn_set_chim_size(struct hn_softc *sc, int chim_size) 3795 { 3796 int i; 3797 3798 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3799 sc->hn_tx_ring[i].hn_chim_size = chim_size; 3800 } 3801 3802 static void 3803 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 3804 { 3805 struct ifnet *ifp = sc->hn_ifp; 3806 int tso_minlen; 3807 3808 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 3809 return; 3810 3811 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 3812 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 3813 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 3814 3815 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 3816 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 3817 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 3818 3819 if (tso_maxlen < tso_minlen) 3820 tso_maxlen = tso_minlen; 3821 else if (tso_maxlen > IP_MAXPACKET) 3822 tso_maxlen = IP_MAXPACKET; 3823 if (tso_maxlen > sc->hn_ndis_tso_szmax) 3824 tso_maxlen = sc->hn_ndis_tso_szmax; 3825 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 3826 if (bootverbose) 3827 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 3828 } 3829 3830 static void 3831 hn_fixup_tx_data(struct hn_softc *sc) 3832 { 3833 uint64_t csum_assist; 3834 int i; 3835 3836 hn_set_chim_size(sc, sc->hn_chim_szmax); 3837 if (hn_tx_chimney_size > 0 && 3838 hn_tx_chimney_size < sc->hn_chim_szmax) 3839 hn_set_chim_size(sc, hn_tx_chimney_size); 3840 3841 csum_assist = 0; 3842 if (sc->hn_caps & HN_CAP_IPCS) 3843 csum_assist |= CSUM_IP; 3844 if (sc->hn_caps & HN_CAP_TCP4CS) 3845 csum_assist |= CSUM_IP_TCP; 3846 if (sc->hn_caps & HN_CAP_UDP4CS) 3847 
csum_assist |= CSUM_IP_UDP; 3848 if (sc->hn_caps & HN_CAP_TCP6CS) 3849 csum_assist |= CSUM_IP6_TCP; 3850 if (sc->hn_caps & HN_CAP_UDP6CS) 3851 csum_assist |= CSUM_IP6_UDP; 3852 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3853 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 3854 3855 if (sc->hn_caps & HN_CAP_HASHVAL) { 3856 /* 3857 * Support HASHVAL pktinfo on TX path. 3858 */ 3859 if (bootverbose) 3860 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 3861 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3862 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 3863 } 3864 } 3865 3866 static void 3867 hn_destroy_tx_data(struct hn_softc *sc) 3868 { 3869 int i; 3870 3871 if (sc->hn_chim != NULL) { 3872 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 3873 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 3874 } else { 3875 device_printf(sc->hn_dev, 3876 "chimney sending buffer is referenced"); 3877 } 3878 sc->hn_chim = NULL; 3879 } 3880 3881 if (sc->hn_tx_ring_cnt == 0) 3882 return; 3883 3884 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 3885 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 3886 3887 free(sc->hn_tx_ring, M_DEVBUF); 3888 sc->hn_tx_ring = NULL; 3889 3890 sc->hn_tx_ring_cnt = 0; 3891 sc->hn_tx_ring_inuse = 0; 3892 } 3893 3894 #ifdef HN_IFSTART_SUPPORT 3895 3896 static void 3897 hn_start_taskfunc(void *xtxr, int pending __unused) 3898 { 3899 struct hn_tx_ring *txr = xtxr; 3900 3901 mtx_lock(&txr->hn_tx_lock); 3902 hn_start_locked(txr, 0); 3903 mtx_unlock(&txr->hn_tx_lock); 3904 } 3905 3906 static int 3907 hn_start_locked(struct hn_tx_ring *txr, int len) 3908 { 3909 struct hn_softc *sc = txr->hn_sc; 3910 struct ifnet *ifp = sc->hn_ifp; 3911 int sched = 0; 3912 3913 KASSERT(hn_use_if_start, 3914 ("hn_start_locked is called, when if_start is disabled")); 3915 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 3916 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 3917 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3918 3919 if (__predict_false(txr->hn_suspended)) 3920 return (0); 3921 3922 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 3923 IFF_DRV_RUNNING) 3924 return (0); 3925 3926 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 3927 struct hn_txdesc *txd; 3928 struct mbuf *m_head; 3929 int error; 3930 3931 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 3932 if (m_head == NULL) 3933 break; 3934 3935 if (len > 0 && m_head->m_pkthdr.len > len) { 3936 /* 3937 * This sending could be time consuming; let callers 3938 * dispatch this packet sending (and sending of any 3939 * following up packets) to tx taskqueue. 
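			 * (A non-zero len is only passed on the direct
			 * transmission paths; the taskqueue entries always
			 * pass 0.)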
3940 */ 3941 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3942 sched = 1; 3943 break; 3944 } 3945 3946 #if defined(INET6) || defined(INET) 3947 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3948 m_head = hn_tso_fixup(m_head); 3949 if (__predict_false(m_head == NULL)) { 3950 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3951 continue; 3952 } 3953 } 3954 #endif 3955 3956 txd = hn_txdesc_get(txr); 3957 if (txd == NULL) { 3958 txr->hn_no_txdescs++; 3959 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3960 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3961 break; 3962 } 3963 3964 error = hn_encap(ifp, txr, txd, &m_head); 3965 if (error) { 3966 /* Both txd and m_head are freed */ 3967 KASSERT(txr->hn_agg_txd == NULL, 3968 ("encap failed w/ pending aggregating txdesc")); 3969 continue; 3970 } 3971 3972 if (txr->hn_agg_pktleft == 0) { 3973 if (txr->hn_agg_txd != NULL) { 3974 KASSERT(m_head == NULL, 3975 ("pending mbuf for aggregating txdesc")); 3976 error = hn_flush_txagg(ifp, txr); 3977 if (__predict_false(error)) { 3978 atomic_set_int(&ifp->if_drv_flags, 3979 IFF_DRV_OACTIVE); 3980 break; 3981 } 3982 } else { 3983 KASSERT(m_head != NULL, ("mbuf was freed")); 3984 error = hn_txpkt(ifp, txr, txd); 3985 if (__predict_false(error)) { 3986 /* txd is freed, but m_head is not */ 3987 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 3988 atomic_set_int(&ifp->if_drv_flags, 3989 IFF_DRV_OACTIVE); 3990 break; 3991 } 3992 } 3993 } 3994 #ifdef INVARIANTS 3995 else { 3996 KASSERT(txr->hn_agg_txd != NULL, 3997 ("no aggregating txdesc")); 3998 KASSERT(m_head == NULL, 3999 ("pending mbuf for aggregating txdesc")); 4000 } 4001 #endif 4002 } 4003 4004 /* Flush pending aggerated transmission. */ 4005 if (txr->hn_agg_txd != NULL) 4006 hn_flush_txagg(ifp, txr); 4007 return (sched); 4008 } 4009 4010 static void 4011 hn_start(struct ifnet *ifp) 4012 { 4013 struct hn_softc *sc = ifp->if_softc; 4014 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 4015 4016 if (txr->hn_sched_tx) 4017 goto do_sched; 4018 4019 if (mtx_trylock(&txr->hn_tx_lock)) { 4020 int sched; 4021 4022 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4023 mtx_unlock(&txr->hn_tx_lock); 4024 if (!sched) 4025 return; 4026 } 4027 do_sched: 4028 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4029 } 4030 4031 static void 4032 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 4033 { 4034 struct hn_tx_ring *txr = xtxr; 4035 4036 mtx_lock(&txr->hn_tx_lock); 4037 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 4038 hn_start_locked(txr, 0); 4039 mtx_unlock(&txr->hn_tx_lock); 4040 } 4041 4042 static void 4043 hn_start_txeof(struct hn_tx_ring *txr) 4044 { 4045 struct hn_softc *sc = txr->hn_sc; 4046 struct ifnet *ifp = sc->hn_ifp; 4047 4048 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 4049 4050 if (txr->hn_sched_tx) 4051 goto do_sched; 4052 4053 if (mtx_trylock(&txr->hn_tx_lock)) { 4054 int sched; 4055 4056 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4057 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 4058 mtx_unlock(&txr->hn_tx_lock); 4059 if (sched) { 4060 taskqueue_enqueue(txr->hn_tx_taskq, 4061 &txr->hn_tx_task); 4062 } 4063 } else { 4064 do_sched: 4065 /* 4066 * Release the OACTIVE earlier, with the hope, that 4067 * others could catch up. The task will clear the 4068 * flag again with the hn_tx_lock to avoid possible 4069 * races. 
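		 * hn_start_txeof_taskfunc() clears IFF_DRV_OACTIVE once more
		 * while holding hn_tx_lock and then calls hn_start_locked(),
		 * so any packets queued in the meantime are still picked up.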
4070 */ 4071 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4072 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4073 } 4074 } 4075 4076 #endif /* HN_IFSTART_SUPPORT */ 4077 4078 static int 4079 hn_xmit(struct hn_tx_ring *txr, int len) 4080 { 4081 struct hn_softc *sc = txr->hn_sc; 4082 struct ifnet *ifp = sc->hn_ifp; 4083 struct mbuf *m_head; 4084 int sched = 0; 4085 4086 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 4087 #ifdef HN_IFSTART_SUPPORT 4088 KASSERT(hn_use_if_start == 0, 4089 ("hn_xmit is called, when if_start is enabled")); 4090 #endif 4091 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 4092 4093 if (__predict_false(txr->hn_suspended)) 4094 return (0); 4095 4096 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 4097 return (0); 4098 4099 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 4100 struct hn_txdesc *txd; 4101 int error; 4102 4103 if (len > 0 && m_head->m_pkthdr.len > len) { 4104 /* 4105 * This sending could be time consuming; let callers 4106 * dispatch this packet sending (and sending of any 4107 * following up packets) to tx taskqueue. 4108 */ 4109 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4110 sched = 1; 4111 break; 4112 } 4113 4114 txd = hn_txdesc_get(txr); 4115 if (txd == NULL) { 4116 txr->hn_no_txdescs++; 4117 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 4118 txr->hn_oactive = 1; 4119 break; 4120 } 4121 4122 error = hn_encap(ifp, txr, txd, &m_head); 4123 if (error) { 4124 /* Both txd and m_head are freed; discard */ 4125 KASSERT(txr->hn_agg_txd == NULL, 4126 ("encap failed w/ pending aggregating txdesc")); 4127 drbr_advance(ifp, txr->hn_mbuf_br); 4128 continue; 4129 } 4130 4131 if (txr->hn_agg_pktleft == 0) { 4132 if (txr->hn_agg_txd != NULL) { 4133 KASSERT(m_head == NULL, 4134 ("pending mbuf for aggregating txdesc")); 4135 error = hn_flush_txagg(ifp, txr); 4136 if (__predict_false(error)) { 4137 txr->hn_oactive = 1; 4138 break; 4139 } 4140 } else { 4141 KASSERT(m_head != NULL, ("mbuf was freed")); 4142 error = hn_txpkt(ifp, txr, txd); 4143 if (__predict_false(error)) { 4144 /* txd is freed, but m_head is not */ 4145 drbr_putback(ifp, txr->hn_mbuf_br, 4146 m_head); 4147 txr->hn_oactive = 1; 4148 break; 4149 } 4150 } 4151 } 4152 #ifdef INVARIANTS 4153 else { 4154 KASSERT(txr->hn_agg_txd != NULL, 4155 ("no aggregating txdesc")); 4156 KASSERT(m_head == NULL, 4157 ("pending mbuf for aggregating txdesc")); 4158 } 4159 #endif 4160 4161 /* Sent */ 4162 drbr_advance(ifp, txr->hn_mbuf_br); 4163 } 4164 4165 /* Flush pending aggerated transmission. */ 4166 if (txr->hn_agg_txd != NULL) 4167 hn_flush_txagg(ifp, txr); 4168 return (sched); 4169 } 4170 4171 static int 4172 hn_transmit(struct ifnet *ifp, struct mbuf *m) 4173 { 4174 struct hn_softc *sc = ifp->if_softc; 4175 struct hn_tx_ring *txr; 4176 int error, idx = 0; 4177 4178 #if defined(INET6) || defined(INET) 4179 /* 4180 * Perform TSO packet header fixup now, since the TSO 4181 * packet header should be cache-hot. 
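	 * hn_tso_fixup() may fail and free the mbuf; in that case the
	 * output error counter is bumped and EIO is returned to the
	 * caller right away.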
4182 */ 4183 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 4184 m = hn_tso_fixup(m); 4185 if (__predict_false(m == NULL)) { 4186 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 4187 return EIO; 4188 } 4189 } 4190 #endif 4191 4192 /* 4193 * Select the TX ring based on flowid 4194 */ 4195 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 4196 #ifdef RSS 4197 uint32_t bid; 4198 4199 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 4200 &bid) == 0) 4201 idx = bid % sc->hn_tx_ring_inuse; 4202 else 4203 #endif 4204 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 4205 } 4206 txr = &sc->hn_tx_ring[idx]; 4207 4208 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 4209 if (error) { 4210 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 4211 return error; 4212 } 4213 4214 if (txr->hn_oactive) 4215 return 0; 4216 4217 if (txr->hn_sched_tx) 4218 goto do_sched; 4219 4220 if (mtx_trylock(&txr->hn_tx_lock)) { 4221 int sched; 4222 4223 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4224 mtx_unlock(&txr->hn_tx_lock); 4225 if (!sched) 4226 return 0; 4227 } 4228 do_sched: 4229 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 4230 return 0; 4231 } 4232 4233 static void 4234 hn_tx_ring_qflush(struct hn_tx_ring *txr) 4235 { 4236 struct mbuf *m; 4237 4238 mtx_lock(&txr->hn_tx_lock); 4239 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 4240 m_freem(m); 4241 mtx_unlock(&txr->hn_tx_lock); 4242 } 4243 4244 static void 4245 hn_xmit_qflush(struct ifnet *ifp) 4246 { 4247 struct hn_softc *sc = ifp->if_softc; 4248 int i; 4249 4250 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4251 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4252 if_qflush(ifp); 4253 } 4254 4255 static void 4256 hn_xmit_txeof(struct hn_tx_ring *txr) 4257 { 4258 4259 if (txr->hn_sched_tx) 4260 goto do_sched; 4261 4262 if (mtx_trylock(&txr->hn_tx_lock)) { 4263 int sched; 4264 4265 txr->hn_oactive = 0; 4266 sched = hn_xmit(txr, txr->hn_direct_tx_size); 4267 mtx_unlock(&txr->hn_tx_lock); 4268 if (sched) { 4269 taskqueue_enqueue(txr->hn_tx_taskq, 4270 &txr->hn_tx_task); 4271 } 4272 } else { 4273 do_sched: 4274 /* 4275 * Release the oactive earlier, with the hope, that 4276 * others could catch up. The task will clear the 4277 * oactive again with the hn_tx_lock to avoid possible 4278 * races. 4279 */ 4280 txr->hn_oactive = 0; 4281 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4282 } 4283 } 4284 4285 static void 4286 hn_xmit_taskfunc(void *xtxr, int pending __unused) 4287 { 4288 struct hn_tx_ring *txr = xtxr; 4289 4290 mtx_lock(&txr->hn_tx_lock); 4291 hn_xmit(txr, 0); 4292 mtx_unlock(&txr->hn_tx_lock); 4293 } 4294 4295 static void 4296 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 4297 { 4298 struct hn_tx_ring *txr = xtxr; 4299 4300 mtx_lock(&txr->hn_tx_lock); 4301 txr->hn_oactive = 0; 4302 hn_xmit(txr, 0); 4303 mtx_unlock(&txr->hn_tx_lock); 4304 } 4305 4306 static int 4307 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 4308 { 4309 struct vmbus_chan_br cbr; 4310 struct hn_rx_ring *rxr; 4311 struct hn_tx_ring *txr = NULL; 4312 int idx, error; 4313 4314 idx = vmbus_chan_subidx(chan); 4315 4316 /* 4317 * Link this channel to RX/TX ring. 
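	 * The channel sub-index directly selects the RX ring; a TX ring
	 * is linked only when the sub-index is below hn_tx_ring_inuse,
	 * since fewer TX rings than channels may be in use.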
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}

static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from the RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later.
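			 * The last error is remembered and reported once all
			 * sub-channels have been tried; hn_synth_attach()
			 * tears every attached channel down on failure.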
*/ 4431 } 4432 } 4433 vmbus_subchan_rel(subchans, subchan_cnt); 4434 4435 if (error) { 4436 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 4437 } else { 4438 if (bootverbose) { 4439 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 4440 subchan_cnt); 4441 } 4442 } 4443 return (error); 4444 } 4445 4446 static void 4447 hn_detach_allchans(struct hn_softc *sc) 4448 { 4449 struct vmbus_channel **subchans; 4450 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 4451 int i; 4452 4453 if (subchan_cnt == 0) 4454 goto back; 4455 4456 /* Detach the sub-channels. */ 4457 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 4458 for (i = 0; i < subchan_cnt; ++i) 4459 hn_chan_detach(sc, subchans[i]); 4460 vmbus_subchan_rel(subchans, subchan_cnt); 4461 4462 back: 4463 /* 4464 * Detach the primary channel, _after_ all sub-channels 4465 * are detached. 4466 */ 4467 hn_chan_detach(sc, sc->hn_prichan); 4468 4469 /* Wait for sub-channels to be destroyed, if any. */ 4470 vmbus_subchan_drain(sc->hn_prichan); 4471 4472 #ifdef INVARIANTS 4473 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4474 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 4475 HN_RX_FLAG_ATTACHED) == 0, 4476 ("%dth RX ring is still attached", i)); 4477 } 4478 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4479 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 4480 HN_TX_FLAG_ATTACHED) == 0, 4481 ("%dth TX ring is still attached", i)); 4482 } 4483 #endif 4484 } 4485 4486 static int 4487 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 4488 { 4489 struct vmbus_channel **subchans; 4490 int nchan, rxr_cnt, error; 4491 4492 nchan = *nsubch + 1; 4493 if (nchan == 1) { 4494 /* 4495 * Multiple RX/TX rings are not requested. 4496 */ 4497 *nsubch = 0; 4498 return (0); 4499 } 4500 4501 /* 4502 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 4503 * table entries. 4504 */ 4505 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 4506 if (error) { 4507 /* No RSS; this is benign. */ 4508 *nsubch = 0; 4509 return (0); 4510 } 4511 if (bootverbose) { 4512 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 4513 rxr_cnt, nchan); 4514 } 4515 4516 if (nchan > rxr_cnt) 4517 nchan = rxr_cnt; 4518 if (nchan == 1) { 4519 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 4520 *nsubch = 0; 4521 return (0); 4522 } 4523 4524 /* 4525 * Allocate sub-channels from NVS. 4526 */ 4527 *nsubch = nchan - 1; 4528 error = hn_nvs_alloc_subchans(sc, nsubch); 4529 if (error || *nsubch == 0) { 4530 /* Failed to allocate sub-channels. */ 4531 *nsubch = 0; 4532 return (0); 4533 } 4534 4535 /* 4536 * Wait for all sub-channels to become ready before moving on. 
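	 * The vmbus_subchan_get()/vmbus_subchan_rel() pair below is used
	 * only for this wait; the sub-channel references themselves are
	 * not needed here.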
4537 */ 4538 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 4539 vmbus_subchan_rel(subchans, *nsubch); 4540 return (0); 4541 } 4542 4543 static bool 4544 hn_synth_attachable(const struct hn_softc *sc) 4545 { 4546 int i; 4547 4548 if (sc->hn_flags & HN_FLAG_ERRORS) 4549 return (false); 4550 4551 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4552 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4553 4554 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 4555 return (false); 4556 } 4557 return (true); 4558 } 4559 4560 static int 4561 hn_synth_attach(struct hn_softc *sc, int mtu) 4562 { 4563 #define ATTACHED_NVS 0x0002 4564 #define ATTACHED_RNDIS 0x0004 4565 4566 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 4567 int error, nsubch, nchan, i; 4568 uint32_t old_caps, attached = 0; 4569 4570 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 4571 ("synthetic parts were attached")); 4572 4573 if (!hn_synth_attachable(sc)) 4574 return (ENXIO); 4575 4576 /* Save capabilities for later verification. */ 4577 old_caps = sc->hn_caps; 4578 sc->hn_caps = 0; 4579 4580 /* Clear RSS stuffs. */ 4581 sc->hn_rss_ind_size = 0; 4582 sc->hn_rss_hash = 0; 4583 4584 /* 4585 * Attach the primary channel _before_ attaching NVS and RNDIS. 4586 */ 4587 error = hn_chan_attach(sc, sc->hn_prichan); 4588 if (error) 4589 goto failed; 4590 4591 /* 4592 * Attach NVS. 4593 */ 4594 error = hn_nvs_attach(sc, mtu); 4595 if (error) 4596 goto failed; 4597 attached |= ATTACHED_NVS; 4598 4599 /* 4600 * Attach RNDIS _after_ NVS is attached. 4601 */ 4602 error = hn_rndis_attach(sc, mtu); 4603 if (error) 4604 goto failed; 4605 attached |= ATTACHED_RNDIS; 4606 4607 /* 4608 * Make sure capabilities are not changed. 4609 */ 4610 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 4611 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 4612 old_caps, sc->hn_caps); 4613 error = ENXIO; 4614 goto failed; 4615 } 4616 4617 /* 4618 * Allocate sub-channels for multi-TX/RX rings. 4619 * 4620 * NOTE: 4621 * The # of RX rings that can be used is equivalent to the # of 4622 * channels to be requested. 4623 */ 4624 nsubch = sc->hn_rx_ring_cnt - 1; 4625 error = hn_synth_alloc_subchans(sc, &nsubch); 4626 if (error) 4627 goto failed; 4628 /* NOTE: _Full_ synthetic parts detach is required now. */ 4629 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 4630 4631 /* 4632 * Set the # of TX/RX rings that could be used according to 4633 * the # of channels that NVS offered. 4634 */ 4635 nchan = nsubch + 1; 4636 hn_set_ring_inuse(sc, nchan); 4637 if (nchan == 1) { 4638 /* Only the primary channel can be used; done */ 4639 goto back; 4640 } 4641 4642 /* 4643 * Attach the sub-channels. 4644 * 4645 * NOTE: hn_set_ring_inuse() _must_ have been called. 4646 */ 4647 error = hn_attach_subchans(sc); 4648 if (error) 4649 goto failed; 4650 4651 /* 4652 * Configure RSS key and indirect table _after_ all sub-channels 4653 * are attached. 4654 */ 4655 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 4656 /* 4657 * RSS key is not set yet; set it to the default RSS key. 4658 */ 4659 if (bootverbose) 4660 if_printf(sc->hn_ifp, "setup default RSS key\n"); 4661 #ifdef RSS 4662 rss_getkey(rss->rss_key); 4663 #else 4664 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 4665 #endif 4666 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4667 } 4668 4669 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 4670 /* 4671 * RSS indirect table is not set yet; set it up in round- 4672 * robin fashion. 
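		 * Each of the NDIS_HASH_INDCNT entries is mapped to a
		 * channel by taking the entry index (or the RSS bucket,
		 * when the kernel is built with RSS) modulo nchan.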
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * # of usable channels may be changed, so we have to
		 * make sure that all entries in RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fixup transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS)
			hn_rndis_detach(sc);
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}

/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(), before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

#ifdef RSS
	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
		    rss_getnumbuckets());
	}
#endif

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
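		 * hn_tx_ring_pending() is polled (with pause()) until all
		 * outstanding send-done acknowledgements for this ring have
		 * arrived, unless the primary channel has been revoked.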
4816 * 4817 * NOTE: 4818 * We will _not_ receive all pending send-done, if the 4819 * primary channel is revoked. 4820 */ 4821 while (hn_tx_ring_pending(txr) && 4822 !vmbus_chan_is_revoked(sc->hn_prichan)) 4823 pause("hnwtx", 1 /* 1 tick */); 4824 } 4825 4826 /* 4827 * Disable RX by clearing RX filter. 4828 */ 4829 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 4830 4831 /* 4832 * Give RNDIS enough time to flush all pending data packets. 4833 */ 4834 pause("waitrx", (200 * hz) / 1000); 4835 4836 /* 4837 * Drain RX/TX bufrings and interrupts. 4838 */ 4839 nsubch = sc->hn_rx_ring_inuse - 1; 4840 if (nsubch > 0) 4841 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4842 4843 if (subch != NULL) { 4844 for (i = 0; i < nsubch; ++i) 4845 hn_chan_drain(sc, subch[i]); 4846 } 4847 hn_chan_drain(sc, sc->hn_prichan); 4848 4849 if (subch != NULL) 4850 vmbus_subchan_rel(subch, nsubch); 4851 4852 /* 4853 * Drain any pending TX tasks. 4854 * 4855 * NOTE: 4856 * The above hn_chan_drain() can dispatch TX tasks, so the TX 4857 * tasks will have to be drained _after_ the above hn_chan_drain() 4858 * calls. 4859 */ 4860 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4861 txr = &sc->hn_tx_ring[i]; 4862 4863 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 4864 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 4865 } 4866 } 4867 4868 static void 4869 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 4870 { 4871 4872 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 4873 } 4874 4875 static void 4876 hn_suspend_mgmt(struct hn_softc *sc) 4877 { 4878 struct task task; 4879 4880 HN_LOCK_ASSERT(sc); 4881 4882 /* 4883 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 4884 * through hn_mgmt_taskq. 4885 */ 4886 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 4887 vmbus_chan_run_task(sc->hn_prichan, &task); 4888 4889 /* 4890 * Make sure that all pending management tasks are completed. 4891 */ 4892 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 4893 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 4894 taskqueue_drain_all(sc->hn_mgmt_taskq0); 4895 } 4896 4897 static void 4898 hn_suspend(struct hn_softc *sc) 4899 { 4900 4901 /* Disable polling. */ 4902 hn_polling(sc, 0); 4903 4904 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4905 hn_suspend_data(sc); 4906 hn_suspend_mgmt(sc); 4907 } 4908 4909 static void 4910 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 4911 { 4912 int i; 4913 4914 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 4915 ("invalid TX ring count %d", tx_ring_cnt)); 4916 4917 for (i = 0; i < tx_ring_cnt; ++i) { 4918 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4919 4920 mtx_lock(&txr->hn_tx_lock); 4921 txr->hn_suspended = 0; 4922 mtx_unlock(&txr->hn_tx_lock); 4923 } 4924 } 4925 4926 static void 4927 hn_resume_data(struct hn_softc *sc) 4928 { 4929 int i; 4930 4931 HN_LOCK_ASSERT(sc); 4932 4933 /* 4934 * Re-enable RX. 4935 */ 4936 hn_rxfilter_config(sc); 4937 4938 /* 4939 * Make sure to clear suspend status on "all" TX rings, 4940 * since hn_tx_ring_inuse can be changed after 4941 * hn_suspend_data(). 4942 */ 4943 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 4944 4945 #ifdef HN_IFSTART_SUPPORT 4946 if (!hn_use_if_start) 4947 #endif 4948 { 4949 /* 4950 * Flush unused drbrs, since hn_tx_ring_inuse may be 4951 * reduced. 4952 */ 4953 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 4954 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 4955 } 4956 4957 /* 4958 * Kick start TX. 
4959 */ 4960 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 4961 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 4962 4963 /* 4964 * Use txeof task, so that any pending oactive can be 4965 * cleared properly. 4966 */ 4967 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 4968 } 4969 } 4970 4971 static void 4972 hn_resume_mgmt(struct hn_softc *sc) 4973 { 4974 4975 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 4976 4977 /* 4978 * Kick off network change detection, if it was pending. 4979 * If no network change was pending, start link status 4980 * checks, which is more lightweight than network change 4981 * detection. 4982 */ 4983 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 4984 hn_change_network(sc); 4985 else 4986 hn_update_link_status(sc); 4987 } 4988 4989 static void 4990 hn_resume(struct hn_softc *sc) 4991 { 4992 4993 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 4994 hn_resume_data(sc); 4995 hn_resume_mgmt(sc); 4996 4997 /* 4998 * Re-enable polling if this interface is running and 4999 * the polling is requested. 5000 */ 5001 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 5002 hn_polling(sc, sc->hn_pollhz); 5003 } 5004 5005 static void 5006 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 5007 { 5008 const struct rndis_status_msg *msg; 5009 int ofs; 5010 5011 if (dlen < sizeof(*msg)) { 5012 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 5013 return; 5014 } 5015 msg = data; 5016 5017 switch (msg->rm_status) { 5018 case RNDIS_STATUS_MEDIA_CONNECT: 5019 case RNDIS_STATUS_MEDIA_DISCONNECT: 5020 hn_update_link_status(sc); 5021 break; 5022 5023 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 5024 /* Not really useful; ignore. */ 5025 break; 5026 5027 case RNDIS_STATUS_NETWORK_CHANGE: 5028 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 5029 if (dlen < ofs + msg->rm_stbuflen || 5030 msg->rm_stbuflen < sizeof(uint32_t)) { 5031 if_printf(sc->hn_ifp, "network changed\n"); 5032 } else { 5033 uint32_t change; 5034 5035 memcpy(&change, ((const uint8_t *)msg) + ofs, 5036 sizeof(change)); 5037 if_printf(sc->hn_ifp, "network changed, change %u\n", 5038 change); 5039 } 5040 hn_change_network(sc); 5041 break; 5042 5043 default: 5044 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 5045 msg->rm_status); 5046 break; 5047 } 5048 } 5049 5050 static int 5051 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 5052 { 5053 const struct rndis_pktinfo *pi = info_data; 5054 uint32_t mask = 0; 5055 5056 while (info_dlen != 0) { 5057 const void *data; 5058 uint32_t dlen; 5059 5060 if (__predict_false(info_dlen < sizeof(*pi))) 5061 return (EINVAL); 5062 if (__predict_false(info_dlen < pi->rm_size)) 5063 return (EINVAL); 5064 info_dlen -= pi->rm_size; 5065 5066 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 5067 return (EINVAL); 5068 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 5069 return (EINVAL); 5070 dlen = pi->rm_size - pi->rm_pktinfooffset; 5071 data = pi->rm_data; 5072 5073 switch (pi->rm_type) { 5074 case NDIS_PKTINFO_TYPE_VLAN: 5075 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 5076 return (EINVAL); 5077 info->vlan_info = *((const uint32_t *)data); 5078 mask |= HN_RXINFO_VLAN; 5079 break; 5080 5081 case NDIS_PKTINFO_TYPE_CSUM: 5082 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 5083 return (EINVAL); 5084 info->csum_info = *((const uint32_t *)data); 5085 mask |= HN_RXINFO_CSUM; 5086 break; 5087 5088 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 5089 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 5090 
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
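	 * OOB data is not used on the RX path; when it is present it is
	 * only validated (overflow and overlap with the data and pktinfo
	 * regions) and then ignored.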
5207 */ 5208 if (__predict_false(pkt->rm_oobdatalen != 0)) { 5209 int oob_off, oob_len; 5210 5211 if_printf(rxr->hn_ifp, "got oobdata\n"); 5212 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 5213 oob_len = pkt->rm_oobdatalen; 5214 5215 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 5216 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5217 "oob overflow, msglen %u, oob abs %d len %d\n", 5218 pkt->rm_len, oob_off, oob_len); 5219 return; 5220 } 5221 5222 /* 5223 * Check against data. 5224 */ 5225 if (hn_rndis_check_overlap(oob_off, oob_len, 5226 data_off, data_len)) { 5227 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5228 "oob overlaps data, oob abs %d len %d, " 5229 "data abs %d len %d\n", 5230 oob_off, oob_len, data_off, data_len); 5231 return; 5232 } 5233 5234 /* 5235 * Check against pktinfo. 5236 */ 5237 if (pktinfo_len != 0 && 5238 hn_rndis_check_overlap(oob_off, oob_len, 5239 pktinfo_off, pktinfo_len)) { 5240 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5241 "oob overlaps pktinfo, oob abs %d len %d, " 5242 "pktinfo abs %d len %d\n", 5243 oob_off, oob_len, pktinfo_off, pktinfo_len); 5244 return; 5245 } 5246 } 5247 5248 /* 5249 * Check per-packet-info coverage and find useful per-packet-info. 5250 */ 5251 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 5252 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 5253 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 5254 if (__predict_true(pktinfo_len != 0)) { 5255 bool overlap; 5256 int error; 5257 5258 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 5259 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5260 "pktinfo overflow, msglen %u, " 5261 "pktinfo abs %d len %d\n", 5262 pkt->rm_len, pktinfo_off, pktinfo_len); 5263 return; 5264 } 5265 5266 /* 5267 * Check packet info coverage. 5268 */ 5269 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 5270 data_off, data_len); 5271 if (__predict_false(overlap)) { 5272 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5273 "pktinfo overlap data, pktinfo abs %d len %d, " 5274 "data abs %d len %d\n", 5275 pktinfo_off, pktinfo_len, data_off, data_len); 5276 return; 5277 } 5278 5279 /* 5280 * Find useful per-packet-info. 5281 */ 5282 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 5283 pktinfo_len, &info); 5284 if (__predict_false(error)) { 5285 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 5286 "pktinfo\n"); 5287 return; 5288 } 5289 } 5290 5291 if (__predict_false(data_off + data_len > pkt->rm_len)) { 5292 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 5293 "data overflow, msglen %u, data abs %d len %d\n", 5294 pkt->rm_len, data_off, data_len); 5295 return; 5296 } 5297 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 5298 } 5299 5300 static __inline void 5301 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 5302 { 5303 const struct rndis_msghdr *hdr; 5304 5305 if (__predict_false(dlen < sizeof(*hdr))) { 5306 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 5307 return; 5308 } 5309 hdr = data; 5310 5311 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 5312 /* Hot data path. */ 5313 hn_rndis_rx_data(rxr, data, dlen); 5314 /* Done! 
*/ 5315 return; 5316 } 5317 5318 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 5319 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 5320 else 5321 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 5322 } 5323 5324 static void 5325 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 5326 { 5327 const struct hn_nvs_hdr *hdr; 5328 5329 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 5330 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 5331 return; 5332 } 5333 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 5334 5335 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 5336 /* Useless; ignore */ 5337 return; 5338 } 5339 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 5340 } 5341 5342 static void 5343 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 5344 const struct vmbus_chanpkt_hdr *pkt) 5345 { 5346 struct hn_nvs_sendctx *sndc; 5347 5348 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 5349 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 5350 VMBUS_CHANPKT_DATALEN(pkt)); 5351 /* 5352 * NOTE: 5353 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 5354 * its callback. 5355 */ 5356 } 5357 5358 static void 5359 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5360 const struct vmbus_chanpkt_hdr *pkthdr) 5361 { 5362 const struct vmbus_chanpkt_rxbuf *pkt; 5363 const struct hn_nvs_hdr *nvs_hdr; 5364 int count, i, hlen; 5365 5366 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 5367 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 5368 return; 5369 } 5370 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 5371 5372 /* Make sure that this is a RNDIS message. */ 5373 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 5374 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 5375 nvs_hdr->nvs_type); 5376 return; 5377 } 5378 5379 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 5380 if (__predict_false(hlen < sizeof(*pkt))) { 5381 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 5382 return; 5383 } 5384 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 5385 5386 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 5387 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 5388 pkt->cp_rxbuf_id); 5389 return; 5390 } 5391 5392 count = pkt->cp_rxbuf_cnt; 5393 if (__predict_false(hlen < 5394 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 5395 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 5396 return; 5397 } 5398 5399 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 5400 for (i = 0; i < count; ++i) { 5401 int ofs, len; 5402 5403 ofs = pkt->cp_rxbuf[i].rb_ofs; 5404 len = pkt->cp_rxbuf[i].rb_len; 5405 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 5406 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 5407 "ofs %d, len %d\n", i, ofs, len); 5408 continue; 5409 } 5410 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 5411 } 5412 5413 /* 5414 * Ack the consumed RXBUF associated w/ this channel packet, 5415 * so that this RXBUF can be recycled by the hypervisor. 
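	 * The ack is sent as a completion packet (HN_NVS_TYPE_RNDIS_ACK)
	 * on the same channel, carrying the transaction ID of the RXBUF
	 * packet being acknowledged.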
5416 */ 5417 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 5418 } 5419 5420 static void 5421 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 5422 uint64_t tid) 5423 { 5424 struct hn_nvs_rndis_ack ack; 5425 int retries, error; 5426 5427 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 5428 ack.nvs_status = HN_NVS_STATUS_OK; 5429 5430 retries = 0; 5431 again: 5432 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 5433 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 5434 if (__predict_false(error == EAGAIN)) { 5435 /* 5436 * NOTE: 5437 * This should _not_ happen in real world, since the 5438 * consumption of the TX bufring from the TX path is 5439 * controlled. 5440 */ 5441 if (rxr->hn_ack_failed == 0) 5442 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 5443 rxr->hn_ack_failed++; 5444 retries++; 5445 if (retries < 10) { 5446 DELAY(100); 5447 goto again; 5448 } 5449 /* RXBUF leaks! */ 5450 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 5451 } 5452 } 5453 5454 static void 5455 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 5456 { 5457 struct hn_rx_ring *rxr = xrxr; 5458 struct hn_softc *sc = rxr->hn_ifp->if_softc; 5459 5460 for (;;) { 5461 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 5462 int error, pktlen; 5463 5464 pktlen = rxr->hn_pktbuf_len; 5465 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 5466 if (__predict_false(error == ENOBUFS)) { 5467 void *nbuf; 5468 int nlen; 5469 5470 /* 5471 * Expand channel packet buffer. 5472 * 5473 * XXX 5474 * Use M_WAITOK here, since allocation failure 5475 * is fatal. 5476 */ 5477 nlen = rxr->hn_pktbuf_len * 2; 5478 while (nlen < pktlen) 5479 nlen *= 2; 5480 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 5481 5482 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 5483 rxr->hn_pktbuf_len, nlen); 5484 5485 free(rxr->hn_pktbuf, M_DEVBUF); 5486 rxr->hn_pktbuf = nbuf; 5487 rxr->hn_pktbuf_len = nlen; 5488 /* Retry! */ 5489 continue; 5490 } else if (__predict_false(error == EAGAIN)) { 5491 /* No more channel packets; done! */ 5492 break; 5493 } 5494 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 5495 5496 switch (pkt->cph_type) { 5497 case VMBUS_CHANPKT_TYPE_COMP: 5498 hn_nvs_handle_comp(sc, chan, pkt); 5499 break; 5500 5501 case VMBUS_CHANPKT_TYPE_RXBUF: 5502 hn_nvs_handle_rxbuf(rxr, chan, pkt); 5503 break; 5504 5505 case VMBUS_CHANPKT_TYPE_INBAND: 5506 hn_nvs_handle_notify(sc, pkt); 5507 break; 5508 5509 default: 5510 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 5511 pkt->cph_type); 5512 break; 5513 } 5514 } 5515 hn_chan_rollup(rxr, rxr->hn_txr); 5516 } 5517 5518 static void 5519 hn_tx_taskq_create(void *arg __unused) 5520 { 5521 int i; 5522 5523 /* 5524 * Fix the # of TX taskqueues. 5525 */ 5526 if (hn_tx_taskq_cnt <= 0) 5527 hn_tx_taskq_cnt = 1; 5528 else if (hn_tx_taskq_cnt > mp_ncpus) 5529 hn_tx_taskq_cnt = mp_ncpus; 5530 5531 /* 5532 * Fix the TX taskqueue mode. 
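	 * Unknown modes fall back to HN_TX_TASKQ_M_INDEP.  The global
	 * TX taskqueues below are only created when running on Hyper-V
	 * with HN_TX_TASKQ_M_GLOBAL selected.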
5533 */ 5534 switch (hn_tx_taskq_mode) { 5535 case HN_TX_TASKQ_M_INDEP: 5536 case HN_TX_TASKQ_M_GLOBAL: 5537 case HN_TX_TASKQ_M_EVTTQ: 5538 break; 5539 default: 5540 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 5541 break; 5542 } 5543 5544 if (vm_guest != VM_GUEST_HV) 5545 return; 5546 5547 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 5548 return; 5549 5550 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 5551 M_DEVBUF, M_WAITOK); 5552 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 5553 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 5554 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 5555 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 5556 "hn tx%d", i); 5557 } 5558 } 5559 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5560 hn_tx_taskq_create, NULL); 5561 5562 static void 5563 hn_tx_taskq_destroy(void *arg __unused) 5564 { 5565 5566 if (hn_tx_taskque != NULL) { 5567 int i; 5568 5569 for (i = 0; i < hn_tx_taskq_cnt; ++i) 5570 taskqueue_free(hn_tx_taskque[i]); 5571 free(hn_tx_taskque, M_DEVBUF); 5572 } 5573 } 5574 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, 5575 hn_tx_taskq_destroy, NULL); 5576