/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
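/*
 * Hyper-V Network Virtual Service Client (netvsc), exposed as hn(4).
 * The driver speaks NVS/RNDIS to the host over a VMBus channel and can
 * hand traffic off to an SR-IOV Virtual Function (VF), either in the
 * non-transparent "RXVF" mode or in the transparent VF mode.
 */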

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
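/*
 * For reference, assuming the standard 1500-byte ETHERMTU: the default LRO
 * aggregation length limit above is 25 * 1500 = 37500 bytes, and it is
 * lowered to 12 * 1500 = 18000 bytes when more than one RX ring is in use,
 * so that ACKs are generated in a more timely fashion (see hn_attach()).
 */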
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)	\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)	link;
#endif
	STAILQ_ENTRY(hn_txdesc)	agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc) agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf		*m;
	struct hn_tx_ring	*txr;
	int			refs;
	uint32_t		flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx	send_ctx;
	uint32_t		chim_index;
	int			chim_size;

	bus_dmamap_t		data_dmap;

	bus_addr_t		rndis_pkt_paddr;
	struct rndis_packet_msg	*rndis_pkt;
	bus_dmamap_t		rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST	0x0001
#define HN_TXD_FLAG_DMAMAP	0x0002
#define HN_TXD_FLAG_ONAGG	0x0004

struct hn_rxinfo {
	uint32_t	vlan_info;
	uint32_t	csum_info;
	uint32_t	hash_info;
	uint32_t	hash_value;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN		0x0001
#define HN_RXINFO_CSUM		0x0002
#define HN_RXINFO_HASHINF	0x0004
#define HN_RXINFO_HASHVAL	0x0008
#define HN_RXINFO_ALL		\
	(HN_RXINFO_VLAN |	\
	 HN_RXINFO_CSUM |	\
	 HN_RXINFO_HASHINF |	\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *,
		    void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *,
		    struct ifmediareq *);

static void	hn_ifnet_event(void *, struct ifnet *, int);
static void	hn_ifaddr_event(void *, struct ifnet *);
static void	hn_ifnet_attevent(void *, struct ifnet *);
static void	hn_ifnet_detevent(void *, struct ifnet *);
static void	hn_ifnet_lnkevent(void *, struct ifnet *, int);
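/*
 * SR-IOV VF support: the declarations below cover both the non-transparent
 * RXVF mode (hn_rxvf_*) and the transparent VF mode (hn_xpnt_vf_*).
 */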
static bool	hn_ismyvf(const struct hn_softc *,
		    const struct ifnet *);
static void	hn_rxvf_change(struct hn_softc *,
		    struct ifnet *, bool);
static void	hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void	hn_rxvf_set_task(void *, int);
static void	hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int	hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int	hn_xpnt_vf_iocsetcaps(struct hn_softc *,
		    struct ifreq *);
static void	hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool	hn_xpnt_vf_isready(struct hn_softc *);
static void	hn_xpnt_vf_setready(struct hn_softc *);
static void	hn_xpnt_vf_init_taskfunc(void *, int);
static void	hn_xpnt_vf_init(struct hn_softc *);

static int	hn_rndis_rxinfo(const void *, int,
		    struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *,
		    const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *,
		    const void *, int);
static void	hn_rndis_init_fixat(struct hn_softc *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *,
		    struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *,
		    struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *,
		    int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_disable_rx(struct hn_softc *);
static void	hn_drain_rxtx(struct hn_softc *, int);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);
static void	hn_mtu_change_fixup(struct hn_softc *);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
#ifndef RSS
static int	hn_rss_reconfig(struct hn_softc *);
#endif
static void	hn_rss_ind_fixup(struct hn_softc *);
static int	hn_rxpkt(struct hn_rx_ring *, const void *,
		    int, const struct hn_rxinfo *);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *,
		    struct hn_tx_ring *, struct hn_txdesc *,
		    int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *,
		    struct hn_softc *, struct vmbus_channel *,
		    const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");
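/*
 * NOTE: hn_chan_cnt and hn_tx_ring_cnt are only upper bounds; the actual
 * ring usage is derived in hn_attach(): the RX ring count equals the
 * channel count and defaults to min(mp_ncpus, HN_RING_CNT_DEF_MAX) when
 * the tunable is 0, while the TX ring count is clamped to the RX ring
 * count (and forced to 1 when ifnet.if_start is used).
 */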
/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for the transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
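/*
 * Two transmit paths are used below: hn_txpkt_sglist() hands the RNDIS
 * packet to the host as a list of guest physical addresses (GPAs), while
 * hn_txpkt_chim() copies the packet into a pre-shared "chimney" send
 * buffer slot and only passes the slot index and size.  Aggregated packets
 * always use the chimney path (see hn_set_txagg()).
 */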
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
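/*
 * The helpers below use PULLUP_HDR to make the Ethernet, IP(v6) and TCP
 * headers contiguous in the first mbuf before inspecting them.  For TSO,
 * hn_tso_fixup() clears ip_len/ip_sum (or ip6_plen) and seeds th_sum with
 * the pseudo-header checksum before the packet is handed to the host for
 * segmentation.
 */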
/*
 * NOTE: If this function fails, m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, m_head will have been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
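/*
 * hn_rxfilter_config() maps the ifnet state onto the NDIS packet filter:
 * IFF_PROMISC (or an active non-transparent RXVF) selects
 * NDIS_PACKET_TYPE_PROMISCUOUS; otherwise directed frames are accepted and
 * broadcast/all-multicast bits are added according to IFF_BROADCAST,
 * IFF_ALLMULTI and the multicast address list.
 */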
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

#ifndef RSS
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}
	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
#endif	/* !RSS */

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
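/*
 * Non-transparent (RXVF) mode: when a matching VF comes up, the datapath is
 * switched to the VF and RX traffic is steered to it; the switch is undone
 * when the VF goes down.  hn_rxvf_change() below performs the switch and
 * notifies userland through devctl ("HYPERV_NIC_VF" VF_UP/VF_DOWN).
 */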
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}

static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
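/*
 * Transparent VF mode RX path: hn_xpnt_vf_input() is installed as the VF
 * ifnet's if_input.  It looks up the owning hn(4) ifnet in hn_vfmap by the
 * VF's if_index, taps BPF on the VF, rewrites rcvif, and then feeds the
 * mbuf chain to hn_ifp's if_input, so the stack only sees the synthetic
 * interface.
 */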
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		/*
		 * Fix up rcvif and go through hn(4)'s if_input and
		 * increase ipackets.
		 */
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/* Allow tapping on the VF. */
			ETHER_BPF_MTAP(vf_ifp, mn);
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}

static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}

static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);
}

static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		/*
		 * Resume link status management, which was suspended
		 * by hn_ifnet_attevent().
		 */
		hn_resume_mgmt(sc);
	}

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}
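/*
 * The device type GUID below is stored in wire (little-endian) order: the
 * first three fields of {F8615163-DF3E-46c5-913F-F2D2F965ED0E} appear
 * byte-swapped (63 51 61 F8, 3E DF, c5 46), while the last two fields are
 * kept as-is.
 */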
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions that will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
1857 */ 1858 hn_fixup_tx_data(sc); 1859 1860 ctx = device_get_sysctl_ctx(dev); 1861 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 1862 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 1863 &sc->hn_nvs_ver, 0, "NVS version"); 1864 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 1865 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1866 hn_ndis_version_sysctl, "A", "NDIS version"); 1867 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 1868 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1869 hn_caps_sysctl, "A", "capabilities"); 1870 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 1871 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1872 hn_hwassist_sysctl, "A", "hwassist"); 1873 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 1874 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 1875 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 1876 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 1877 "max # of TSO segments"); 1878 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 1879 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 1880 "max size of TSO segment"); 1881 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 1882 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1883 hn_rxfilter_sysctl, "A", "rxfilter"); 1884 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 1885 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1886 hn_rss_hash_sysctl, "A", "RSS hash"); 1887 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 1888 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 1889 #ifndef RSS 1890 /* 1891 * Don't allow RSS key/indirect table changes, if RSS is defined. 1892 */ 1893 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 1894 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1895 hn_rss_key_sysctl, "IU", "RSS key"); 1896 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 1897 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1898 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 1899 #endif 1900 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 1901 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 1902 "RNDIS offered packet transmission aggregation size limit"); 1903 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 1904 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 1905 "RNDIS offered packet transmission aggregation count limit"); 1906 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 1907 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 1908 "RNDIS packet transmission aggregation alignment"); 1909 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 1910 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1911 hn_txagg_size_sysctl, "I", 1912 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 1913 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 1914 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1915 hn_txagg_pkts_sysctl, "I", 1916 "Packet transmission aggregation packets, " 1917 "0 -- disable, -1 -- auto"); 1918 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 1919 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1920 hn_polling_sysctl, "I", 1921 "Polling frequency: [100,1000000], 0 disable polling"); 1922 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 1923 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1924 hn_vf_sysctl, "A", "Virtual Function's name"); 1925 if (!hn_xpnt_vf) { 1926 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 1927 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1928 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 1929 } else { 1930 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, 
"vf_xpnt_enabled", 1931 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 1932 hn_xpnt_vf_enabled_sysctl, "I", 1933 "Transparent VF enabled"); 1934 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 1935 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 1936 hn_xpnt_vf_accbpf_sysctl, "I", 1937 "Accurate BPF for transparent VF"); 1938 } 1939 1940 /* 1941 * Setup the ifmedia, which has been initialized earlier. 1942 */ 1943 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 1944 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 1945 /* XXX ifmedia_set really should do this for us */ 1946 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 1947 1948 /* 1949 * Setup the ifnet for this interface. 1950 */ 1951 1952 ifp->if_baudrate = IF_Gbps(10); 1953 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 1954 ifp->if_ioctl = hn_ioctl; 1955 ifp->if_init = hn_init; 1956 #ifdef HN_IFSTART_SUPPORT 1957 if (hn_use_if_start) { 1958 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 1959 1960 ifp->if_start = hn_start; 1961 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 1962 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 1963 IFQ_SET_READY(&ifp->if_snd); 1964 } else 1965 #endif 1966 { 1967 ifp->if_transmit = hn_transmit; 1968 ifp->if_qflush = hn_xmit_qflush; 1969 } 1970 1971 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 1972 #ifdef foo 1973 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 1974 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 1975 #endif 1976 if (sc->hn_caps & HN_CAP_VLAN) { 1977 /* XXX not sure about VLAN_MTU. */ 1978 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 1979 } 1980 1981 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 1982 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 1983 ifp->if_capabilities |= IFCAP_TXCSUM; 1984 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 1985 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 1986 if (sc->hn_caps & HN_CAP_TSO4) { 1987 ifp->if_capabilities |= IFCAP_TSO4; 1988 ifp->if_hwassist |= CSUM_IP_TSO; 1989 } 1990 if (sc->hn_caps & HN_CAP_TSO6) { 1991 ifp->if_capabilities |= IFCAP_TSO6; 1992 ifp->if_hwassist |= CSUM_IP6_TSO; 1993 } 1994 1995 /* Enable all available capabilities by default. */ 1996 ifp->if_capenable = ifp->if_capabilities; 1997 1998 /* 1999 * Disable IPv6 TSO and TXCSUM by default, they still can 2000 * be enabled through SIOCSIFCAP. 2001 */ 2002 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2003 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2004 2005 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2006 /* 2007 * Lock hn_set_tso_maxsize() to simplify its 2008 * internal logic. 2009 */ 2010 HN_LOCK(sc); 2011 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2012 HN_UNLOCK(sc); 2013 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2014 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2015 } 2016 2017 ether_ifattach(ifp, eaddr); 2018 2019 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2020 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2021 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2022 } 2023 2024 /* Inform the upper layer about the long frame support. */ 2025 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2026 2027 /* 2028 * Kick off link status check. 
2029 */ 2030 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2031 hn_update_link_status(sc); 2032 2033 if (!hn_xpnt_vf) { 2034 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2035 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2036 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2037 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2038 } else { 2039 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2040 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2041 } 2042 2043 /* 2044 * NOTE: 2045 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2046 * since interface's LLADDR is needed; interface LLADDR is not 2047 * available when ifnet_arrival event is triggered. 2048 */ 2049 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2050 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2051 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2052 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2053 2054 return (0); 2055 failed: 2056 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2057 hn_synth_detach(sc); 2058 hn_detach(dev); 2059 return (error); 2060 } 2061 2062 static int 2063 hn_detach(device_t dev) 2064 { 2065 struct hn_softc *sc = device_get_softc(dev); 2066 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2067 2068 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2069 /* 2070 * In case that the vmbus missed the orphan handler 2071 * installation. 2072 */ 2073 vmbus_xact_ctx_orphan(sc->hn_xact); 2074 } 2075 2076 if (sc->hn_ifaddr_evthand != NULL) 2077 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2078 if (sc->hn_ifnet_evthand != NULL) 2079 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2080 if (sc->hn_ifnet_atthand != NULL) { 2081 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2082 sc->hn_ifnet_atthand); 2083 } 2084 if (sc->hn_ifnet_dethand != NULL) { 2085 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2086 sc->hn_ifnet_dethand); 2087 } 2088 if (sc->hn_ifnet_lnkhand != NULL) 2089 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2090 2091 vf_ifp = sc->hn_vf_ifp; 2092 __compiler_membar(); 2093 if (vf_ifp != NULL) 2094 hn_ifnet_detevent(sc, vf_ifp); 2095 2096 if (device_is_attached(dev)) { 2097 HN_LOCK(sc); 2098 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2099 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2100 hn_stop(sc, true); 2101 /* 2102 * NOTE: 2103 * hn_stop() only suspends data, so managment 2104 * stuffs have to be suspended manually here. 2105 */ 2106 hn_suspend_mgmt(sc); 2107 hn_synth_detach(sc); 2108 } 2109 HN_UNLOCK(sc); 2110 ether_ifdetach(ifp); 2111 } 2112 2113 ifmedia_removeall(&sc->hn_media); 2114 hn_destroy_rx_data(sc); 2115 hn_destroy_tx_data(sc); 2116 2117 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2118 int i; 2119 2120 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2121 taskqueue_free(sc->hn_tx_taskqs[i]); 2122 free(sc->hn_tx_taskqs, M_DEVBUF); 2123 } 2124 taskqueue_free(sc->hn_mgmt_taskq0); 2125 if (sc->hn_vf_taskq != NULL) 2126 taskqueue_free(sc->hn_vf_taskq); 2127 2128 if (sc->hn_xact != NULL) { 2129 /* 2130 * Uninstall the orphan handler _before_ the xact is 2131 * destructed. 
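 *
 * This mirrors the attach path, where the orphan handler was installed
 * _before_ the revocation check; tearing things down in the reverse
 * order presumably keeps a late channel revocation from delivering an
 * orphan notification against an xact context that is already
 * destroyed.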
2132 */ 2133 vmbus_chan_unset_orphan(sc->hn_prichan); 2134 vmbus_xact_ctx_destroy(sc->hn_xact); 2135 } 2136 2137 if_free(ifp); 2138 2139 HN_LOCK_DESTROY(sc); 2140 rm_destroy(&sc->hn_vf_lock); 2141 return (0); 2142 } 2143 2144 static int 2145 hn_shutdown(device_t dev) 2146 { 2147 2148 return (0); 2149 } 2150 2151 static void 2152 hn_link_status(struct hn_softc *sc) 2153 { 2154 uint32_t link_status; 2155 int error; 2156 2157 error = hn_rndis_get_linkstatus(sc, &link_status); 2158 if (error) { 2159 /* XXX what to do? */ 2160 return; 2161 } 2162 2163 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2164 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2165 else 2166 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2167 if_link_state_change(sc->hn_ifp, 2168 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2169 LINK_STATE_UP : LINK_STATE_DOWN); 2170 } 2171 2172 static void 2173 hn_link_taskfunc(void *xsc, int pending __unused) 2174 { 2175 struct hn_softc *sc = xsc; 2176 2177 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2178 return; 2179 hn_link_status(sc); 2180 } 2181 2182 static void 2183 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2184 { 2185 struct hn_softc *sc = xsc; 2186 2187 /* Prevent any link status checks from running. */ 2188 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2189 2190 /* 2191 * Fake up a [link down --> link up] state change; 5 seconds 2192 * delay is used, which closely simulates miibus reaction 2193 * upon link down event. 2194 */ 2195 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2196 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2197 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2198 &sc->hn_netchg_status, 5 * hz); 2199 } 2200 2201 static void 2202 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2203 { 2204 struct hn_softc *sc = xsc; 2205 2206 /* Re-allow link status checks. 
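 *
 * This completes the network-change sequence started in
 * hn_netchg_init_taskfunc(): NETCHG was set and a fake link-down was
 * reported there, hn_link_taskfunc() bails out while NETCHG is set, and
 * this delayed task (scheduled 5 * hz later) clears NETCHG and re-reads
 * the real link state.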
*/ 2207 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2208 hn_link_status(sc); 2209 } 2210 2211 static void 2212 hn_update_link_status(struct hn_softc *sc) 2213 { 2214 2215 if (sc->hn_mgmt_taskq != NULL) 2216 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2217 } 2218 2219 static void 2220 hn_change_network(struct hn_softc *sc) 2221 { 2222 2223 if (sc->hn_mgmt_taskq != NULL) 2224 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2225 } 2226 2227 static __inline int 2228 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2229 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2230 { 2231 struct mbuf *m = *m_head; 2232 int error; 2233 2234 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2235 2236 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2237 m, segs, nsegs, BUS_DMA_NOWAIT); 2238 if (error == EFBIG) { 2239 struct mbuf *m_new; 2240 2241 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2242 if (m_new == NULL) 2243 return ENOBUFS; 2244 else 2245 *m_head = m = m_new; 2246 txr->hn_tx_collapsed++; 2247 2248 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2249 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2250 } 2251 if (!error) { 2252 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2253 BUS_DMASYNC_PREWRITE); 2254 txd->flags |= HN_TXD_FLAG_DMAMAP; 2255 } 2256 return error; 2257 } 2258 2259 static __inline int 2260 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2261 { 2262 2263 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2264 ("put an onlist txd %#x", txd->flags)); 2265 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2266 ("put an onagg txd %#x", txd->flags)); 2267 2268 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2269 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2270 return 0; 2271 2272 if (!STAILQ_EMPTY(&txd->agg_list)) { 2273 struct hn_txdesc *tmp_txd; 2274 2275 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2276 int freed; 2277 2278 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2279 ("resursive aggregation on aggregated txdesc")); 2280 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2281 ("not aggregated txdesc")); 2282 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2283 ("aggregated txdesc uses dmamap")); 2284 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2285 ("aggregated txdesc consumes " 2286 "chimney sending buffer")); 2287 KASSERT(tmp_txd->chim_size == 0, 2288 ("aggregated txdesc has non-zero " 2289 "chimney sending size")); 2290 2291 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2292 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2293 freed = hn_txdesc_put(txr, tmp_txd); 2294 KASSERT(freed, ("failed to free aggregated txdesc")); 2295 } 2296 } 2297 2298 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2299 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2300 ("chim txd uses dmamap")); 2301 hn_chim_free(txr->hn_sc, txd->chim_index); 2302 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2303 txd->chim_size = 0; 2304 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2305 bus_dmamap_sync(txr->hn_tx_data_dtag, 2306 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2307 bus_dmamap_unload(txr->hn_tx_data_dtag, 2308 txd->data_dmap); 2309 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2310 } 2311 2312 if (txd->m != NULL) { 2313 m_freem(txd->m); 2314 txd->m = NULL; 2315 } 2316 2317 txd->flags |= HN_TXD_FLAG_ONLIST; 2318 #ifndef HN_USE_TXDESC_BUFRING 2319 mtx_lock_spin(&txr->hn_txlist_spin); 2320 KASSERT(txr->hn_txdesc_avail >= 0 && 2321 txr->hn_txdesc_avail < 
txr->hn_txdesc_cnt, 2322 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2323 txr->hn_txdesc_avail++; 2324 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2325 mtx_unlock_spin(&txr->hn_txlist_spin); 2326 #else /* HN_USE_TXDESC_BUFRING */ 2327 #ifdef HN_DEBUG 2328 atomic_add_int(&txr->hn_txdesc_avail, 1); 2329 #endif 2330 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2331 #endif /* !HN_USE_TXDESC_BUFRING */ 2332 2333 return 1; 2334 } 2335 2336 static __inline struct hn_txdesc * 2337 hn_txdesc_get(struct hn_tx_ring *txr) 2338 { 2339 struct hn_txdesc *txd; 2340 2341 #ifndef HN_USE_TXDESC_BUFRING 2342 mtx_lock_spin(&txr->hn_txlist_spin); 2343 txd = SLIST_FIRST(&txr->hn_txlist); 2344 if (txd != NULL) { 2345 KASSERT(txr->hn_txdesc_avail > 0, 2346 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2347 txr->hn_txdesc_avail--; 2348 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2349 } 2350 mtx_unlock_spin(&txr->hn_txlist_spin); 2351 #else 2352 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2353 #endif 2354 2355 if (txd != NULL) { 2356 #ifdef HN_USE_TXDESC_BUFRING 2357 #ifdef HN_DEBUG 2358 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2359 #endif 2360 #endif /* HN_USE_TXDESC_BUFRING */ 2361 KASSERT(txd->m == NULL && txd->refs == 0 && 2362 STAILQ_EMPTY(&txd->agg_list) && 2363 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2364 txd->chim_size == 0 && 2365 (txd->flags & HN_TXD_FLAG_ONLIST) && 2366 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2367 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2368 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2369 txd->refs = 1; 2370 } 2371 return txd; 2372 } 2373 2374 static __inline void 2375 hn_txdesc_hold(struct hn_txdesc *txd) 2376 { 2377 2378 /* 0->1 transition will never work */ 2379 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2380 atomic_add_int(&txd->refs, 1); 2381 } 2382 2383 static __inline void 2384 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2385 { 2386 2387 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2388 ("recursive aggregation on aggregating txdesc")); 2389 2390 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2391 ("already aggregated")); 2392 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2393 ("recursive aggregation on to-be-aggregated txdesc")); 2394 2395 txd->flags |= HN_TXD_FLAG_ONAGG; 2396 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2397 } 2398 2399 static bool 2400 hn_tx_ring_pending(struct hn_tx_ring *txr) 2401 { 2402 bool pending = false; 2403 2404 #ifndef HN_USE_TXDESC_BUFRING 2405 mtx_lock_spin(&txr->hn_txlist_spin); 2406 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2407 pending = true; 2408 mtx_unlock_spin(&txr->hn_txlist_spin); 2409 #else 2410 if (!buf_ring_full(txr->hn_txdesc_br)) 2411 pending = true; 2412 #endif 2413 return (pending); 2414 } 2415 2416 static __inline void 2417 hn_txeof(struct hn_tx_ring *txr) 2418 { 2419 txr->hn_has_txeof = 0; 2420 txr->hn_txeof(txr); 2421 } 2422 2423 static void 2424 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2425 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2426 { 2427 struct hn_txdesc *txd = sndc->hn_cbarg; 2428 struct hn_tx_ring *txr; 2429 2430 txr = txd->txr; 2431 KASSERT(txr->hn_chan == chan, 2432 ("channel mismatch, on chan%u, should be chan%u", 2433 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2434 2435 txr->hn_has_txeof = 1; 2436 hn_txdesc_put(txr, txd); 2437 2438 ++txr->hn_txdone_cnt; 2439 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2440 txr->hn_txdone_cnt = 0; 2441 if 
(txr->hn_oactive) 2442 hn_txeof(txr); 2443 } 2444 } 2445 2446 static void 2447 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2448 { 2449 #if defined(INET) || defined(INET6) 2450 tcp_lro_flush_all(&rxr->hn_lro); 2451 #endif 2452 2453 /* 2454 * NOTE: 2455 * 'txr' could be NULL, if multiple channels and 2456 * ifnet.if_start method are enabled. 2457 */ 2458 if (txr == NULL || !txr->hn_has_txeof) 2459 return; 2460 2461 txr->hn_txdone_cnt = 0; 2462 hn_txeof(txr); 2463 } 2464 2465 static __inline uint32_t 2466 hn_rndis_pktmsg_offset(uint32_t ofs) 2467 { 2468 2469 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2470 ("invalid RNDIS packet msg offset %u", ofs)); 2471 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2472 } 2473 2474 static __inline void * 2475 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2476 size_t pi_dlen, uint32_t pi_type) 2477 { 2478 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2479 struct rndis_pktinfo *pi; 2480 2481 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2482 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2483 2484 /* 2485 * Per-packet-info does not move; it only grows. 2486 * 2487 * NOTE: 2488 * rm_pktinfooffset in this phase counts from the beginning 2489 * of rndis_packet_msg. 2490 */ 2491 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2492 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2493 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2494 pkt->rm_pktinfolen); 2495 pkt->rm_pktinfolen += pi_size; 2496 2497 pi->rm_size = pi_size; 2498 pi->rm_type = pi_type; 2499 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2500 2501 return (pi->rm_data); 2502 } 2503 2504 static __inline int 2505 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2506 { 2507 struct hn_txdesc *txd; 2508 struct mbuf *m; 2509 int error, pkts; 2510 2511 txd = txr->hn_agg_txd; 2512 KASSERT(txd != NULL, ("no aggregate txdesc")); 2513 2514 /* 2515 * Since hn_txpkt() will reset this temporary stat, save 2516 * it now, so that oerrors can be updated properly, if 2517 * hn_txpkt() ever fails. 2518 */ 2519 pkts = txr->hn_stat_pkts; 2520 2521 /* 2522 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2523 * failure, save it for later freeing, if hn_txpkt() ever 2524 * fails. 2525 */ 2526 m = txd->m; 2527 error = hn_txpkt(ifp, txr, txd); 2528 if (__predict_false(error)) { 2529 /* txd is freed, but m is not. */ 2530 m_freem(m); 2531 2532 txr->hn_flush_failed++; 2533 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2534 } 2535 2536 /* Reset all aggregation states. */ 2537 txr->hn_agg_txd = NULL; 2538 txr->hn_agg_szleft = 0; 2539 txr->hn_agg_pktleft = 0; 2540 txr->hn_agg_prevpkt = NULL; 2541 2542 return (error); 2543 } 2544 2545 static void * 2546 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2547 int pktsize) 2548 { 2549 void *chim; 2550 2551 if (txr->hn_agg_txd != NULL) { 2552 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2553 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2554 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2555 int olen; 2556 2557 /* 2558 * Update the previous RNDIS packet's total length, 2559 * it can be increased due to the mandatory alignment 2560 * padding for this RNDIS packet. And update the 2561 * aggregating txdesc's chimney sending buffer size 2562 * accordingly. 2563 * 2564 * XXX 2565 * Zero-out the padding, as required by the RNDIS spec. 
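 *
 * For illustration (numbers are hypothetical): with hn_agg_align == 8
 * and a previous packet of rm_len == 121, roundup2(121, 8) == 128, so
 * chim_size grows by 7 pad bytes and the next RNDIS packet starts at
 * pkt + 128 within the same chimney buffer.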
2566 */ 2567 olen = pkt->rm_len; 2568 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2569 agg_txd->chim_size += pkt->rm_len - olen; 2570 2571 /* Link this txdesc to the parent. */ 2572 hn_txdesc_agg(agg_txd, txd); 2573 2574 chim = (uint8_t *)pkt + pkt->rm_len; 2575 /* Save the current packet for later fixup. */ 2576 txr->hn_agg_prevpkt = chim; 2577 2578 txr->hn_agg_pktleft--; 2579 txr->hn_agg_szleft -= pktsize; 2580 if (txr->hn_agg_szleft <= 2581 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 2582 /* 2583 * Probably can't aggregate more packets, 2584 * flush this aggregating txdesc proactively. 2585 */ 2586 txr->hn_agg_pktleft = 0; 2587 } 2588 /* Done! */ 2589 return (chim); 2590 } 2591 hn_flush_txagg(ifp, txr); 2592 } 2593 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 2594 2595 txr->hn_tx_chimney_tried++; 2596 txd->chim_index = hn_chim_alloc(txr->hn_sc); 2597 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 2598 return (NULL); 2599 txr->hn_tx_chimney++; 2600 2601 chim = txr->hn_sc->hn_chim + 2602 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 2603 2604 if (txr->hn_agg_pktmax > 1 && 2605 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 2606 txr->hn_agg_txd = txd; 2607 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 2608 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 2609 txr->hn_agg_prevpkt = chim; 2610 } 2611 return (chim); 2612 } 2613 2614 /* 2615 * NOTE: 2616 * If this function fails, then both txd and m_head0 will be freed. 2617 */ 2618 static int 2619 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2620 struct mbuf **m_head0) 2621 { 2622 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 2623 int error, nsegs, i; 2624 struct mbuf *m_head = *m_head0; 2625 struct rndis_packet_msg *pkt; 2626 uint32_t *pi_data; 2627 void *chim = NULL; 2628 int pkt_hlen, pkt_size; 2629 2630 pkt = txd->rndis_pkt; 2631 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 2632 if (pkt_size < txr->hn_chim_size) { 2633 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 2634 if (chim != NULL) 2635 pkt = chim; 2636 } else { 2637 if (txr->hn_agg_txd != NULL) 2638 hn_flush_txagg(ifp, txr); 2639 } 2640 2641 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 2642 pkt->rm_len = m_head->m_pkthdr.len; 2643 pkt->rm_dataoffset = 0; 2644 pkt->rm_datalen = m_head->m_pkthdr.len; 2645 pkt->rm_oobdataoffset = 0; 2646 pkt->rm_oobdatalen = 0; 2647 pkt->rm_oobdataelements = 0; 2648 pkt->rm_pktinfooffset = sizeof(*pkt); 2649 pkt->rm_pktinfolen = 0; 2650 pkt->rm_vchandle = 0; 2651 pkt->rm_reserved = 0; 2652 2653 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 2654 /* 2655 * Set the hash value for this packet, so that the host could 2656 * dispatch the TX done event for this packet back to this TX 2657 * ring's channel. 
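 *
 * The "hash value" written below is simply the TX ring index, not a
 * real RSS hash; it only needs to be stable per ring so that the
 * completion comes back on the channel that queued the packet.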
2658 */ 2659 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 2660 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 2661 *pi_data = txr->hn_tx_idx; 2662 } 2663 2664 if (m_head->m_flags & M_VLANTAG) { 2665 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 2666 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 2667 *pi_data = NDIS_VLAN_INFO_MAKE( 2668 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 2669 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 2670 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 2671 } 2672 2673 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 2674 #if defined(INET6) || defined(INET) 2675 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 2676 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 2677 #ifdef INET 2678 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 2679 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, 2680 m_head->m_pkthdr.tso_segsz); 2681 } 2682 #endif 2683 #if defined(INET6) && defined(INET) 2684 else 2685 #endif 2686 #ifdef INET6 2687 { 2688 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, 2689 m_head->m_pkthdr.tso_segsz); 2690 } 2691 #endif 2692 #endif /* INET6 || INET */ 2693 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 2694 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 2695 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 2696 if (m_head->m_pkthdr.csum_flags & 2697 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 2698 *pi_data = NDIS_TXCSUM_INFO_IPV6; 2699 } else { 2700 *pi_data = NDIS_TXCSUM_INFO_IPV4; 2701 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 2702 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 2703 } 2704 2705 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) 2706 *pi_data |= NDIS_TXCSUM_INFO_TCPCS; 2707 else if (m_head->m_pkthdr.csum_flags & 2708 (CSUM_IP_UDP | CSUM_IP6_UDP)) 2709 *pi_data |= NDIS_TXCSUM_INFO_UDPCS; 2710 } 2711 2712 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 2713 /* Fixup RNDIS packet message total length */ 2714 pkt->rm_len += pkt_hlen; 2715 /* Convert RNDIS packet message offsets */ 2716 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 2717 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 2718 2719 /* 2720 * Fast path: Chimney sending. 2721 */ 2722 if (chim != NULL) { 2723 struct hn_txdesc *tgt_txd = txd; 2724 2725 if (txr->hn_agg_txd != NULL) { 2726 tgt_txd = txr->hn_agg_txd; 2727 #ifdef INVARIANTS 2728 *m_head0 = NULL; 2729 #endif 2730 } 2731 2732 KASSERT(pkt == chim, 2733 ("RNDIS pkt not in chimney sending buffer")); 2734 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 2735 ("chimney sending buffer is not used")); 2736 tgt_txd->chim_size += pkt->rm_len; 2737 2738 m_copydata(m_head, 0, m_head->m_pkthdr.len, 2739 ((uint8_t *)chim) + pkt_hlen); 2740 2741 txr->hn_gpa_cnt = 0; 2742 txr->hn_sendpkt = hn_txpkt_chim; 2743 goto done; 2744 } 2745 2746 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 2747 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2748 ("chimney buffer is used")); 2749 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 2750 2751 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 2752 if (__predict_false(error)) { 2753 int freed; 2754 2755 /* 2756 * This mbuf is not linked w/ the txd yet, so free it now. 
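 *
 * Ownership here is the mirror image of the success path: txd->m is
 * only set at the "done:" label, so hn_txdesc_put() below would not
 * free the mbuf for us; the explicit m_freem() plus clearing *m_head0
 * keeps the caller from reusing a stale pointer, matching the NOTE
 * above this function (on failure both txd and the mbuf are freed).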
2757 */ 2758 m_freem(m_head); 2759 *m_head0 = NULL; 2760 2761 freed = hn_txdesc_put(txr, txd); 2762 KASSERT(freed != 0, 2763 ("fail to free txd upon txdma error")); 2764 2765 txr->hn_txdma_failed++; 2766 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 2767 return error; 2768 } 2769 *m_head0 = m_head; 2770 2771 /* +1 RNDIS packet message */ 2772 txr->hn_gpa_cnt = nsegs + 1; 2773 2774 /* send packet with page buffer */ 2775 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 2776 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 2777 txr->hn_gpa[0].gpa_len = pkt_hlen; 2778 2779 /* 2780 * Fill the page buffers with mbuf info after the page 2781 * buffer for RNDIS packet message. 2782 */ 2783 for (i = 0; i < nsegs; ++i) { 2784 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 2785 2786 gpa->gpa_page = atop(segs[i].ds_addr); 2787 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 2788 gpa->gpa_len = segs[i].ds_len; 2789 } 2790 2791 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2792 txd->chim_size = 0; 2793 txr->hn_sendpkt = hn_txpkt_sglist; 2794 done: 2795 txd->m = m_head; 2796 2797 /* Set the completion routine */ 2798 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 2799 2800 /* Update temporary stats for later use. */ 2801 txr->hn_stat_pkts++; 2802 txr->hn_stat_size += m_head->m_pkthdr.len; 2803 if (m_head->m_flags & M_MCAST) 2804 txr->hn_stat_mcasts++; 2805 2806 return 0; 2807 } 2808 2809 /* 2810 * NOTE: 2811 * If this function fails, then txd will be freed, but the mbuf 2812 * associated w/ the txd will _not_ be freed. 2813 */ 2814 static int 2815 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 2816 { 2817 int error, send_failed = 0, has_bpf; 2818 2819 again: 2820 has_bpf = bpf_peers_present(ifp->if_bpf); 2821 if (has_bpf) { 2822 /* 2823 * Make sure that this txd and any aggregated txds are not 2824 * freed before ETHER_BPF_MTAP. 2825 */ 2826 hn_txdesc_hold(txd); 2827 } 2828 error = txr->hn_sendpkt(txr, txd); 2829 if (!error) { 2830 if (has_bpf) { 2831 const struct hn_txdesc *tmp_txd; 2832 2833 ETHER_BPF_MTAP(ifp, txd->m); 2834 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 2835 ETHER_BPF_MTAP(ifp, tmp_txd->m); 2836 } 2837 2838 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 2839 #ifdef HN_IFSTART_SUPPORT 2840 if (!hn_use_if_start) 2841 #endif 2842 { 2843 if_inc_counter(ifp, IFCOUNTER_OBYTES, 2844 txr->hn_stat_size); 2845 if (txr->hn_stat_mcasts != 0) { 2846 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 2847 txr->hn_stat_mcasts); 2848 } 2849 } 2850 txr->hn_pkts += txr->hn_stat_pkts; 2851 txr->hn_sends++; 2852 } 2853 if (has_bpf) 2854 hn_txdesc_put(txr, txd); 2855 2856 if (__predict_false(error)) { 2857 int freed; 2858 2859 /* 2860 * This should "really rarely" happen. 2861 * 2862 * XXX Too many RX to be acked or too many sideband 2863 * commands to run? Ask netvsc_channel_rollup() 2864 * to kick start later. 2865 */ 2866 txr->hn_has_txeof = 1; 2867 if (!send_failed) { 2868 txr->hn_send_failed++; 2869 send_failed = 1; 2870 /* 2871 * Try sending again after set hn_has_txeof; 2872 * in case that we missed the last 2873 * netvsc_channel_rollup(). 2874 */ 2875 goto again; 2876 } 2877 if_printf(ifp, "send failed\n"); 2878 2879 /* 2880 * Caller will perform further processing on the 2881 * associated mbuf, so don't free it in hn_txdesc_put(); 2882 * only unload it from the DMA map in hn_txdesc_put(), 2883 * if it was loaded. 
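 *
 * Contrast with the hn_encap() failure path: clearing txd->m means
 * hn_txdesc_put() merely unloads the DMA map (if it was loaded) and
 * recycles the descriptor, honoring the NOTE above hn_txpkt() that the
 * mbuf is _not_ freed on a send error, so the caller may still requeue
 * or drop it.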
2884 */ 2885 txd->m = NULL; 2886 freed = hn_txdesc_put(txr, txd); 2887 KASSERT(freed != 0, 2888 ("fail to free txd upon send error")); 2889 2890 txr->hn_send_failed++; 2891 } 2892 2893 /* Reset temporary stats, after this sending is done. */ 2894 txr->hn_stat_size = 0; 2895 txr->hn_stat_pkts = 0; 2896 txr->hn_stat_mcasts = 0; 2897 2898 return (error); 2899 } 2900 2901 /* 2902 * Append the specified data to the indicated mbuf chain, 2903 * Extend the mbuf chain if the new data does not fit in 2904 * existing space. 2905 * 2906 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 2907 * There should be an equivalent in the kernel mbuf code, 2908 * but there does not appear to be one yet. 2909 * 2910 * Differs from m_append() in that additional mbufs are 2911 * allocated with cluster size MJUMPAGESIZE, and filled 2912 * accordingly. 2913 * 2914 * Return 1 if able to complete the job; otherwise 0. 2915 */ 2916 static int 2917 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 2918 { 2919 struct mbuf *m, *n; 2920 int remainder, space; 2921 2922 for (m = m0; m->m_next != NULL; m = m->m_next) 2923 ; 2924 remainder = len; 2925 space = M_TRAILINGSPACE(m); 2926 if (space > 0) { 2927 /* 2928 * Copy into available space. 2929 */ 2930 if (space > remainder) 2931 space = remainder; 2932 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 2933 m->m_len += space; 2934 cp += space; 2935 remainder -= space; 2936 } 2937 while (remainder > 0) { 2938 /* 2939 * Allocate a new mbuf; could check space 2940 * and allocate a cluster instead. 2941 */ 2942 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 2943 if (n == NULL) 2944 break; 2945 n->m_len = min(MJUMPAGESIZE, remainder); 2946 bcopy(cp, mtod(n, caddr_t), n->m_len); 2947 cp += n->m_len; 2948 remainder -= n->m_len; 2949 m->m_next = n; 2950 m = n; 2951 } 2952 if (m0->m_flags & M_PKTHDR) 2953 m0->m_pkthdr.len += len - remainder; 2954 2955 return (remainder == 0); 2956 } 2957 2958 #if defined(INET) || defined(INET6) 2959 static __inline int 2960 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 2961 { 2962 #if __FreeBSD_version >= 1100095 2963 if (hn_lro_mbufq_depth) { 2964 tcp_lro_queue_mbuf(lc, m); 2965 return 0; 2966 } 2967 #endif 2968 return tcp_lro_rx(lc, m, 0); 2969 } 2970 #endif 2971 2972 static int 2973 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 2974 const struct hn_rxinfo *info) 2975 { 2976 struct ifnet *ifp; 2977 struct mbuf *m_new; 2978 int size, do_lro = 0, do_csum = 1; 2979 int hash_type; 2980 2981 /* If the VF is active, inject the packet through the VF */ 2982 ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp; 2983 2984 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 2985 /* 2986 * NOTE: 2987 * See the NOTE of hn_rndis_init_fixat(). This 2988 * function can be reached, immediately after the 2989 * RNDIS is initialized but before the ifnet is 2990 * setup on the hn_attach() path; drop the unexpected 2991 * packets. 2992 */ 2993 return (0); 2994 } 2995 2996 if (dlen <= MHLEN) { 2997 m_new = m_gethdr(M_NOWAIT, MT_DATA); 2998 if (m_new == NULL) { 2999 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 3000 return (0); 3001 } 3002 memcpy(mtod(m_new, void *), data, dlen); 3003 m_new->m_pkthdr.len = m_new->m_len = dlen; 3004 rxr->hn_small_pkts++; 3005 } else { 3006 /* 3007 * Get an mbuf with a cluster. For packets 2K or less, 3008 * get a standard 2K cluster. For anything larger, get a 3009 * 4K cluster. Any buffers larger than 4K can cause problems 3010 * if looped around to the Hyper-V TX channel, so avoid them. 
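 *
 * The resulting receive-copy policy is:
 *   dlen <= MHLEN    -> plain mbuf, filled with memcpy()
 *   dlen <= MCLBYTES -> one 2KB cluster
 *   otherwise        -> a 4KB (MJUMPAGESIZE) cluster, with
 *                       hv_m_append() chaining further 4KB clusters
 *                       if the data still does not fit.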
3011 */ 3012 size = MCLBYTES; 3013 if (dlen > MCLBYTES) { 3014 /* 4096 */ 3015 size = MJUMPAGESIZE; 3016 } 3017 3018 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3019 if (m_new == NULL) { 3020 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); 3021 return (0); 3022 } 3023 3024 hv_m_append(m_new, dlen, data); 3025 } 3026 m_new->m_pkthdr.rcvif = ifp; 3027 3028 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3029 do_csum = 0; 3030 3031 /* receive side checksum offload */ 3032 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 3033 /* IP csum offload */ 3034 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3035 m_new->m_pkthdr.csum_flags |= 3036 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3037 rxr->hn_csum_ip++; 3038 } 3039 3040 /* TCP/UDP csum offload */ 3041 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 3042 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3043 m_new->m_pkthdr.csum_flags |= 3044 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3045 m_new->m_pkthdr.csum_data = 0xffff; 3046 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 3047 rxr->hn_csum_tcp++; 3048 else 3049 rxr->hn_csum_udp++; 3050 } 3051 3052 /* 3053 * XXX 3054 * As of this write (Oct 28th, 2016), host side will turn 3055 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3056 * the do_lro setting here is actually _not_ accurate. We 3057 * depend on the RSS hash type check to reset do_lro. 3058 */ 3059 if ((info->csum_info & 3060 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3061 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3062 do_lro = 1; 3063 } else { 3064 const struct ether_header *eh; 3065 uint16_t etype; 3066 int hoff; 3067 3068 hoff = sizeof(*eh); 3069 if (m_new->m_len < hoff) 3070 goto skip; 3071 eh = mtod(m_new, struct ether_header *); 3072 etype = ntohs(eh->ether_type); 3073 if (etype == ETHERTYPE_VLAN) { 3074 const struct ether_vlan_header *evl; 3075 3076 hoff = sizeof(*evl); 3077 if (m_new->m_len < hoff) 3078 goto skip; 3079 evl = mtod(m_new, struct ether_vlan_header *); 3080 etype = ntohs(evl->evl_proto); 3081 } 3082 3083 if (etype == ETHERTYPE_IP) { 3084 int pr; 3085 3086 pr = hn_check_iplen(m_new, hoff); 3087 if (pr == IPPROTO_TCP) { 3088 if (do_csum && 3089 (rxr->hn_trust_hcsum & 3090 HN_TRUST_HCSUM_TCP)) { 3091 rxr->hn_csum_trusted++; 3092 m_new->m_pkthdr.csum_flags |= 3093 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3094 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3095 m_new->m_pkthdr.csum_data = 0xffff; 3096 } 3097 do_lro = 1; 3098 } else if (pr == IPPROTO_UDP) { 3099 if (do_csum && 3100 (rxr->hn_trust_hcsum & 3101 HN_TRUST_HCSUM_UDP)) { 3102 rxr->hn_csum_trusted++; 3103 m_new->m_pkthdr.csum_flags |= 3104 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3105 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3106 m_new->m_pkthdr.csum_data = 0xffff; 3107 } 3108 } else if (pr != IPPROTO_DONE && do_csum && 3109 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3110 rxr->hn_csum_trusted++; 3111 m_new->m_pkthdr.csum_flags |= 3112 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3113 } 3114 } 3115 } 3116 skip: 3117 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 3118 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3119 NDIS_VLAN_INFO_ID(info->vlan_info), 3120 NDIS_VLAN_INFO_PRI(info->vlan_info), 3121 NDIS_VLAN_INFO_CFI(info->vlan_info)); 3122 m_new->m_flags |= M_VLANTAG; 3123 } 3124 3125 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 3126 rxr->hn_rss_pkts++; 3127 m_new->m_pkthdr.flowid = info->hash_value; 3128 hash_type = M_HASHTYPE_OPAQUE_HASH; 3129 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 3130 
NDIS_HASH_FUNCTION_TOEPLITZ) { 3131 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); 3132 3133 /* 3134 * NOTE: 3135 * do_lro is resetted, if the hash types are not TCP 3136 * related. See the comment in the above csum_flags 3137 * setup section. 3138 */ 3139 switch (type) { 3140 case NDIS_HASH_IPV4: 3141 hash_type = M_HASHTYPE_RSS_IPV4; 3142 do_lro = 0; 3143 break; 3144 3145 case NDIS_HASH_TCP_IPV4: 3146 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3147 break; 3148 3149 case NDIS_HASH_IPV6: 3150 hash_type = M_HASHTYPE_RSS_IPV6; 3151 do_lro = 0; 3152 break; 3153 3154 case NDIS_HASH_IPV6_EX: 3155 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3156 do_lro = 0; 3157 break; 3158 3159 case NDIS_HASH_TCP_IPV6: 3160 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3161 break; 3162 3163 case NDIS_HASH_TCP_IPV6_EX: 3164 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3165 break; 3166 } 3167 } 3168 } else { 3169 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3170 hash_type = M_HASHTYPE_OPAQUE; 3171 } 3172 M_HASHTYPE_SET(m_new, hash_type); 3173 3174 /* 3175 * Note: Moved RX completion back to hv_nv_on_receive() so all 3176 * messages (not just data messages) will trigger a response. 3177 */ 3178 3179 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3180 rxr->hn_pkts++; 3181 3182 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { 3183 #if defined(INET) || defined(INET6) 3184 struct lro_ctrl *lro = &rxr->hn_lro; 3185 3186 if (lro->lro_cnt) { 3187 rxr->hn_lro_tried++; 3188 if (hn_lro_rx(lro, m_new) == 0) { 3189 /* DONE! */ 3190 return 0; 3191 } 3192 } 3193 #endif 3194 } 3195 3196 /* We're not holding the lock here, so don't release it */ 3197 (*ifp->if_input)(ifp, m_new); 3198 3199 return (0); 3200 } 3201 3202 static int 3203 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3204 { 3205 struct hn_softc *sc = ifp->if_softc; 3206 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3207 struct ifnet *vf_ifp; 3208 int mask, error = 0; 3209 3210 switch (cmd) { 3211 case SIOCSIFMTU: 3212 if (ifr->ifr_mtu > HN_MTU_MAX) { 3213 error = EINVAL; 3214 break; 3215 } 3216 3217 HN_LOCK(sc); 3218 3219 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3220 HN_UNLOCK(sc); 3221 break; 3222 } 3223 3224 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3225 /* Can't change MTU */ 3226 HN_UNLOCK(sc); 3227 error = EOPNOTSUPP; 3228 break; 3229 } 3230 3231 if (ifp->if_mtu == ifr->ifr_mtu) { 3232 HN_UNLOCK(sc); 3233 break; 3234 } 3235 3236 if (hn_xpnt_vf_isready(sc)) { 3237 vf_ifp = sc->hn_vf_ifp; 3238 ifr_vf = *ifr; 3239 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3240 sizeof(ifr_vf.ifr_name)); 3241 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3242 (caddr_t)&ifr_vf); 3243 if (error) { 3244 HN_UNLOCK(sc); 3245 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3246 vf_ifp->if_xname, ifr->ifr_mtu, error); 3247 break; 3248 } 3249 } 3250 3251 /* 3252 * Suspend this interface before the synthetic parts 3253 * are ripped. 3254 */ 3255 hn_suspend(sc); 3256 3257 /* 3258 * Detach the synthetics parts, i.e. NVS and RNDIS. 3259 */ 3260 hn_synth_detach(sc); 3261 3262 /* 3263 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3264 * with the new MTU setting. 3265 */ 3266 error = hn_synth_attach(sc, ifr->ifr_mtu); 3267 if (error) { 3268 HN_UNLOCK(sc); 3269 break; 3270 } 3271 3272 /* 3273 * Commit the requested MTU, after the synthetic parts 3274 * have been successfully attached. 3275 */ 3276 ifp->if_mtu = ifr->ifr_mtu; 3277 3278 /* 3279 * Synthetic parts' reattach may change the chimney 3280 * sending size; update it. 
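 *
 * The overall SIOCSIFMTU sequence is: propagate the MTU to a ready
 * transparent VF, hn_suspend(), hn_synth_detach(), hn_synth_attach()
 * with the new MTU, commit if_mtu, clamp the chimney size here,
 * hn_mtu_change_fixup(), hn_resume(), and finally switch the datapath
 * back to the VF if it was enabled.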
3281 */ 3282 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3283 hn_set_chim_size(sc, sc->hn_chim_szmax); 3284 3285 /* 3286 * Make sure that various parameters based on MTU are 3287 * still valid, after the MTU change. 3288 */ 3289 hn_mtu_change_fixup(sc); 3290 3291 /* 3292 * All done! Resume the interface now. 3293 */ 3294 hn_resume(sc); 3295 3296 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 3297 /* 3298 * Since we have reattached the NVS part, 3299 * change the datapath to VF again; in case 3300 * that it is lost, after the NVS was detached. 3301 */ 3302 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3303 } 3304 3305 HN_UNLOCK(sc); 3306 break; 3307 3308 case SIOCSIFFLAGS: 3309 HN_LOCK(sc); 3310 3311 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3312 HN_UNLOCK(sc); 3313 break; 3314 } 3315 3316 if (hn_xpnt_vf_isready(sc)) 3317 hn_xpnt_vf_saveifflags(sc); 3318 3319 if (ifp->if_flags & IFF_UP) { 3320 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3321 /* 3322 * Caller meight hold mutex, e.g. 3323 * bpf; use busy-wait for the RNDIS 3324 * reply. 3325 */ 3326 HN_NO_SLEEPING(sc); 3327 hn_rxfilter_config(sc); 3328 HN_SLEEPING_OK(sc); 3329 3330 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3331 error = hn_xpnt_vf_iocsetflags(sc); 3332 } else { 3333 hn_init_locked(sc); 3334 } 3335 } else { 3336 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3337 hn_stop(sc, false); 3338 } 3339 sc->hn_if_flags = ifp->if_flags; 3340 3341 HN_UNLOCK(sc); 3342 break; 3343 3344 case SIOCSIFCAP: 3345 HN_LOCK(sc); 3346 3347 if (hn_xpnt_vf_isready(sc)) { 3348 ifr_vf = *ifr; 3349 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3350 sizeof(ifr_vf.ifr_name)); 3351 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3352 HN_UNLOCK(sc); 3353 break; 3354 } 3355 3356 /* 3357 * Fix up requested capabilities w/ supported capabilities, 3358 * since the supported capabilities could have been changed. 3359 */ 3360 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3361 ifp->if_capenable; 3362 3363 if (mask & IFCAP_TXCSUM) { 3364 ifp->if_capenable ^= IFCAP_TXCSUM; 3365 if (ifp->if_capenable & IFCAP_TXCSUM) 3366 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3367 else 3368 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3369 } 3370 if (mask & IFCAP_TXCSUM_IPV6) { 3371 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3372 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3373 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3374 else 3375 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3376 } 3377 3378 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3379 if (mask & IFCAP_RXCSUM) 3380 ifp->if_capenable ^= IFCAP_RXCSUM; 3381 #ifdef foo 3382 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 3383 if (mask & IFCAP_RXCSUM_IPV6) 3384 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3385 #endif 3386 3387 if (mask & IFCAP_LRO) 3388 ifp->if_capenable ^= IFCAP_LRO; 3389 3390 if (mask & IFCAP_TSO4) { 3391 ifp->if_capenable ^= IFCAP_TSO4; 3392 if (ifp->if_capenable & IFCAP_TSO4) 3393 ifp->if_hwassist |= CSUM_IP_TSO; 3394 else 3395 ifp->if_hwassist &= ~CSUM_IP_TSO; 3396 } 3397 if (mask & IFCAP_TSO6) { 3398 ifp->if_capenable ^= IFCAP_TSO6; 3399 if (ifp->if_capenable & IFCAP_TSO6) 3400 ifp->if_hwassist |= CSUM_IP6_TSO; 3401 else 3402 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3403 } 3404 3405 HN_UNLOCK(sc); 3406 break; 3407 3408 case SIOCADDMULTI: 3409 case SIOCDELMULTI: 3410 HN_LOCK(sc); 3411 3412 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3413 HN_UNLOCK(sc); 3414 break; 3415 } 3416 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3417 /* 3418 * Multicast uses mutex; use busy-wait for 3419 * the RNDIS reply. 3420 */ 3421 HN_NO_SLEEPING(sc); 3422 hn_rxfilter_config(sc); 3423 HN_SLEEPING_OK(sc); 3424 } 3425 3426 /* XXX vlan(4) style mcast addr maintenance */ 3427 if (hn_xpnt_vf_isready(sc)) { 3428 int old_if_flags; 3429 3430 old_if_flags = sc->hn_vf_ifp->if_flags; 3431 hn_xpnt_vf_saveifflags(sc); 3432 3433 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3434 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3435 IFF_ALLMULTI)) 3436 error = hn_xpnt_vf_iocsetflags(sc); 3437 } 3438 3439 HN_UNLOCK(sc); 3440 break; 3441 3442 case SIOCSIFMEDIA: 3443 case SIOCGIFMEDIA: 3444 HN_LOCK(sc); 3445 if (hn_xpnt_vf_isready(sc)) { 3446 /* 3447 * SIOCGIFMEDIA expects ifmediareq, so don't 3448 * create and pass ifr_vf to the VF here; just 3449 * replace the ifr_name. 3450 */ 3451 vf_ifp = sc->hn_vf_ifp; 3452 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 3453 sizeof(ifr->ifr_name)); 3454 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 3455 /* Restore the ifr_name. */ 3456 strlcpy(ifr->ifr_name, ifp->if_xname, 3457 sizeof(ifr->ifr_name)); 3458 HN_UNLOCK(sc); 3459 break; 3460 } 3461 HN_UNLOCK(sc); 3462 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 3463 break; 3464 3465 default: 3466 error = ether_ioctl(ifp, cmd, data); 3467 break; 3468 } 3469 return (error); 3470 } 3471 3472 static void 3473 hn_stop(struct hn_softc *sc, bool detaching) 3474 { 3475 struct ifnet *ifp = sc->hn_ifp; 3476 int i; 3477 3478 HN_LOCK_ASSERT(sc); 3479 3480 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 3481 ("synthetic parts were not attached")); 3482 3483 /* Clear RUNNING bit ASAP. */ 3484 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 3485 3486 /* Disable polling. */ 3487 hn_polling(sc, 0); 3488 3489 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 3490 KASSERT(sc->hn_vf_ifp != NULL, 3491 ("%s: VF is not attached", ifp->if_xname)); 3492 3493 /* NOTE: hn_vf_lock for hn_transmit() */ 3494 rm_wlock(&sc->hn_vf_lock); 3495 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 3496 rm_wunlock(&sc->hn_vf_lock); 3497 3498 /* 3499 * NOTE: 3500 * Datapath setting must happen _before_ bringing 3501 * the VF down. 3502 */ 3503 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 3504 3505 /* 3506 * Bring the VF down. 3507 */ 3508 hn_xpnt_vf_saveifflags(sc); 3509 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 3510 hn_xpnt_vf_iocsetflags(sc); 3511 } 3512 3513 /* Suspend data transfers. */ 3514 hn_suspend_data(sc); 3515 3516 /* Clear OACTIVE bit. 
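 *
 * By this point RUNNING has been cleared, polling disabled, the
 * datapath switched back to synthetic and the VF (if any) brought
 * down, and data transfers suspended; clearing OACTIVE globally and
 * per ring leaves the TX rings in a clean state for a later
 * hn_init_locked().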
*/ 3517 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3518 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 3519 sc->hn_tx_ring[i].hn_oactive = 0; 3520 3521 /* 3522 * If the non-transparent mode VF is active, make sure 3523 * that the RX filter still allows packet reception. 3524 */ 3525 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 3526 hn_rxfilter_config(sc); 3527 } 3528 3529 static void 3530 hn_init_locked(struct hn_softc *sc) 3531 { 3532 struct ifnet *ifp = sc->hn_ifp; 3533 int i; 3534 3535 HN_LOCK_ASSERT(sc); 3536 3537 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 3538 return; 3539 3540 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3541 return; 3542 3543 /* Configure RX filter */ 3544 hn_rxfilter_config(sc); 3545 3546 /* Clear OACTIVE bit. */ 3547 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 3548 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 3549 sc->hn_tx_ring[i].hn_oactive = 0; 3550 3551 /* Clear TX 'suspended' bit. */ 3552 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 3553 3554 if (hn_xpnt_vf_isready(sc)) { 3555 /* Initialize transparent VF. */ 3556 hn_xpnt_vf_init(sc); 3557 } 3558 3559 /* Everything is ready; unleash! */ 3560 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 3561 3562 /* Re-enable polling if requested. */ 3563 if (sc->hn_pollhz > 0) 3564 hn_polling(sc, sc->hn_pollhz); 3565 } 3566 3567 static void 3568 hn_init(void *xsc) 3569 { 3570 struct hn_softc *sc = xsc; 3571 3572 HN_LOCK(sc); 3573 hn_init_locked(sc); 3574 HN_UNLOCK(sc); 3575 } 3576 3577 #if __FreeBSD_version >= 1100099 3578 3579 static int 3580 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 3581 { 3582 struct hn_softc *sc = arg1; 3583 unsigned int lenlim; 3584 int error; 3585 3586 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 3587 error = sysctl_handle_int(oidp, &lenlim, 0, req); 3588 if (error || req->newptr == NULL) 3589 return error; 3590 3591 HN_LOCK(sc); 3592 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 3593 lenlim > TCP_LRO_LENGTH_MAX) { 3594 HN_UNLOCK(sc); 3595 return EINVAL; 3596 } 3597 hn_set_lro_lenlim(sc, lenlim); 3598 HN_UNLOCK(sc); 3599 3600 return 0; 3601 } 3602 3603 static int 3604 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 3605 { 3606 struct hn_softc *sc = arg1; 3607 int ackcnt, error, i; 3608 3609 /* 3610 * lro_ackcnt_lim is append count limit, 3611 * +1 to turn it into aggregation limit. 3612 */ 3613 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 3614 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 3615 if (error || req->newptr == NULL) 3616 return error; 3617 3618 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 3619 return EINVAL; 3620 3621 /* 3622 * Convert aggregation limit back to append 3623 * count limit. 
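 *
 * For example, writing 2 through this sysctl stores an append limit of
 * 1, i.e. roughly one extra ACK may be merged onto a held one before
 * the LRO entry is flushed; the accepted range is therefore
 * [2, TCP_LRO_ACKCNT_MAX + 1].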
3624 */ 3625 --ackcnt; 3626 HN_LOCK(sc); 3627 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 3628 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 3629 HN_UNLOCK(sc); 3630 return 0; 3631 } 3632 3633 #endif 3634 3635 static int 3636 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 3637 { 3638 struct hn_softc *sc = arg1; 3639 int hcsum = arg2; 3640 int on, error, i; 3641 3642 on = 0; 3643 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 3644 on = 1; 3645 3646 error = sysctl_handle_int(oidp, &on, 0, req); 3647 if (error || req->newptr == NULL) 3648 return error; 3649 3650 HN_LOCK(sc); 3651 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3652 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 3653 3654 if (on) 3655 rxr->hn_trust_hcsum |= hcsum; 3656 else 3657 rxr->hn_trust_hcsum &= ~hcsum; 3658 } 3659 HN_UNLOCK(sc); 3660 return 0; 3661 } 3662 3663 static int 3664 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 3665 { 3666 struct hn_softc *sc = arg1; 3667 int chim_size, error; 3668 3669 chim_size = sc->hn_tx_ring[0].hn_chim_size; 3670 error = sysctl_handle_int(oidp, &chim_size, 0, req); 3671 if (error || req->newptr == NULL) 3672 return error; 3673 3674 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 3675 return EINVAL; 3676 3677 HN_LOCK(sc); 3678 hn_set_chim_size(sc, chim_size); 3679 HN_UNLOCK(sc); 3680 return 0; 3681 } 3682 3683 #if __FreeBSD_version < 1100095 3684 static int 3685 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 3686 { 3687 struct hn_softc *sc = arg1; 3688 int ofs = arg2, i, error; 3689 struct hn_rx_ring *rxr; 3690 uint64_t stat; 3691 3692 stat = 0; 3693 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3694 rxr = &sc->hn_rx_ring[i]; 3695 stat += *((int *)((uint8_t *)rxr + ofs)); 3696 } 3697 3698 error = sysctl_handle_64(oidp, &stat, 0, req); 3699 if (error || req->newptr == NULL) 3700 return error; 3701 3702 /* Zero out this stat. */ 3703 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3704 rxr = &sc->hn_rx_ring[i]; 3705 *((int *)((uint8_t *)rxr + ofs)) = 0; 3706 } 3707 return 0; 3708 } 3709 #else 3710 static int 3711 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 3712 { 3713 struct hn_softc *sc = arg1; 3714 int ofs = arg2, i, error; 3715 struct hn_rx_ring *rxr; 3716 uint64_t stat; 3717 3718 stat = 0; 3719 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3720 rxr = &sc->hn_rx_ring[i]; 3721 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 3722 } 3723 3724 error = sysctl_handle_64(oidp, &stat, 0, req); 3725 if (error || req->newptr == NULL) 3726 return error; 3727 3728 /* Zero out this stat. */ 3729 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3730 rxr = &sc->hn_rx_ring[i]; 3731 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 3732 } 3733 return 0; 3734 } 3735 3736 #endif 3737 3738 static int 3739 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 3740 { 3741 struct hn_softc *sc = arg1; 3742 int ofs = arg2, i, error; 3743 struct hn_rx_ring *rxr; 3744 u_long stat; 3745 3746 stat = 0; 3747 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3748 rxr = &sc->hn_rx_ring[i]; 3749 stat += *((u_long *)((uint8_t *)rxr + ofs)); 3750 } 3751 3752 error = sysctl_handle_long(oidp, &stat, 0, req); 3753 if (error || req->newptr == NULL) 3754 return error; 3755 3756 /* Zero out this stat. 
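 *
 * All of the *_stat_*_sysctl helpers follow this read-and-clear
 * pattern: arg2 carries a byte offset into struct hn_rx_ring or
 * struct hn_tx_ring, a read sums that field across all rings, and any
 * write resets the field in every ring; only the field width differs
 * between the int, u_long and uint64_t variants.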
*/ 3757 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 3758 rxr = &sc->hn_rx_ring[i]; 3759 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 3760 } 3761 return 0; 3762 } 3763 3764 static int 3765 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 3766 { 3767 struct hn_softc *sc = arg1; 3768 int ofs = arg2, i, error; 3769 struct hn_tx_ring *txr; 3770 u_long stat; 3771 3772 stat = 0; 3773 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3774 txr = &sc->hn_tx_ring[i]; 3775 stat += *((u_long *)((uint8_t *)txr + ofs)); 3776 } 3777 3778 error = sysctl_handle_long(oidp, &stat, 0, req); 3779 if (error || req->newptr == NULL) 3780 return error; 3781 3782 /* Zero out this stat. */ 3783 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3784 txr = &sc->hn_tx_ring[i]; 3785 *((u_long *)((uint8_t *)txr + ofs)) = 0; 3786 } 3787 return 0; 3788 } 3789 3790 static int 3791 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 3792 { 3793 struct hn_softc *sc = arg1; 3794 int ofs = arg2, i, error, conf; 3795 struct hn_tx_ring *txr; 3796 3797 txr = &sc->hn_tx_ring[0]; 3798 conf = *((int *)((uint8_t *)txr + ofs)); 3799 3800 error = sysctl_handle_int(oidp, &conf, 0, req); 3801 if (error || req->newptr == NULL) 3802 return error; 3803 3804 HN_LOCK(sc); 3805 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 3806 txr = &sc->hn_tx_ring[i]; 3807 *((int *)((uint8_t *)txr + ofs)) = conf; 3808 } 3809 HN_UNLOCK(sc); 3810 3811 return 0; 3812 } 3813 3814 static int 3815 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 3816 { 3817 struct hn_softc *sc = arg1; 3818 int error, size; 3819 3820 size = sc->hn_agg_size; 3821 error = sysctl_handle_int(oidp, &size, 0, req); 3822 if (error || req->newptr == NULL) 3823 return (error); 3824 3825 HN_LOCK(sc); 3826 sc->hn_agg_size = size; 3827 hn_set_txagg(sc); 3828 HN_UNLOCK(sc); 3829 3830 return (0); 3831 } 3832 3833 static int 3834 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 3835 { 3836 struct hn_softc *sc = arg1; 3837 int error, pkts; 3838 3839 pkts = sc->hn_agg_pkts; 3840 error = sysctl_handle_int(oidp, &pkts, 0, req); 3841 if (error || req->newptr == NULL) 3842 return (error); 3843 3844 HN_LOCK(sc); 3845 sc->hn_agg_pkts = pkts; 3846 hn_set_txagg(sc); 3847 HN_UNLOCK(sc); 3848 3849 return (0); 3850 } 3851 3852 static int 3853 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 3854 { 3855 struct hn_softc *sc = arg1; 3856 int pkts; 3857 3858 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 3859 return (sysctl_handle_int(oidp, &pkts, 0, req)); 3860 } 3861 3862 static int 3863 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 3864 { 3865 struct hn_softc *sc = arg1; 3866 int align; 3867 3868 align = sc->hn_tx_ring[0].hn_agg_align; 3869 return (sysctl_handle_int(oidp, &align, 0, req)); 3870 } 3871 3872 static void 3873 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 3874 { 3875 if (pollhz == 0) 3876 vmbus_chan_poll_disable(chan); 3877 else 3878 vmbus_chan_poll_enable(chan, pollhz); 3879 } 3880 3881 static void 3882 hn_polling(struct hn_softc *sc, u_int pollhz) 3883 { 3884 int nsubch = sc->hn_rx_ring_inuse - 1; 3885 3886 HN_LOCK_ASSERT(sc); 3887 3888 if (nsubch > 0) { 3889 struct vmbus_channel **subch; 3890 int i; 3891 3892 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 3893 for (i = 0; i < nsubch; ++i) 3894 hn_chan_polling(subch[i], pollhz); 3895 vmbus_subchan_rel(subch, nsubch); 3896 } 3897 hn_chan_polling(sc->hn_prichan, pollhz); 3898 } 3899 3900 static int 3901 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 3902 { 3903 struct hn_softc *sc = arg1; 3904 int pollhz, error; 3905 3906 pollhz = sc->hn_pollhz; 3907 error = sysctl_handle_int(oidp, &pollhz, 0, req); 3908 
if (error || req->newptr == NULL) 3909 return (error); 3910 3911 if (pollhz != 0 && 3912 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 3913 return (EINVAL); 3914 3915 HN_LOCK(sc); 3916 if (sc->hn_pollhz != pollhz) { 3917 sc->hn_pollhz = pollhz; 3918 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 3919 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 3920 hn_polling(sc, sc->hn_pollhz); 3921 } 3922 HN_UNLOCK(sc); 3923 3924 return (0); 3925 } 3926 3927 static int 3928 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 3929 { 3930 struct hn_softc *sc = arg1; 3931 char verstr[16]; 3932 3933 snprintf(verstr, sizeof(verstr), "%u.%u", 3934 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 3935 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 3936 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 3937 } 3938 3939 static int 3940 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 3941 { 3942 struct hn_softc *sc = arg1; 3943 char caps_str[128]; 3944 uint32_t caps; 3945 3946 HN_LOCK(sc); 3947 caps = sc->hn_caps; 3948 HN_UNLOCK(sc); 3949 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 3950 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 3951 } 3952 3953 static int 3954 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 3955 { 3956 struct hn_softc *sc = arg1; 3957 char assist_str[128]; 3958 uint32_t hwassist; 3959 3960 HN_LOCK(sc); 3961 hwassist = sc->hn_ifp->if_hwassist; 3962 HN_UNLOCK(sc); 3963 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 3964 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 3965 } 3966 3967 static int 3968 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 3969 { 3970 struct hn_softc *sc = arg1; 3971 char filter_str[128]; 3972 uint32_t filter; 3973 3974 HN_LOCK(sc); 3975 filter = sc->hn_rx_filter; 3976 HN_UNLOCK(sc); 3977 snprintf(filter_str, sizeof(filter_str), "%b", filter, 3978 NDIS_PACKET_TYPES); 3979 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 3980 } 3981 3982 #ifndef RSS 3983 3984 static int 3985 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 3986 { 3987 struct hn_softc *sc = arg1; 3988 int error; 3989 3990 HN_LOCK(sc); 3991 3992 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3993 if (error || req->newptr == NULL) 3994 goto back; 3995 3996 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 3997 if (error) 3998 goto back; 3999 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4000 4001 if (sc->hn_rx_ring_inuse > 1) { 4002 error = hn_rss_reconfig(sc); 4003 } else { 4004 /* Not RSS capable, at least for now; just save the RSS key. */ 4005 error = 0; 4006 } 4007 back: 4008 HN_UNLOCK(sc); 4009 return (error); 4010 } 4011 4012 static int 4013 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4014 { 4015 struct hn_softc *sc = arg1; 4016 int error; 4017 4018 HN_LOCK(sc); 4019 4020 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4021 if (error || req->newptr == NULL) 4022 goto back; 4023 4024 /* 4025 * Don't allow RSS indirect table change, if this interface is not 4026 * RSS capable currently. 
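 *
 * "Not RSS capable currently" means only one RX ring is in use, so an
 * indirect table would have nothing to spread traffic across and the
 * write is rejected with EOPNOTSUPP.  Otherwise the new table is
 * copied in, HN_FLAG_HAS_RSSIND is set, hn_rss_ind_fixup() adjusts the
 * entries (presumably clamping them to the rings actually in use), and
 * hn_rss_reconfig() pushes the updated parameters to the host.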
4027 */ 4028 if (sc->hn_rx_ring_inuse == 1) { 4029 error = EOPNOTSUPP; 4030 goto back; 4031 } 4032 4033 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4034 if (error) 4035 goto back; 4036 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4037 4038 hn_rss_ind_fixup(sc); 4039 error = hn_rss_reconfig(sc); 4040 back: 4041 HN_UNLOCK(sc); 4042 return (error); 4043 } 4044 4045 #endif /* !RSS */ 4046 4047 static int 4048 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4049 { 4050 struct hn_softc *sc = arg1; 4051 char hash_str[128]; 4052 uint32_t hash; 4053 4054 HN_LOCK(sc); 4055 hash = sc->hn_rss_hash; 4056 HN_UNLOCK(sc); 4057 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4058 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4059 } 4060 4061 static int 4062 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4063 { 4064 struct hn_softc *sc = arg1; 4065 char vf_name[IFNAMSIZ + 1]; 4066 struct ifnet *vf_ifp; 4067 4068 HN_LOCK(sc); 4069 vf_name[0] = '\0'; 4070 vf_ifp = sc->hn_vf_ifp; 4071 if (vf_ifp != NULL) 4072 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4073 HN_UNLOCK(sc); 4074 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4075 } 4076 4077 static int 4078 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4079 { 4080 struct hn_softc *sc = arg1; 4081 char vf_name[IFNAMSIZ + 1]; 4082 struct ifnet *vf_ifp; 4083 4084 HN_LOCK(sc); 4085 vf_name[0] = '\0'; 4086 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4087 if (vf_ifp != NULL) 4088 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4089 HN_UNLOCK(sc); 4090 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4091 } 4092 4093 static int 4094 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4095 { 4096 struct rm_priotracker pt; 4097 struct sbuf *sb; 4098 int error, i; 4099 bool first; 4100 4101 error = sysctl_wire_old_buffer(req, 0); 4102 if (error != 0) 4103 return (error); 4104 4105 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4106 if (sb == NULL) 4107 return (ENOMEM); 4108 4109 rm_rlock(&hn_vfmap_lock, &pt); 4110 4111 first = true; 4112 for (i = 0; i < hn_vfmap_size; ++i) { 4113 struct ifnet *ifp; 4114 4115 if (hn_vfmap[i] == NULL) 4116 continue; 4117 4118 ifp = ifnet_byindex(i); 4119 if (ifp != NULL) { 4120 if (first) 4121 sbuf_printf(sb, "%s", ifp->if_xname); 4122 else 4123 sbuf_printf(sb, " %s", ifp->if_xname); 4124 first = false; 4125 } 4126 } 4127 4128 rm_runlock(&hn_vfmap_lock, &pt); 4129 4130 error = sbuf_finish(sb); 4131 sbuf_delete(sb); 4132 return (error); 4133 } 4134 4135 static int 4136 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4137 { 4138 struct rm_priotracker pt; 4139 struct sbuf *sb; 4140 int error, i; 4141 bool first; 4142 4143 error = sysctl_wire_old_buffer(req, 0); 4144 if (error != 0) 4145 return (error); 4146 4147 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4148 if (sb == NULL) 4149 return (ENOMEM); 4150 4151 rm_rlock(&hn_vfmap_lock, &pt); 4152 4153 first = true; 4154 for (i = 0; i < hn_vfmap_size; ++i) { 4155 struct ifnet *ifp, *hn_ifp; 4156 4157 hn_ifp = hn_vfmap[i]; 4158 if (hn_ifp == NULL) 4159 continue; 4160 4161 ifp = ifnet_byindex(i); 4162 if (ifp != NULL) { 4163 if (first) { 4164 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4165 hn_ifp->if_xname); 4166 } else { 4167 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4168 hn_ifp->if_xname); 4169 } 4170 first = false; 4171 } 4172 } 4173 4174 rm_runlock(&hn_vfmap_lock, &pt); 4175 4176 error = sbuf_finish(sb); 4177 sbuf_delete(sb); 4178 return (error); 4179 } 4180 4181 static int 4182 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4183 { 
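/* Added note: report/toggle HN_XVFFLAG_ACCBPF; when set, packets handed to the transparent VF are only tapped by this interface's BPF listeners after the VF accepts them (see hn_transmit()). */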
4184 struct hn_softc *sc = arg1; 4185 int error, onoff = 0; 4186 4187 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4188 onoff = 1; 4189 error = sysctl_handle_int(oidp, &onoff, 0, req); 4190 if (error || req->newptr == NULL) 4191 return (error); 4192 4193 HN_LOCK(sc); 4194 /* NOTE: hn_vf_lock for hn_transmit() */ 4195 rm_wlock(&sc->hn_vf_lock); 4196 if (onoff) 4197 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4198 else 4199 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4200 rm_wunlock(&sc->hn_vf_lock); 4201 HN_UNLOCK(sc); 4202 4203 return (0); 4204 } 4205 4206 static int 4207 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4208 { 4209 struct hn_softc *sc = arg1; 4210 int enabled = 0; 4211 4212 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4213 enabled = 1; 4214 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4215 } 4216 4217 static int 4218 hn_check_iplen(const struct mbuf *m, int hoff) 4219 { 4220 const struct ip *ip; 4221 int len, iphlen, iplen; 4222 const struct tcphdr *th; 4223 int thoff; /* TCP data offset */ 4224 4225 len = hoff + sizeof(struct ip); 4226 4227 /* The packet must be at least the size of an IP header. */ 4228 if (m->m_pkthdr.len < len) 4229 return IPPROTO_DONE; 4230 4231 /* The fixed IP header must reside completely in the first mbuf. */ 4232 if (m->m_len < len) 4233 return IPPROTO_DONE; 4234 4235 ip = mtodo(m, hoff); 4236 4237 /* Bound check the packet's stated IP header length. */ 4238 iphlen = ip->ip_hl << 2; 4239 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4240 return IPPROTO_DONE; 4241 4242 /* The full IP header must reside completely in the first mbuf. */ 4243 if (m->m_len < hoff + iphlen) 4244 return IPPROTO_DONE; 4245 4246 iplen = ntohs(ip->ip_len); 4247 4248 /* 4249 * Check that the amount of data in the buffers is at 4250 * least as much as the IP header would have us expect. 4251 */ 4252 if (m->m_pkthdr.len < hoff + iplen) 4253 return IPPROTO_DONE; 4254 4255 /* 4256 * Ignore IP fragments. 4257 */ 4258 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4259 return IPPROTO_DONE; 4260 4261 /* 4262 * The TCP/IP or UDP/IP header must be entirely contained within 4263 * the first fragment of a packet. 4264 */ 4265 switch (ip->ip_p) { 4266 case IPPROTO_TCP: 4267 if (iplen < iphlen + sizeof(struct tcphdr)) 4268 return IPPROTO_DONE; 4269 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4270 return IPPROTO_DONE; 4271 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4272 thoff = th->th_off << 2; 4273 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4274 return IPPROTO_DONE; 4275 if (m->m_len < hoff + iphlen + thoff) 4276 return IPPROTO_DONE; 4277 break; 4278 case IPPROTO_UDP: 4279 if (iplen < iphlen + sizeof(struct udphdr)) 4280 return IPPROTO_DONE; 4281 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4282 return IPPROTO_DONE; 4283 break; 4284 default: 4285 if (iplen < iphlen) 4286 return IPPROTO_DONE; 4287 break; 4288 } 4289 return ip->ip_p; 4290 } 4291 4292 static int 4293 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4294 { 4295 struct sysctl_oid_list *child; 4296 struct sysctl_ctx_list *ctx; 4297 device_t dev = sc->hn_dev; 4298 #if defined(INET) || defined(INET6) 4299 #if __FreeBSD_version >= 1100095 4300 int lroent_cnt; 4301 #endif 4302 #endif 4303 int i; 4304 4305 /* 4306 * Create RXBUF for reception. 4307 * 4308 * NOTE: 4309 * - It is shared by all channels. 4310 * - A large enough buffer is allocated, certain versions of NVS 4311 may further limit the usable space.
4312 */ 4313 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4314 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4315 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4316 if (sc->hn_rxbuf == NULL) { 4317 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4318 return (ENOMEM); 4319 } 4320 4321 sc->hn_rx_ring_cnt = ring_cnt; 4322 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4323 4324 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4325 M_DEVBUF, M_WAITOK | M_ZERO); 4326 4327 #if defined(INET) || defined(INET6) 4328 #if __FreeBSD_version >= 1100095 4329 lroent_cnt = hn_lro_entry_count; 4330 if (lroent_cnt < TCP_LRO_ENTRIES) 4331 lroent_cnt = TCP_LRO_ENTRIES; 4332 if (bootverbose) 4333 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4334 #endif 4335 #endif /* INET || INET6 */ 4336 4337 ctx = device_get_sysctl_ctx(dev); 4338 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4339 4340 /* Create dev.hn.UNIT.rx sysctl tree */ 4341 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4342 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4343 4344 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4345 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4346 4347 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4348 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4349 &rxr->hn_br_dma, BUS_DMA_WAITOK); 4350 if (rxr->hn_br == NULL) { 4351 device_printf(dev, "allocate bufring failed\n"); 4352 return (ENOMEM); 4353 } 4354 4355 if (hn_trust_hosttcp) 4356 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4357 if (hn_trust_hostudp) 4358 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4359 if (hn_trust_hostip) 4360 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4361 rxr->hn_ifp = sc->hn_ifp; 4362 if (i < sc->hn_tx_ring_cnt) 4363 rxr->hn_txr = &sc->hn_tx_ring[i]; 4364 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4365 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4366 rxr->hn_rx_idx = i; 4367 rxr->hn_rxbuf = sc->hn_rxbuf; 4368 4369 /* 4370 * Initialize LRO. 
4371 */ 4372 #if defined(INET) || defined(INET6) 4373 #if __FreeBSD_version >= 1100095 4374 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 4375 hn_lro_mbufq_depth); 4376 #else 4377 tcp_lro_init(&rxr->hn_lro); 4378 rxr->hn_lro.ifp = sc->hn_ifp; 4379 #endif 4380 #if __FreeBSD_version >= 1100099 4381 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 4382 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 4383 #endif 4384 #endif /* INET || INET6 */ 4385 4386 if (sc->hn_rx_sysctl_tree != NULL) { 4387 char name[16]; 4388 4389 /* 4390 * Create per RX ring sysctl tree: 4391 * dev.hn.UNIT.rx.RINGID 4392 */ 4393 snprintf(name, sizeof(name), "%d", i); 4394 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 4395 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 4396 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4397 4398 if (rxr->hn_rx_sysctl_tree != NULL) { 4399 SYSCTL_ADD_ULONG(ctx, 4400 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4401 OID_AUTO, "packets", CTLFLAG_RW, 4402 &rxr->hn_pkts, "# of packets received"); 4403 SYSCTL_ADD_ULONG(ctx, 4404 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4405 OID_AUTO, "rss_pkts", CTLFLAG_RW, 4406 &rxr->hn_rss_pkts, 4407 "# of packets w/ RSS info received"); 4408 SYSCTL_ADD_INT(ctx, 4409 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4410 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 4411 &rxr->hn_pktbuf_len, 0, 4412 "Temporary channel packet buffer length"); 4413 } 4414 } 4415 } 4416 4417 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 4418 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4419 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 4420 #if __FreeBSD_version < 1100095 4421 hn_rx_stat_int_sysctl, 4422 #else 4423 hn_rx_stat_u64_sysctl, 4424 #endif 4425 "LU", "LRO queued"); 4426 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 4427 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4428 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 4429 #if __FreeBSD_version < 1100095 4430 hn_rx_stat_int_sysctl, 4431 #else 4432 hn_rx_stat_u64_sysctl, 4433 #endif 4434 "LU", "LRO flushed"); 4435 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 4436 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4437 __offsetof(struct hn_rx_ring, hn_lro_tried), 4438 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 4439 #if __FreeBSD_version >= 1100099 4440 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 4441 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4442 hn_lro_lenlim_sysctl, "IU", 4443 "Max # of data bytes to be aggregated by LRO"); 4444 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 4445 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4446 hn_lro_ackcnt_sysctl, "I", 4447 "Max # of ACKs to be aggregated by LRO"); 4448 #endif 4449 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 4450 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 4451 hn_trust_hcsum_sysctl, "I", 4452 "Trust tcp segement verification on host side, " 4453 "when csum info is missing"); 4454 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 4455 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 4456 hn_trust_hcsum_sysctl, "I", 4457 "Trust udp datagram verification on host side, " 4458 "when csum info is missing"); 4459 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 4460 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 4461 hn_trust_hcsum_sysctl, "I", 4462 "Trust ip packet verification on host side, " 4463 "when csum info is missing"); 4464 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 4465 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4466 
__offsetof(struct hn_rx_ring, hn_csum_ip), 4467 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 4468 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 4469 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4470 __offsetof(struct hn_rx_ring, hn_csum_tcp), 4471 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 4472 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 4473 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4474 __offsetof(struct hn_rx_ring, hn_csum_udp), 4475 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 4476 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 4477 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4478 __offsetof(struct hn_rx_ring, hn_csum_trusted), 4479 hn_rx_stat_ulong_sysctl, "LU", 4480 "# of packets that we trust host's csum verification"); 4481 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 4482 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4483 __offsetof(struct hn_rx_ring, hn_small_pkts), 4484 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 4485 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 4486 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4487 __offsetof(struct hn_rx_ring, hn_ack_failed), 4488 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 4489 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 4490 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 4491 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 4492 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 4493 4494 return (0); 4495 } 4496 4497 static void 4498 hn_destroy_rx_data(struct hn_softc *sc) 4499 { 4500 int i; 4501 4502 if (sc->hn_rxbuf != NULL) { 4503 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 4504 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 4505 else 4506 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 4507 sc->hn_rxbuf = NULL; 4508 } 4509 4510 if (sc->hn_rx_ring_cnt == 0) 4511 return; 4512 4513 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4514 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4515 4516 if (rxr->hn_br == NULL) 4517 continue; 4518 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 4519 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 4520 } else { 4521 device_printf(sc->hn_dev, 4522 "%dth channel bufring is referenced", i); 4523 } 4524 rxr->hn_br = NULL; 4525 4526 #if defined(INET) || defined(INET6) 4527 tcp_lro_free(&rxr->hn_lro); 4528 #endif 4529 free(rxr->hn_pktbuf, M_DEVBUF); 4530 } 4531 free(sc->hn_rx_ring, M_DEVBUF); 4532 sc->hn_rx_ring = NULL; 4533 4534 sc->hn_rx_ring_cnt = 0; 4535 sc->hn_rx_ring_inuse = 0; 4536 } 4537 4538 static int 4539 hn_tx_ring_create(struct hn_softc *sc, int id) 4540 { 4541 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 4542 device_t dev = sc->hn_dev; 4543 bus_dma_tag_t parent_dtag; 4544 int error, i; 4545 4546 txr->hn_sc = sc; 4547 txr->hn_tx_idx = id; 4548 4549 #ifndef HN_USE_TXDESC_BUFRING 4550 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 4551 #endif 4552 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 4553 4554 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 4555 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 4556 M_DEVBUF, M_WAITOK | M_ZERO); 4557 #ifndef HN_USE_TXDESC_BUFRING 4558 SLIST_INIT(&txr->hn_txlist); 4559 #else 4560 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 4561 M_WAITOK, &txr->hn_tx_lock); 4562 #endif 4563 4564 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 4565 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 4566 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 4567 } else { 4568 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 4569 } 4570 4571 #ifdef HN_IFSTART_SUPPORT 4572 if (hn_use_if_start) { 4573 txr->hn_txeof = hn_start_txeof; 4574 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 4575 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 4576 } else 4577 #endif 4578 { 4579 int br_depth; 4580 4581 txr->hn_txeof = hn_xmit_txeof; 4582 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 4583 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 4584 4585 br_depth = hn_get_txswq_depth(txr); 4586 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 4587 M_WAITOK, &txr->hn_tx_lock); 4588 } 4589 4590 txr->hn_direct_tx_size = hn_direct_tx_size; 4591 4592 /* 4593 * Always schedule transmission instead of trying to do direct 4594 * transmission. This one gives the best performance so far. 4595 */ 4596 txr->hn_sched_tx = 1; 4597 4598 parent_dtag = bus_get_dma_tag(dev); 4599 4600 /* DMA tag for RNDIS packet messages. */ 4601 error = bus_dma_tag_create(parent_dtag, /* parent */ 4602 HN_RNDIS_PKT_ALIGN, /* alignment */ 4603 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 4604 BUS_SPACE_MAXADDR, /* lowaddr */ 4605 BUS_SPACE_MAXADDR, /* highaddr */ 4606 NULL, NULL, /* filter, filterarg */ 4607 HN_RNDIS_PKT_LEN, /* maxsize */ 4608 1, /* nsegments */ 4609 HN_RNDIS_PKT_LEN, /* maxsegsize */ 4610 0, /* flags */ 4611 NULL, /* lockfunc */ 4612 NULL, /* lockfuncarg */ 4613 &txr->hn_tx_rndis_dtag); 4614 if (error) { 4615 device_printf(dev, "failed to create rndis dmatag\n"); 4616 return error; 4617 } 4618 4619 /* DMA tag for data. */ 4620 error = bus_dma_tag_create(parent_dtag, /* parent */ 4621 1, /* alignment */ 4622 HN_TX_DATA_BOUNDARY, /* boundary */ 4623 BUS_SPACE_MAXADDR, /* lowaddr */ 4624 BUS_SPACE_MAXADDR, /* highaddr */ 4625 NULL, NULL, /* filter, filterarg */ 4626 HN_TX_DATA_MAXSIZE, /* maxsize */ 4627 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 4628 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 4629 0, /* flags */ 4630 NULL, /* lockfunc */ 4631 NULL, /* lockfuncarg */ 4632 &txr->hn_tx_data_dtag); 4633 if (error) { 4634 device_printf(dev, "failed to create data dmatag\n"); 4635 return error; 4636 } 4637 4638 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 4639 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 4640 4641 txd->txr = txr; 4642 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 4643 STAILQ_INIT(&txd->agg_list); 4644 4645 /* 4646 * Allocate and load RNDIS packet message. 4647 */ 4648 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 4649 (void **)&txd->rndis_pkt, 4650 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 4651 &txd->rndis_pkt_dmap); 4652 if (error) { 4653 device_printf(dev, 4654 "failed to allocate rndis_packet_msg, %d\n", i); 4655 return error; 4656 } 4657 4658 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 4659 txd->rndis_pkt_dmap, 4660 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 4661 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 4662 BUS_DMA_NOWAIT); 4663 if (error) { 4664 device_printf(dev, 4665 "failed to load rndis_packet_msg, %d\n", i); 4666 bus_dmamem_free(txr->hn_tx_rndis_dtag, 4667 txd->rndis_pkt, txd->rndis_pkt_dmap); 4668 return error; 4669 } 4670 4671 /* DMA map for TX data. 
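* Added note: one data DMA map per TX descriptor; it is created up front here and loaded/unloaded per packet.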
*/ 4672 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 4673 &txd->data_dmap); 4674 if (error) { 4675 device_printf(dev, 4676 "failed to allocate tx data dmamap\n"); 4677 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 4678 txd->rndis_pkt_dmap); 4679 bus_dmamem_free(txr->hn_tx_rndis_dtag, 4680 txd->rndis_pkt, txd->rndis_pkt_dmap); 4681 return error; 4682 } 4683 4684 /* All set, put it to list */ 4685 txd->flags |= HN_TXD_FLAG_ONLIST; 4686 #ifndef HN_USE_TXDESC_BUFRING 4687 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 4688 #else 4689 buf_ring_enqueue(txr->hn_txdesc_br, txd); 4690 #endif 4691 } 4692 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 4693 4694 if (sc->hn_tx_sysctl_tree != NULL) { 4695 struct sysctl_oid_list *child; 4696 struct sysctl_ctx_list *ctx; 4697 char name[16]; 4698 4699 /* 4700 * Create per TX ring sysctl tree: 4701 * dev.hn.UNIT.tx.RINGID 4702 */ 4703 ctx = device_get_sysctl_ctx(dev); 4704 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 4705 4706 snprintf(name, sizeof(name), "%d", id); 4707 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 4708 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4709 4710 if (txr->hn_tx_sysctl_tree != NULL) { 4711 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 4712 4713 #ifdef HN_DEBUG 4714 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 4715 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 4716 "# of available TX descs"); 4717 #endif 4718 #ifdef HN_IFSTART_SUPPORT 4719 if (!hn_use_if_start) 4720 #endif 4721 { 4722 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 4723 CTLFLAG_RD, &txr->hn_oactive, 0, 4724 "over active"); 4725 } 4726 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 4727 CTLFLAG_RW, &txr->hn_pkts, 4728 "# of packets transmitted"); 4729 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 4730 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 4731 } 4732 } 4733 4734 return 0; 4735 } 4736 4737 static void 4738 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 4739 { 4740 struct hn_tx_ring *txr = txd->txr; 4741 4742 KASSERT(txd->m == NULL, ("still has mbuf installed")); 4743 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 4744 4745 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 4746 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 4747 txd->rndis_pkt_dmap); 4748 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 4749 } 4750 4751 static void 4752 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 4753 { 4754 4755 KASSERT(txd->refs == 0 || txd->refs == 1, 4756 ("invalid txd refs %d", txd->refs)); 4757 4758 /* Aggregated txds will be freed by their aggregating txd. */ 4759 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 4760 int freed; 4761 4762 freed = hn_txdesc_put(txr, txd); 4763 KASSERT(freed, ("can't free txdesc")); 4764 } 4765 } 4766 4767 static void 4768 hn_tx_ring_destroy(struct hn_tx_ring *txr) 4769 { 4770 int i; 4771 4772 if (txr->hn_txdesc == NULL) 4773 return; 4774 4775 /* 4776 * NOTE: 4777 * Because the freeing of aggregated txds will be deferred 4778 * to the aggregating txd, two passes are used here: 4779 * - The first pass GCes any pending txds. This GC is necessary, 4780 * since if the channels are revoked, hypervisor will not 4781 * deliver send-done for all pending txds. 4782 * - The second pass frees the busdma stuffs, i.e. after all txds 4783 * were freed. 
4784 */ 4785 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 4786 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 4787 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 4788 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 4789 4790 if (txr->hn_tx_data_dtag != NULL) 4791 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 4792 if (txr->hn_tx_rndis_dtag != NULL) 4793 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 4794 4795 #ifdef HN_USE_TXDESC_BUFRING 4796 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 4797 #endif 4798 4799 free(txr->hn_txdesc, M_DEVBUF); 4800 txr->hn_txdesc = NULL; 4801 4802 if (txr->hn_mbuf_br != NULL) 4803 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 4804 4805 #ifndef HN_USE_TXDESC_BUFRING 4806 mtx_destroy(&txr->hn_txlist_spin); 4807 #endif 4808 mtx_destroy(&txr->hn_tx_lock); 4809 } 4810 4811 static int 4812 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 4813 { 4814 struct sysctl_oid_list *child; 4815 struct sysctl_ctx_list *ctx; 4816 int i; 4817 4818 /* 4819 * Create TXBUF for chimney sending. 4820 * 4821 * NOTE: It is shared by all channels. 4822 */ 4823 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 4824 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 4825 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4826 if (sc->hn_chim == NULL) { 4827 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 4828 return (ENOMEM); 4829 } 4830 4831 sc->hn_tx_ring_cnt = ring_cnt; 4832 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 4833 4834 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 4835 M_DEVBUF, M_WAITOK | M_ZERO); 4836 4837 ctx = device_get_sysctl_ctx(sc->hn_dev); 4838 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 4839 4840 /* Create dev.hn.UNIT.tx sysctl tree */ 4841 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 4842 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4843 4844 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4845 int error; 4846 4847 error = hn_tx_ring_create(sc, i); 4848 if (error) 4849 return error; 4850 } 4851 4852 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 4853 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4854 __offsetof(struct hn_tx_ring, hn_no_txdescs), 4855 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 4856 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 4857 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4858 __offsetof(struct hn_tx_ring, hn_send_failed), 4859 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 4860 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 4861 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4862 __offsetof(struct hn_tx_ring, hn_txdma_failed), 4863 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 4864 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 4865 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4866 __offsetof(struct hn_tx_ring, hn_flush_failed), 4867 hn_tx_stat_ulong_sysctl, "LU", 4868 "# of packet transmission aggregation flush failure"); 4869 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 4870 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4871 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 4872 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 4873 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 4874 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4875 __offsetof(struct hn_tx_ring, hn_tx_chimney), 4876 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 4877 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 4878 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4879 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 4880 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 4881 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 4882 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 4883 "# of total TX descs"); 4884 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 4885 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 4886 "Chimney send packet size upper boundary"); 4887 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 4888 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4889 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 4890 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 4891 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4892 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 4893 hn_tx_conf_int_sysctl, "I", 4894 "Size of the packet for direct transmission"); 4895 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 4896 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4897 __offsetof(struct hn_tx_ring, hn_sched_tx), 4898 hn_tx_conf_int_sysctl, "I", 4899 "Always schedule transmission " 4900 "instead of doing direct transmission"); 4901 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 4902 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 4903 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 4904 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 4905 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 4906 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 4907 "Applied packet transmission aggregation size"); 4908 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 4909 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 4910 hn_txagg_pktmax_sysctl, "I", 4911 "Applied packet transmission aggregation packets"); 4912 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 4913 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 4914 hn_txagg_align_sysctl, "I", 4915 "Applied packet transmission aggregation alignment"); 4916 4917 return 0; 4918 } 4919 4920 static void 4921 hn_set_chim_size(struct hn_softc *sc, int chim_size) 4922 { 4923 int i; 4924 4925 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4926 sc->hn_tx_ring[i].hn_chim_size = chim_size; 4927 } 4928 4929 static void 4930 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 4931 { 4932 struct ifnet *ifp = sc->hn_ifp; 4933 u_int hw_tsomax; 4934 int tso_minlen; 4935 4936 HN_LOCK_ASSERT(sc); 4937 4938 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 4939 return; 4940 4941 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 4942 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 4943 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 4944 4945 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 4946 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 4947 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 4948 4949 if (tso_maxlen < tso_minlen) 4950 tso_maxlen = tso_minlen; 4951 else if (tso_maxlen > IP_MAXPACKET) 4952 tso_maxlen = IP_MAXPACKET; 4953 if (tso_maxlen > sc->hn_ndis_tso_szmax) 4954 tso_maxlen = sc->hn_ndis_tso_szmax; 4955 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 4956 4957 if (hn_xpnt_vf_isready(sc)) { 4958 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 4959 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 4960 } 4961 ifp->if_hw_tsomax = hw_tsomax; 4962 if (bootverbose) 4963 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 4964 } 4965 4966 static void 4967 hn_fixup_tx_data(struct hn_softc *sc) 4968 { 4969 uint64_t csum_assist; 4970 int i; 4971 4972 hn_set_chim_size(sc, sc->hn_chim_szmax); 4973 if (hn_tx_chimney_size > 0 && 4974 hn_tx_chimney_size < sc->hn_chim_szmax) 4975 hn_set_chim_size(sc, 
hn_tx_chimney_size); 4976 4977 csum_assist = 0; 4978 if (sc->hn_caps & HN_CAP_IPCS) 4979 csum_assist |= CSUM_IP; 4980 if (sc->hn_caps & HN_CAP_TCP4CS) 4981 csum_assist |= CSUM_IP_TCP; 4982 if (sc->hn_caps & HN_CAP_UDP4CS) 4983 csum_assist |= CSUM_IP_UDP; 4984 if (sc->hn_caps & HN_CAP_TCP6CS) 4985 csum_assist |= CSUM_IP6_TCP; 4986 if (sc->hn_caps & HN_CAP_UDP6CS) 4987 csum_assist |= CSUM_IP6_UDP; 4988 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4989 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 4990 4991 if (sc->hn_caps & HN_CAP_HASHVAL) { 4992 /* 4993 * Support HASHVAL pktinfo on TX path. 4994 */ 4995 if (bootverbose) 4996 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 4997 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 4998 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 4999 } 5000 } 5001 5002 static void 5003 hn_destroy_tx_data(struct hn_softc *sc) 5004 { 5005 int i; 5006 5007 if (sc->hn_chim != NULL) { 5008 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5009 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5010 } else { 5011 device_printf(sc->hn_dev, 5012 "chimney sending buffer is referenced"); 5013 } 5014 sc->hn_chim = NULL; 5015 } 5016 5017 if (sc->hn_tx_ring_cnt == 0) 5018 return; 5019 5020 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5021 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5022 5023 free(sc->hn_tx_ring, M_DEVBUF); 5024 sc->hn_tx_ring = NULL; 5025 5026 sc->hn_tx_ring_cnt = 0; 5027 sc->hn_tx_ring_inuse = 0; 5028 } 5029 5030 #ifdef HN_IFSTART_SUPPORT 5031 5032 static void 5033 hn_start_taskfunc(void *xtxr, int pending __unused) 5034 { 5035 struct hn_tx_ring *txr = xtxr; 5036 5037 mtx_lock(&txr->hn_tx_lock); 5038 hn_start_locked(txr, 0); 5039 mtx_unlock(&txr->hn_tx_lock); 5040 } 5041 5042 static int 5043 hn_start_locked(struct hn_tx_ring *txr, int len) 5044 { 5045 struct hn_softc *sc = txr->hn_sc; 5046 struct ifnet *ifp = sc->hn_ifp; 5047 int sched = 0; 5048 5049 KASSERT(hn_use_if_start, 5050 ("hn_start_locked is called, when if_start is disabled")); 5051 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5052 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5053 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5054 5055 if (__predict_false(txr->hn_suspended)) 5056 return (0); 5057 5058 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5059 IFF_DRV_RUNNING) 5060 return (0); 5061 5062 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5063 struct hn_txdesc *txd; 5064 struct mbuf *m_head; 5065 int error; 5066 5067 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5068 if (m_head == NULL) 5069 break; 5070 5071 if (len > 0 && m_head->m_pkthdr.len > len) { 5072 /* 5073 * This sending could be time consuming; let callers 5074 * dispatch this packet sending (and sending of any 5075 * following up packets) to tx taskqueue. 
5076 */ 5077 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5078 sched = 1; 5079 break; 5080 } 5081 5082 #if defined(INET6) || defined(INET) 5083 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5084 m_head = hn_tso_fixup(m_head); 5085 if (__predict_false(m_head == NULL)) { 5086 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5087 continue; 5088 } 5089 } 5090 #endif 5091 5092 txd = hn_txdesc_get(txr); 5093 if (txd == NULL) { 5094 txr->hn_no_txdescs++; 5095 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5096 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5097 break; 5098 } 5099 5100 error = hn_encap(ifp, txr, txd, &m_head); 5101 if (error) { 5102 /* Both txd and m_head are freed */ 5103 KASSERT(txr->hn_agg_txd == NULL, 5104 ("encap failed w/ pending aggregating txdesc")); 5105 continue; 5106 } 5107 5108 if (txr->hn_agg_pktleft == 0) { 5109 if (txr->hn_agg_txd != NULL) { 5110 KASSERT(m_head == NULL, 5111 ("pending mbuf for aggregating txdesc")); 5112 error = hn_flush_txagg(ifp, txr); 5113 if (__predict_false(error)) { 5114 atomic_set_int(&ifp->if_drv_flags, 5115 IFF_DRV_OACTIVE); 5116 break; 5117 } 5118 } else { 5119 KASSERT(m_head != NULL, ("mbuf was freed")); 5120 error = hn_txpkt(ifp, txr, txd); 5121 if (__predict_false(error)) { 5122 /* txd is freed, but m_head is not */ 5123 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5124 atomic_set_int(&ifp->if_drv_flags, 5125 IFF_DRV_OACTIVE); 5126 break; 5127 } 5128 } 5129 } 5130 #ifdef INVARIANTS 5131 else { 5132 KASSERT(txr->hn_agg_txd != NULL, 5133 ("no aggregating txdesc")); 5134 KASSERT(m_head == NULL, 5135 ("pending mbuf for aggregating txdesc")); 5136 } 5137 #endif 5138 } 5139 5140 /* Flush pending aggregated transmission. */ 5141 if (txr->hn_agg_txd != NULL) 5142 hn_flush_txagg(ifp, txr); 5143 return (sched); 5144 } 5145 5146 static void 5147 hn_start(struct ifnet *ifp) 5148 { 5149 struct hn_softc *sc = ifp->if_softc; 5150 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5151 5152 if (txr->hn_sched_tx) 5153 goto do_sched; 5154 5155 if (mtx_trylock(&txr->hn_tx_lock)) { 5156 int sched; 5157 5158 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5159 mtx_unlock(&txr->hn_tx_lock); 5160 if (!sched) 5161 return; 5162 } 5163 do_sched: 5164 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5165 } 5166 5167 static void 5168 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5169 { 5170 struct hn_tx_ring *txr = xtxr; 5171 5172 mtx_lock(&txr->hn_tx_lock); 5173 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5174 hn_start_locked(txr, 0); 5175 mtx_unlock(&txr->hn_tx_lock); 5176 } 5177 5178 static void 5179 hn_start_txeof(struct hn_tx_ring *txr) 5180 { 5181 struct hn_softc *sc = txr->hn_sc; 5182 struct ifnet *ifp = sc->hn_ifp; 5183 5184 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5185 5186 if (txr->hn_sched_tx) 5187 goto do_sched; 5188 5189 if (mtx_trylock(&txr->hn_tx_lock)) { 5190 int sched; 5191 5192 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5193 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5194 mtx_unlock(&txr->hn_tx_lock); 5195 if (sched) { 5196 taskqueue_enqueue(txr->hn_tx_taskq, 5197 &txr->hn_tx_task); 5198 } 5199 } else { 5200 do_sched: 5201 /* 5202 * Release the OACTIVE flag earlier, in the hope that 5203 * others can catch up. The task will clear the 5204 * flag again, with the hn_tx_lock held, to avoid possible 5205 * races.
5206 */ 5207 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5208 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5209 } 5210 } 5211 5212 #endif /* HN_IFSTART_SUPPORT */ 5213 5214 static int 5215 hn_xmit(struct hn_tx_ring *txr, int len) 5216 { 5217 struct hn_softc *sc = txr->hn_sc; 5218 struct ifnet *ifp = sc->hn_ifp; 5219 struct mbuf *m_head; 5220 int sched = 0; 5221 5222 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5223 #ifdef HN_IFSTART_SUPPORT 5224 KASSERT(hn_use_if_start == 0, 5225 ("hn_xmit is called, when if_start is enabled")); 5226 #endif 5227 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5228 5229 if (__predict_false(txr->hn_suspended)) 5230 return (0); 5231 5232 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5233 return (0); 5234 5235 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5236 struct hn_txdesc *txd; 5237 int error; 5238 5239 if (len > 0 && m_head->m_pkthdr.len > len) { 5240 /* 5241 * This sending could be time consuming; let callers 5242 * dispatch this packet sending (and sending of any 5243 * following up packets) to tx taskqueue. 5244 */ 5245 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5246 sched = 1; 5247 break; 5248 } 5249 5250 txd = hn_txdesc_get(txr); 5251 if (txd == NULL) { 5252 txr->hn_no_txdescs++; 5253 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5254 txr->hn_oactive = 1; 5255 break; 5256 } 5257 5258 error = hn_encap(ifp, txr, txd, &m_head); 5259 if (error) { 5260 /* Both txd and m_head are freed; discard */ 5261 KASSERT(txr->hn_agg_txd == NULL, 5262 ("encap failed w/ pending aggregating txdesc")); 5263 drbr_advance(ifp, txr->hn_mbuf_br); 5264 continue; 5265 } 5266 5267 if (txr->hn_agg_pktleft == 0) { 5268 if (txr->hn_agg_txd != NULL) { 5269 KASSERT(m_head == NULL, 5270 ("pending mbuf for aggregating txdesc")); 5271 error = hn_flush_txagg(ifp, txr); 5272 if (__predict_false(error)) { 5273 txr->hn_oactive = 1; 5274 break; 5275 } 5276 } else { 5277 KASSERT(m_head != NULL, ("mbuf was freed")); 5278 error = hn_txpkt(ifp, txr, txd); 5279 if (__predict_false(error)) { 5280 /* txd is freed, but m_head is not */ 5281 drbr_putback(ifp, txr->hn_mbuf_br, 5282 m_head); 5283 txr->hn_oactive = 1; 5284 break; 5285 } 5286 } 5287 } 5288 #ifdef INVARIANTS 5289 else { 5290 KASSERT(txr->hn_agg_txd != NULL, 5291 ("no aggregating txdesc")); 5292 KASSERT(m_head == NULL, 5293 ("pending mbuf for aggregating txdesc")); 5294 } 5295 #endif 5296 5297 /* Sent */ 5298 drbr_advance(ifp, txr->hn_mbuf_br); 5299 } 5300 5301 /* Flush pending aggerated transmission. */ 5302 if (txr->hn_agg_txd != NULL) 5303 hn_flush_txagg(ifp, txr); 5304 return (sched); 5305 } 5306 5307 static int 5308 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5309 { 5310 struct hn_softc *sc = ifp->if_softc; 5311 struct hn_tx_ring *txr; 5312 int error, idx = 0; 5313 5314 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5315 struct rm_priotracker pt; 5316 5317 rm_rlock(&sc->hn_vf_lock, &pt); 5318 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5319 struct mbuf *m_bpf = NULL; 5320 int obytes, omcast; 5321 5322 obytes = m->m_pkthdr.len; 5323 if (m->m_flags & M_MCAST) 5324 omcast = 1; 5325 5326 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5327 if (bpf_peers_present(ifp->if_bpf)) { 5328 m_bpf = m_copypacket(m, M_NOWAIT); 5329 if (m_bpf == NULL) { 5330 /* 5331 * Failed to grab a shallow 5332 * copy; tap now. 
5333 */ 5334 ETHER_BPF_MTAP(ifp, m); 5335 } 5336 } 5337 } else { 5338 ETHER_BPF_MTAP(ifp, m); 5339 } 5340 5341 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 5342 rm_runlock(&sc->hn_vf_lock, &pt); 5343 5344 if (m_bpf != NULL) { 5345 if (!error) 5346 ETHER_BPF_MTAP(ifp, m_bpf); 5347 m_freem(m_bpf); 5348 } 5349 5350 if (error == ENOBUFS) { 5351 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5352 } else if (error) { 5353 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5354 } else { 5355 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 5356 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 5357 if (omcast) { 5358 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 5359 omcast); 5360 } 5361 } 5362 return (error); 5363 } 5364 rm_runlock(&sc->hn_vf_lock, &pt); 5365 } 5366 5367 #if defined(INET6) || defined(INET) 5368 /* 5369 * Perform TSO packet header fixup now, since the TSO 5370 * packet header should be cache-hot. 5371 */ 5372 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 5373 m = hn_tso_fixup(m); 5374 if (__predict_false(m == NULL)) { 5375 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5376 return EIO; 5377 } 5378 } 5379 #endif 5380 5381 /* 5382 * Select the TX ring based on flowid 5383 */ 5384 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 5385 #ifdef RSS 5386 uint32_t bid; 5387 5388 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 5389 &bid) == 0) 5390 idx = bid % sc->hn_tx_ring_inuse; 5391 else 5392 #endif 5393 { 5394 #if defined(INET6) || defined(INET) 5395 int tcpsyn = 0; 5396 5397 if (m->m_pkthdr.len < 128 && 5398 (m->m_pkthdr.csum_flags & 5399 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 5400 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 5401 m = hn_check_tcpsyn(m, &tcpsyn); 5402 if (__predict_false(m == NULL)) { 5403 if_inc_counter(ifp, 5404 IFCOUNTER_OERRORS, 1); 5405 return (EIO); 5406 } 5407 } 5408 #else 5409 const int tcpsyn = 0; 5410 #endif 5411 if (tcpsyn) 5412 idx = 0; 5413 else 5414 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 5415 } 5416 } 5417 txr = &sc->hn_tx_ring[idx]; 5418 5419 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 5420 if (error) { 5421 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5422 return error; 5423 } 5424 5425 if (txr->hn_oactive) 5426 return 0; 5427 5428 if (txr->hn_sched_tx) 5429 goto do_sched; 5430 5431 if (mtx_trylock(&txr->hn_tx_lock)) { 5432 int sched; 5433 5434 sched = hn_xmit(txr, txr->hn_direct_tx_size); 5435 mtx_unlock(&txr->hn_tx_lock); 5436 if (!sched) 5437 return 0; 5438 } 5439 do_sched: 5440 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5441 return 0; 5442 } 5443 5444 static void 5445 hn_tx_ring_qflush(struct hn_tx_ring *txr) 5446 { 5447 struct mbuf *m; 5448 5449 mtx_lock(&txr->hn_tx_lock); 5450 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 5451 m_freem(m); 5452 mtx_unlock(&txr->hn_tx_lock); 5453 } 5454 5455 static void 5456 hn_xmit_qflush(struct ifnet *ifp) 5457 { 5458 struct hn_softc *sc = ifp->if_softc; 5459 struct rm_priotracker pt; 5460 int i; 5461 5462 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 5463 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 5464 if_qflush(ifp); 5465 5466 rm_rlock(&sc->hn_vf_lock, &pt); 5467 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 5468 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 5469 rm_runlock(&sc->hn_vf_lock, &pt); 5470 } 5471 5472 static void 5473 hn_xmit_txeof(struct hn_tx_ring *txr) 5474 { 5475 5476 if (txr->hn_sched_tx) 5477 goto do_sched; 5478 5479 if (mtx_trylock(&txr->hn_tx_lock)) { 5480 int sched; 5481 5482 txr->hn_oactive = 0; 5483 sched = hn_xmit(txr, txr->hn_direct_tx_size); 5484 mtx_unlock(&txr->hn_tx_lock); 5485 if (sched) 
{ 5486 taskqueue_enqueue(txr->hn_tx_taskq, 5487 &txr->hn_tx_task); 5488 } 5489 } else { 5490 do_sched: 5491 /* 5492 * Release the oactive earlier, with the hope, that 5493 * others could catch up. The task will clear the 5494 * oactive again with the hn_tx_lock to avoid possible 5495 * races. 5496 */ 5497 txr->hn_oactive = 0; 5498 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5499 } 5500 } 5501 5502 static void 5503 hn_xmit_taskfunc(void *xtxr, int pending __unused) 5504 { 5505 struct hn_tx_ring *txr = xtxr; 5506 5507 mtx_lock(&txr->hn_tx_lock); 5508 hn_xmit(txr, 0); 5509 mtx_unlock(&txr->hn_tx_lock); 5510 } 5511 5512 static void 5513 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 5514 { 5515 struct hn_tx_ring *txr = xtxr; 5516 5517 mtx_lock(&txr->hn_tx_lock); 5518 txr->hn_oactive = 0; 5519 hn_xmit(txr, 0); 5520 mtx_unlock(&txr->hn_tx_lock); 5521 } 5522 5523 static int 5524 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 5525 { 5526 struct vmbus_chan_br cbr; 5527 struct hn_rx_ring *rxr; 5528 struct hn_tx_ring *txr = NULL; 5529 int idx, error; 5530 5531 idx = vmbus_chan_subidx(chan); 5532 5533 /* 5534 * Link this channel to RX/TX ring. 5535 */ 5536 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 5537 ("invalid channel index %d, should > 0 && < %d", 5538 idx, sc->hn_rx_ring_inuse)); 5539 rxr = &sc->hn_rx_ring[idx]; 5540 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 5541 ("RX ring %d already attached", idx)); 5542 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 5543 rxr->hn_chan = chan; 5544 5545 if (bootverbose) { 5546 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 5547 idx, vmbus_chan_id(chan)); 5548 } 5549 5550 if (idx < sc->hn_tx_ring_inuse) { 5551 txr = &sc->hn_tx_ring[idx]; 5552 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 5553 ("TX ring %d already attached", idx)); 5554 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 5555 5556 txr->hn_chan = chan; 5557 if (bootverbose) { 5558 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 5559 idx, vmbus_chan_id(chan)); 5560 } 5561 } 5562 5563 /* Bind this channel to a proper CPU. */ 5564 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 5565 5566 /* 5567 * Open this channel 5568 */ 5569 cbr.cbr = rxr->hn_br; 5570 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 5571 cbr.cbr_txsz = HN_TXBR_SIZE; 5572 cbr.cbr_rxsz = HN_RXBR_SIZE; 5573 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 5574 if (error) { 5575 if (error == EISCONN) { 5576 if_printf(sc->hn_ifp, "bufring is connected after " 5577 "chan%u open failure\n", vmbus_chan_id(chan)); 5578 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 5579 } else { 5580 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 5581 vmbus_chan_id(chan), error); 5582 } 5583 } 5584 return (error); 5585 } 5586 5587 static void 5588 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 5589 { 5590 struct hn_rx_ring *rxr; 5591 int idx, error; 5592 5593 idx = vmbus_chan_subidx(chan); 5594 5595 /* 5596 * Link this channel to RX/TX ring. 
5597 */ 5598 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 5599 ("invalid channel index %d, should be >= 0 && < %d", 5600 idx, sc->hn_rx_ring_inuse)); 5601 rxr = &sc->hn_rx_ring[idx]; 5602 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 5603 ("RX ring %d is not attached", idx)); 5604 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 5605 5606 if (idx < sc->hn_tx_ring_inuse) { 5607 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 5608 5609 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 5610 ("TX ring %d is not attached", idx)); 5611 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 5612 } 5613 5614 /* 5615 * Close this channel. 5616 * 5617 * NOTE: 5618 * Channel closing does _not_ destroy the target channel. 5619 */ 5620 error = vmbus_chan_close_direct(chan); 5621 if (error == EISCONN) { 5622 if_printf(sc->hn_ifp, "chan%u bufring is connected " 5623 "after being closed\n", vmbus_chan_id(chan)); 5624 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 5625 } else if (error) { 5626 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 5627 vmbus_chan_id(chan), error); 5628 } 5629 } 5630 5631 static int 5632 hn_attach_subchans(struct hn_softc *sc) 5633 { 5634 struct vmbus_channel **subchans; 5635 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 5636 int i, error = 0; 5637 5638 KASSERT(subchan_cnt > 0, ("no sub-channels")); 5639 5640 /* Attach the sub-channels. */ 5641 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 5642 for (i = 0; i < subchan_cnt; ++i) { 5643 int error1; 5644 5645 error1 = hn_chan_attach(sc, subchans[i]); 5646 if (error1) { 5647 error = error1; 5648 /* Move on; all channels will be detached later. */ 5649 } 5650 } 5651 vmbus_subchan_rel(subchans, subchan_cnt); 5652 5653 if (error) { 5654 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 5655 } else { 5656 if (bootverbose) { 5657 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 5658 subchan_cnt); 5659 } 5660 } 5661 return (error); 5662 } 5663 5664 static void 5665 hn_detach_allchans(struct hn_softc *sc) 5666 { 5667 struct vmbus_channel **subchans; 5668 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 5669 int i; 5670 5671 if (subchan_cnt == 0) 5672 goto back; 5673 5674 /* Detach the sub-channels. */ 5675 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 5676 for (i = 0; i < subchan_cnt; ++i) 5677 hn_chan_detach(sc, subchans[i]); 5678 vmbus_subchan_rel(subchans, subchan_cnt); 5679 5680 back: 5681 /* 5682 * Detach the primary channel, _after_ all sub-channels 5683 * are detached. 5684 */ 5685 hn_chan_detach(sc, sc->hn_prichan); 5686 5687 /* Wait for sub-channels to be destroyed, if any. */ 5688 vmbus_subchan_drain(sc->hn_prichan); 5689 5690 #ifdef INVARIANTS 5691 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5692 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 5693 HN_RX_FLAG_ATTACHED) == 0, 5694 ("%dth RX ring is still attached", i)); 5695 } 5696 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5697 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 5698 HN_TX_FLAG_ATTACHED) == 0, 5699 ("%dth TX ring is still attached", i)); 5700 } 5701 #endif 5702 } 5703 5704 static int 5705 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 5706 { 5707 struct vmbus_channel **subchans; 5708 int nchan, rxr_cnt, error; 5709 5710 nchan = *nsubch + 1; 5711 if (nchan == 1) { 5712 /* 5713 * Multiple RX/TX rings are not requested. 5714 */ 5715 *nsubch = 0; 5716 return (0); 5717 } 5718 5719 /* 5720 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 5721 * table entries.
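* Added note: the RX ring count reported here caps how many sub-channels are requested from NVS below.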
5722 */ 5723 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 5724 if (error) { 5725 /* No RSS; this is benign. */ 5726 *nsubch = 0; 5727 return (0); 5728 } 5729 if (bootverbose) { 5730 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 5731 rxr_cnt, nchan); 5732 } 5733 5734 if (nchan > rxr_cnt) 5735 nchan = rxr_cnt; 5736 if (nchan == 1) { 5737 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 5738 *nsubch = 0; 5739 return (0); 5740 } 5741 5742 /* 5743 * Allocate sub-channels from NVS. 5744 */ 5745 *nsubch = nchan - 1; 5746 error = hn_nvs_alloc_subchans(sc, nsubch); 5747 if (error || *nsubch == 0) { 5748 /* Failed to allocate sub-channels. */ 5749 *nsubch = 0; 5750 return (0); 5751 } 5752 5753 /* 5754 * Wait for all sub-channels to become ready before moving on. 5755 */ 5756 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 5757 vmbus_subchan_rel(subchans, *nsubch); 5758 return (0); 5759 } 5760 5761 static bool 5762 hn_synth_attachable(const struct hn_softc *sc) 5763 { 5764 int i; 5765 5766 if (sc->hn_flags & HN_FLAG_ERRORS) 5767 return (false); 5768 5769 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5770 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5771 5772 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 5773 return (false); 5774 } 5775 return (true); 5776 } 5777 5778 /* 5779 * Make sure that the RX filter is zero after the successful 5780 * RNDIS initialization. 5781 * 5782 * NOTE: 5783 * Under certain conditions on certain versions of Hyper-V, 5784 * the RNDIS rxfilter is _not_ zero on the hypervisor side 5785 * after the successful RNDIS initialization, which breaks 5786 * the assumption of any following code (well, it breaks the 5787 * RNDIS API contract actually). Clear the RNDIS rxfilter 5788 * explicitly, drain packets sneaking through, and drain the 5789 * interrupt taskqueues scheduled due to the stealth packets. 5790 */ 5791 static void 5792 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 5793 { 5794 5795 hn_disable_rx(sc); 5796 hn_drain_rxtx(sc, nchan); 5797 } 5798 5799 static int 5800 hn_synth_attach(struct hn_softc *sc, int mtu) 5801 { 5802 #define ATTACHED_NVS 0x0002 5803 #define ATTACHED_RNDIS 0x0004 5804 5805 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 5806 int error, nsubch, nchan = 1, i, rndis_inited; 5807 uint32_t old_caps, attached = 0; 5808 5809 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 5810 ("synthetic parts were attached")); 5811 5812 if (!hn_synth_attachable(sc)) 5813 return (ENXIO); 5814 5815 /* Save capabilities for later verification. */ 5816 old_caps = sc->hn_caps; 5817 sc->hn_caps = 0; 5818 5819 /* Clear RSS stuffs. */ 5820 sc->hn_rss_ind_size = 0; 5821 sc->hn_rss_hash = 0; 5822 5823 /* 5824 * Attach the primary channel _before_ attaching NVS and RNDIS. 5825 */ 5826 error = hn_chan_attach(sc, sc->hn_prichan); 5827 if (error) 5828 goto failed; 5829 5830 /* 5831 * Attach NVS. 5832 */ 5833 error = hn_nvs_attach(sc, mtu); 5834 if (error) 5835 goto failed; 5836 attached |= ATTACHED_NVS; 5837 5838 /* 5839 * Attach RNDIS _after_ NVS is attached. 5840 */ 5841 error = hn_rndis_attach(sc, mtu, &rndis_inited); 5842 if (rndis_inited) 5843 attached |= ATTACHED_RNDIS; 5844 if (error) 5845 goto failed; 5846 5847 /* 5848 * Make sure capabilities are not changed. 
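* Added note: this is only enforced on a re-attach, i.e. when the device is already attached.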
5849 */ 5850 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 5851 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 5852 old_caps, sc->hn_caps); 5853 error = ENXIO; 5854 goto failed; 5855 } 5856 5857 /* 5858 * Allocate sub-channels for multi-TX/RX rings. 5859 * 5860 * NOTE: 5861 * The # of RX rings that can be used is equivalent to the # of 5862 * channels to be requested. 5863 */ 5864 nsubch = sc->hn_rx_ring_cnt - 1; 5865 error = hn_synth_alloc_subchans(sc, &nsubch); 5866 if (error) 5867 goto failed; 5868 /* NOTE: _Full_ synthetic parts detach is required now. */ 5869 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 5870 5871 /* 5872 * Set the # of TX/RX rings that could be used according to 5873 * the # of channels that NVS offered. 5874 */ 5875 nchan = nsubch + 1; 5876 hn_set_ring_inuse(sc, nchan); 5877 if (nchan == 1) { 5878 /* Only the primary channel can be used; done */ 5879 goto back; 5880 } 5881 5882 /* 5883 * Attach the sub-channels. 5884 * 5885 * NOTE: hn_set_ring_inuse() _must_ have been called. 5886 */ 5887 error = hn_attach_subchans(sc); 5888 if (error) 5889 goto failed; 5890 5891 /* 5892 * Configure RSS key and indirect table _after_ all sub-channels 5893 * are attached. 5894 */ 5895 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 5896 /* 5897 * RSS key is not set yet; set it to the default RSS key. 5898 */ 5899 if (bootverbose) 5900 if_printf(sc->hn_ifp, "setup default RSS key\n"); 5901 #ifdef RSS 5902 rss_getkey(rss->rss_key); 5903 #else 5904 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 5905 #endif 5906 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 5907 } 5908 5909 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 5910 /* 5911 * RSS indirect table is not set yet; set it up in round- 5912 * robin fashion. 5913 */ 5914 if (bootverbose) { 5915 if_printf(sc->hn_ifp, "setup default RSS indirect " 5916 "table\n"); 5917 } 5918 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 5919 uint32_t subidx; 5920 5921 #ifdef RSS 5922 subidx = rss_get_indirection_to_bucket(i); 5923 #else 5924 subidx = i; 5925 #endif 5926 rss->rss_ind[i] = subidx % nchan; 5927 } 5928 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 5929 } else { 5930 /* 5931 * # of usable channels may have changed, so we have to 5932 * make sure that all entries in the RSS indirect table 5933 * are valid. 5934 * 5935 * NOTE: hn_set_ring_inuse() _must_ have been called. 5936 */ 5937 hn_rss_ind_fixup(sc); 5938 } 5939 5940 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 5941 if (error) 5942 goto failed; 5943 back: 5944 /* 5945 * Fixup transmission aggregation setup. 5946 */ 5947 hn_set_txagg(sc); 5948 hn_rndis_init_fixat(sc, nchan); 5949 return (0); 5950 5951 failed: 5952 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 5953 hn_rndis_init_fixat(sc, nchan); 5954 hn_synth_detach(sc); 5955 } else { 5956 if (attached & ATTACHED_RNDIS) { 5957 hn_rndis_init_fixat(sc, nchan); 5958 hn_rndis_detach(sc); 5959 } 5960 if (attached & ATTACHED_NVS) 5961 hn_nvs_detach(sc); 5962 hn_chan_detach(sc, sc->hn_prichan); 5963 /* Restore old capabilities. */ 5964 sc->hn_caps = old_caps; 5965 } 5966 return (error); 5967 5968 #undef ATTACHED_RNDIS 5969 #undef ATTACHED_NVS 5970 } 5971 5972 /* 5973 * NOTE: 5974 * The interface must have been suspended through hn_suspend(), before 5975 * this function gets called. 5976 */ 5977 static void 5978 hn_synth_detach(struct hn_softc *sc) 5979 { 5980 5981 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 5982 ("synthetic parts were not attached")); 5983 5984 /* Detach the RNDIS first.
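* Added note: RNDIS sits on top of NVS, so teardown is the reverse of attach: RNDIS, then NVS, then the channels.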
*/ 5985 hn_rndis_detach(sc); 5986 5987 /* Detach NVS. */ 5988 hn_nvs_detach(sc); 5989 5990 /* Detach all of the channels. */ 5991 hn_detach_allchans(sc); 5992 5993 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 5994 } 5995 5996 static void 5997 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 5998 { 5999 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6000 ("invalid ring count %d", ring_cnt)); 6001 6002 if (sc->hn_tx_ring_cnt > ring_cnt) 6003 sc->hn_tx_ring_inuse = ring_cnt; 6004 else 6005 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6006 sc->hn_rx_ring_inuse = ring_cnt; 6007 6008 #ifdef RSS 6009 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6010 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6011 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6012 rss_getnumbuckets()); 6013 } 6014 #endif 6015 6016 if (bootverbose) { 6017 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6018 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6019 } 6020 } 6021 6022 static void 6023 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6024 { 6025 6026 /* 6027 * NOTE: 6028 * The TX bufring will not be drained by the hypervisor, 6029 * if the primary channel is revoked. 6030 */ 6031 while (!vmbus_chan_rx_empty(chan) || 6032 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6033 !vmbus_chan_tx_empty(chan))) 6034 pause("waitch", 1); 6035 vmbus_chan_intr_drain(chan); 6036 } 6037 6038 static void 6039 hn_disable_rx(struct hn_softc *sc) 6040 { 6041 6042 /* 6043 * Disable RX by clearing RX filter forcefully. 6044 */ 6045 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6046 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6047 6048 /* 6049 * Give RNDIS enough time to flush all pending data packets. 6050 */ 6051 pause("waitrx", (200 * hz) / 1000); 6052 } 6053 6054 /* 6055 * NOTE: 6056 * RX/TX _must_ have been suspended/disabled, before this function 6057 * is called. 6058 */ 6059 static void 6060 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6061 { 6062 struct vmbus_channel **subch = NULL; 6063 int nsubch; 6064 6065 /* 6066 * Drain RX/TX bufrings and interrupts. 6067 */ 6068 nsubch = nchan - 1; 6069 if (nsubch > 0) 6070 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6071 6072 if (subch != NULL) { 6073 int i; 6074 6075 for (i = 0; i < nsubch; ++i) 6076 hn_chan_drain(sc, subch[i]); 6077 } 6078 hn_chan_drain(sc, sc->hn_prichan); 6079 6080 if (subch != NULL) 6081 vmbus_subchan_rel(subch, nsubch); 6082 } 6083 6084 static void 6085 hn_suspend_data(struct hn_softc *sc) 6086 { 6087 struct hn_tx_ring *txr; 6088 int i; 6089 6090 HN_LOCK_ASSERT(sc); 6091 6092 /* 6093 * Suspend TX. 6094 */ 6095 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6096 txr = &sc->hn_tx_ring[i]; 6097 6098 mtx_lock(&txr->hn_tx_lock); 6099 txr->hn_suspended = 1; 6100 mtx_unlock(&txr->hn_tx_lock); 6101 /* No one is able send more packets now. */ 6102 6103 /* 6104 * Wait for all pending sends to finish. 6105 * 6106 * NOTE: 6107 * We will _not_ receive all pending send-done, if the 6108 * primary channel is revoked. 6109 */ 6110 while (hn_tx_ring_pending(txr) && 6111 !vmbus_chan_is_revoked(sc->hn_prichan)) 6112 pause("hnwtx", 1 /* 1 tick */); 6113 } 6114 6115 /* 6116 * Disable RX. 6117 */ 6118 hn_disable_rx(sc); 6119 6120 /* 6121 * Drain RX/TX. 6122 */ 6123 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6124 6125 /* 6126 * Drain any pending TX tasks. 6127 * 6128 * NOTE: 6129 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6130 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}

static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device is receiving packets, so the data path of the
	 * synthetic device must be suspended.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may have
		 * been reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use the txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
6260 */ 6261 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6262 hn_change_network(sc); 6263 else 6264 hn_update_link_status(sc); 6265 } 6266 6267 static void 6268 hn_resume(struct hn_softc *sc) 6269 { 6270 6271 /* 6272 * If the non-transparent mode VF is activated, the synthetic 6273 * device have to receive packets, so the data path of the 6274 * synthetic device must be resumed. 6275 */ 6276 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6277 (sc->hn_flags & HN_FLAG_RXVF)) 6278 hn_resume_data(sc); 6279 6280 /* 6281 * Don't resume link status change if VF is attached/activated. 6282 * - In the non-transparent VF mode, the synthetic device marks 6283 * link down until the VF is deactivated; i.e. VF is down. 6284 * - In transparent VF mode, VF's media status is used until 6285 * the VF is detached. 6286 */ 6287 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 6288 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 6289 hn_resume_mgmt(sc); 6290 6291 /* 6292 * Re-enable polling if this interface is running and 6293 * the polling is requested. 6294 */ 6295 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 6296 hn_polling(sc, sc->hn_pollhz); 6297 } 6298 6299 static void 6300 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 6301 { 6302 const struct rndis_status_msg *msg; 6303 int ofs; 6304 6305 if (dlen < sizeof(*msg)) { 6306 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 6307 return; 6308 } 6309 msg = data; 6310 6311 switch (msg->rm_status) { 6312 case RNDIS_STATUS_MEDIA_CONNECT: 6313 case RNDIS_STATUS_MEDIA_DISCONNECT: 6314 hn_update_link_status(sc); 6315 break; 6316 6317 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 6318 case RNDIS_STATUS_LINK_SPEED_CHANGE: 6319 /* Not really useful; ignore. */ 6320 break; 6321 6322 case RNDIS_STATUS_NETWORK_CHANGE: 6323 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 6324 if (dlen < ofs + msg->rm_stbuflen || 6325 msg->rm_stbuflen < sizeof(uint32_t)) { 6326 if_printf(sc->hn_ifp, "network changed\n"); 6327 } else { 6328 uint32_t change; 6329 6330 memcpy(&change, ((const uint8_t *)msg) + ofs, 6331 sizeof(change)); 6332 if_printf(sc->hn_ifp, "network changed, change %u\n", 6333 change); 6334 } 6335 hn_change_network(sc); 6336 break; 6337 6338 default: 6339 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 6340 msg->rm_status); 6341 break; 6342 } 6343 } 6344 6345 static int 6346 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 6347 { 6348 const struct rndis_pktinfo *pi = info_data; 6349 uint32_t mask = 0; 6350 6351 while (info_dlen != 0) { 6352 const void *data; 6353 uint32_t dlen; 6354 6355 if (__predict_false(info_dlen < sizeof(*pi))) 6356 return (EINVAL); 6357 if (__predict_false(info_dlen < pi->rm_size)) 6358 return (EINVAL); 6359 info_dlen -= pi->rm_size; 6360 6361 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 6362 return (EINVAL); 6363 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 6364 return (EINVAL); 6365 dlen = pi->rm_size - pi->rm_pktinfooffset; 6366 data = pi->rm_data; 6367 6368 switch (pi->rm_type) { 6369 case NDIS_PKTINFO_TYPE_VLAN: 6370 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 6371 return (EINVAL); 6372 info->vlan_info = *((const uint32_t *)data); 6373 mask |= HN_RXINFO_VLAN; 6374 break; 6375 6376 case NDIS_PKTINFO_TYPE_CSUM: 6377 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 6378 return (EINVAL); 6379 info->csum_info = *((const uint32_t *)data); 6380 mask |= HN_RXINFO_CSUM; 6381 break; 6382 6383 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 6384 if 
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlap data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
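
/*
 * Dispatch an incoming RNDIS message: data packets take the hot path
 * into hn_rndis_rx_data(), status indications go to hn_rndis_rx_status(),
 * and everything else is handed to the RNDIS control path.
 */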
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}

static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}

static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}

static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is an RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
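
/*
 * Complete the RXBUF transaction back to the host so that the RXBUF
 * region can be recycled; retry briefly if the channel's bufring is
 * temporarily full.
 */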
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}

static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
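
/*
 * Driver-wide initialization: sanitize the transparent VF tunables,
 * allocate the VF map, and create the optional global TX taskqueues.
 */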
static void
hn_sysinit(void *arg __unused)
{
	int i;

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routine "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize the VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);

static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}

	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);