/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hn.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#ifdef RSS
#include <net/rss_config.h>
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *, void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static void	hn_ifnet_event(void *, struct ifnet *, int);
static void	hn_ifaddr_event(void *, struct ifnet *);
static void	hn_ifnet_attevent(void *, struct ifnet *);
static void	hn_ifnet_detevent(void *, struct ifnet *);
static void	hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool	hn_ismyvf(const struct hn_softc *, const struct ifnet *);
static void	hn_rxvf_change(struct hn_softc *, struct ifnet *, bool);
static void	hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void	hn_rxvf_set_task(void *, int);
static void	hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int	hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int	hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *);
static void	hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool	hn_xpnt_vf_isready(struct hn_softc *);
static void	hn_xpnt_vf_setready(struct hn_softc *);
static void	hn_xpnt_vf_init_taskfunc(void *, int);
static void	hn_xpnt_vf_init(struct hn_softc *);
static void	hn_xpnt_vf_setenable(struct hn_softc *);
static void	hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void	hn_vf_rss_fixup(struct hn_softc *, bool);
static void	hn_vf_rss_restore(struct hn_softc *);

static int	hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *, const void *, int);
static void	hn_rndis_init_fixat(struct hn_softc *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
		    uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void	hn_disable_rx(struct hn_softc *);
static void	hn_drain_rxtx(struct hn_softc *, int);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);
static void	hn_mtu_change_fixup(struct hn_softc *);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
static int	hn_rss_reconfig(struct hn_softc *);
static void	hn_rss_ind_fixup(struct hn_softc *);
static void	hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int	hn_rxpkt(struct hn_rx_ring *, const void *, int,
		    const struct hn_rxinfo *);
static uint32_t	hn_rss_type_fromndis(uint32_t);
static uint32_t	hn_rss_type_tondis(uint32_t);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *,
		    struct vmbus_channel *, const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;			/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

/*
 * Allocate a slot in the chimney sending buffer bitmap; returns
 * HN_NVS_CHIM_IDX_INVALID when no free slot is available.
 */
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}

static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}

static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}

static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}

static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed. "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}

static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}

static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}

static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
*/ 1824 hn_xpnt_vf_setenable(sc); 1825 } 1826 1827 static void 1828 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1829 { 1830 struct hn_softc *sc = xsc; 1831 1832 HN_LOCK(sc); 1833 1834 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1835 goto done; 1836 if (sc->hn_vf_ifp == NULL) 1837 goto done; 1838 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1839 goto done; 1840 1841 if (sc->hn_vf_rdytick != 0) { 1842 /* Mark VF as ready. */ 1843 hn_xpnt_vf_setready(sc); 1844 } 1845 1846 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1847 /* 1848 * Delayed VF initialization. 1849 */ 1850 if (bootverbose) { 1851 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1852 sc->hn_vf_ifp->if_xname); 1853 } 1854 hn_xpnt_vf_init(sc); 1855 } 1856 done: 1857 HN_UNLOCK(sc); 1858 } 1859 1860 static void 1861 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1862 { 1863 struct hn_softc *sc = xsc; 1864 1865 HN_LOCK(sc); 1866 1867 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1868 goto done; 1869 1870 if (!hn_ismyvf(sc, ifp)) 1871 goto done; 1872 1873 if (sc->hn_vf_ifp != NULL) { 1874 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1875 sc->hn_vf_ifp->if_xname); 1876 goto done; 1877 } 1878 1879 if (hn_xpnt_vf && ifp->if_start != NULL) { 1880 /* 1881 * ifnet.if_start is _not_ supported by transparent 1882 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1883 */ 1884 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1885 "in transparent VF mode.\n", ifp->if_xname); 1886 goto done; 1887 } 1888 1889 rm_wlock(&hn_vfmap_lock); 1890 1891 if (ifp->if_index >= hn_vfmap_size) { 1892 struct ifnet **newmap; 1893 int newsize; 1894 1895 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1896 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1897 M_WAITOK | M_ZERO); 1898 1899 memcpy(newmap, hn_vfmap, 1900 sizeof(struct ifnet *) * hn_vfmap_size); 1901 free(hn_vfmap, M_DEVBUF); 1902 hn_vfmap = newmap; 1903 hn_vfmap_size = newsize; 1904 } 1905 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1906 ("%s: ifindex %d was mapped to %s", 1907 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1908 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1909 1910 rm_wunlock(&hn_vfmap_lock); 1911 1912 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1913 rm_wlock(&sc->hn_vf_lock); 1914 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1915 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1916 sc->hn_vf_ifp = ifp; 1917 rm_wunlock(&sc->hn_vf_lock); 1918 1919 if (hn_xpnt_vf) { 1920 int wait_ticks; 1921 1922 /* 1923 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1924 * Save vf_ifp's current if_input for later restoration. 1925 */ 1926 sc->hn_vf_input = ifp->if_input; 1927 ifp->if_input = hn_xpnt_vf_input; 1928 1929 /* 1930 * Stop link status management; use the VF's. 1931 */ 1932 hn_suspend_mgmt(sc); 1933 1934 /* 1935 * Give VF sometime to complete its attach routing. 1936 */ 1937 wait_ticks = hn_xpnt_vf_attwait * hz; 1938 sc->hn_vf_rdytick = ticks + wait_ticks; 1939 1940 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1941 wait_ticks); 1942 } 1943 done: 1944 HN_UNLOCK(sc); 1945 } 1946 1947 static void 1948 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1949 { 1950 struct hn_softc *sc = xsc; 1951 1952 HN_LOCK(sc); 1953 1954 if (sc->hn_vf_ifp == NULL) 1955 goto done; 1956 1957 if (!hn_ismyvf(sc, ifp)) 1958 goto done; 1959 1960 if (hn_xpnt_vf) { 1961 /* 1962 * Make sure that the delayed initialization is not running. 
1963 * 1964 * NOTE: 1965 * - This lock _must_ be released, since the hn_vf_init task 1966 * will try holding this lock. 1967 * - It is safe to release this lock here, since the 1968 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 1969 * 1970 * XXX racy, if hn(4) is ever detached. 1971 */ 1972 HN_UNLOCK(sc); 1973 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 1974 HN_LOCK(sc); 1975 1976 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 1977 sc->hn_ifp->if_xname)); 1978 ifp->if_input = sc->hn_vf_input; 1979 sc->hn_vf_input = NULL; 1980 1981 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 1982 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 1983 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 1984 1985 if (sc->hn_vf_rdytick == 0) { 1986 /* 1987 * The VF was ready; restore some settings. 1988 */ 1989 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 1990 /* 1991 * NOTE: 1992 * There is _no_ need to fixup if_capenable and 1993 * if_hwassist, since the if_capabilities before 1994 * restoration was an intersection of the VF's 1995 * if_capabilities and the synthetic device's 1996 * if_capabilities. 1997 */ 1998 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 1999 sc->hn_ifp->if_hw_tsomaxsegcount = 2000 sc->hn_saved_tsosegcnt; 2001 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2002 } 2003 2004 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2005 /* 2006 * Restore RSS settings. 2007 */ 2008 hn_vf_rss_restore(sc); 2009 2010 /* 2011 * Resume link status management, which was suspended 2012 * by hn_ifnet_attevent(). 2013 */ 2014 hn_resume_mgmt(sc); 2015 } 2016 } 2017 2018 /* Mark transparent mode VF as disabled. */ 2019 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2020 2021 rm_wlock(&hn_vfmap_lock); 2022 2023 KASSERT(ifp->if_index < hn_vfmap_size, 2024 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2025 if (hn_vfmap[ifp->if_index] != NULL) { 2026 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2027 ("%s: ifindex %d was mapped to %s", 2028 ifp->if_xname, ifp->if_index, 2029 hn_vfmap[ifp->if_index]->if_xname)); 2030 hn_vfmap[ifp->if_index] = NULL; 2031 } 2032 2033 rm_wunlock(&hn_vfmap_lock); 2034 done: 2035 HN_UNLOCK(sc); 2036 } 2037 2038 static void 2039 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2040 { 2041 struct hn_softc *sc = xsc; 2042 2043 if (sc->hn_vf_ifp == ifp) 2044 if_link_state_change(sc->hn_ifp, link_state); 2045 } 2046 2047 static int 2048 hn_probe(device_t dev) 2049 { 2050 2051 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2052 device_set_desc(dev, "Hyper-V Network Interface"); 2053 return BUS_PROBE_DEFAULT; 2054 } 2055 return ENXIO; 2056 } 2057 2058 static int 2059 hn_attach(device_t dev) 2060 { 2061 struct hn_softc *sc = device_get_softc(dev); 2062 struct sysctl_oid_list *child; 2063 struct sysctl_ctx_list *ctx; 2064 uint8_t eaddr[ETHER_ADDR_LEN]; 2065 struct ifnet *ifp = NULL; 2066 int error, ring_cnt, tx_ring_cnt; 2067 uint32_t mtu; 2068 2069 sc->hn_dev = dev; 2070 sc->hn_prichan = vmbus_get_channel(dev); 2071 HN_LOCK_INIT(sc); 2072 rm_init(&sc->hn_vf_lock, "hnvf"); 2073 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2074 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2075 2076 /* 2077 * Initialize these tunables once. 2078 */ 2079 sc->hn_agg_size = hn_tx_agg_size; 2080 sc->hn_agg_pkts = hn_tx_agg_pkts; 2081 2082 /* 2083 * Setup taskqueue for transmission.
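*
* Two modes are handled below: HN_TX_TASKQ_M_INDEP creates
* hn_tx_taskq_cnt dedicated "hn_tx" taskqueues for this device, while
* HN_TX_TASKQ_M_GLOBAL reuses the shared hn_tx_taskque array; any
* other mode simply leaves sc->hn_tx_taskqs NULL here.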
2084 */ 2085 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2086 int i; 2087 2088 sc->hn_tx_taskqs = 2089 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2090 M_DEVBUF, M_WAITOK); 2091 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2092 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2093 M_WAITOK, taskqueue_thread_enqueue, 2094 &sc->hn_tx_taskqs[i]); 2095 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2096 "%s tx%d", device_get_nameunit(dev), i); 2097 } 2098 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2099 sc->hn_tx_taskqs = hn_tx_taskque; 2100 } 2101 2102 /* 2103 * Setup taskqueue for mangement tasks, e.g. link status. 2104 */ 2105 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2106 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2107 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2108 device_get_nameunit(dev)); 2109 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2110 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2111 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2112 hn_netchg_status_taskfunc, sc); 2113 2114 if (hn_xpnt_vf) { 2115 /* 2116 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2117 */ 2118 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2119 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2120 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2121 device_get_nameunit(dev)); 2122 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2123 hn_xpnt_vf_init_taskfunc, sc); 2124 } 2125 2126 /* 2127 * Allocate ifnet and setup its name earlier, so that if_printf 2128 * can be used by functions, which will be called after 2129 * ether_ifattach(). 2130 */ 2131 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2132 ifp->if_softc = sc; 2133 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2134 2135 /* 2136 * Initialize ifmedia earlier so that it can be unconditionally 2137 * destroyed, if error happened later on. 2138 */ 2139 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2140 2141 /* 2142 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2143 * to use (tx_ring_cnt). 2144 * 2145 * NOTE: 2146 * The # of RX rings to use is same as the # of channels to use. 2147 */ 2148 ring_cnt = hn_chan_cnt; 2149 if (ring_cnt <= 0) { 2150 /* Default */ 2151 ring_cnt = mp_ncpus; 2152 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2153 ring_cnt = HN_RING_CNT_DEF_MAX; 2154 } else if (ring_cnt > mp_ncpus) { 2155 ring_cnt = mp_ncpus; 2156 } 2157 #ifdef RSS 2158 if (ring_cnt > rss_getnumbuckets()) 2159 ring_cnt = rss_getnumbuckets(); 2160 #endif 2161 2162 tx_ring_cnt = hn_tx_ring_cnt; 2163 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2164 tx_ring_cnt = ring_cnt; 2165 #ifdef HN_IFSTART_SUPPORT 2166 if (hn_use_if_start) { 2167 /* ifnet.if_start only needs one TX ring. */ 2168 tx_ring_cnt = 1; 2169 } 2170 #endif 2171 2172 /* 2173 * Set the leader CPU for channels. 2174 */ 2175 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2176 2177 /* 2178 * Create enough TX/RX rings, even if only limited number of 2179 * channels can be allocated. 2180 */ 2181 error = hn_create_tx_data(sc, tx_ring_cnt); 2182 if (error) 2183 goto failed; 2184 error = hn_create_rx_data(sc, ring_cnt); 2185 if (error) 2186 goto failed; 2187 2188 /* 2189 * Create transaction context for NVS and RNDIS transactions. 
2190 */ 2191 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2192 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2193 if (sc->hn_xact == NULL) { 2194 error = ENXIO; 2195 goto failed; 2196 } 2197 2198 /* 2199 * Install orphan handler for the revocation of this device's 2200 * primary channel. 2201 * 2202 * NOTE: 2203 * The processing order is critical here: 2204 * Install the orphan handler, _before_ testing whether this 2205 * device's primary channel has been revoked or not. 2206 */ 2207 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2208 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2209 error = ENXIO; 2210 goto failed; 2211 } 2212 2213 /* 2214 * Attach the synthetic parts, i.e. NVS and RNDIS. 2215 */ 2216 error = hn_synth_attach(sc, ETHERMTU); 2217 if (error) 2218 goto failed; 2219 2220 error = hn_rndis_get_eaddr(sc, eaddr); 2221 if (error) 2222 goto failed; 2223 2224 error = hn_rndis_get_mtu(sc, &mtu); 2225 if (error) 2226 mtu = ETHERMTU; 2227 else if (bootverbose) 2228 device_printf(dev, "RNDIS mtu %u\n", mtu); 2229 2230 #if __FreeBSD_version >= 1100099 2231 if (sc->hn_rx_ring_inuse > 1) { 2232 /* 2233 * Reduce TCP segment aggregation limit for multiple 2234 * RX rings to increase ACK timeliness. 2235 */ 2236 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2237 } 2238 #endif 2239 2240 /* 2241 * Fixup TX stuffs after synthetic parts are attached. 2242 */ 2243 hn_fixup_tx_data(sc); 2244 2245 ctx = device_get_sysctl_ctx(dev); 2246 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2247 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2248 &sc->hn_nvs_ver, 0, "NVS version"); 2249 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2250 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2251 hn_ndis_version_sysctl, "A", "NDIS version"); 2252 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2253 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2254 hn_caps_sysctl, "A", "capabilities"); 2255 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2256 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2257 hn_hwassist_sysctl, "A", "hwassist"); 2258 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2259 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2260 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2261 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2262 "max # of TSO segments"); 2263 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2264 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2265 "max size of TSO segment"); 2266 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2267 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2268 hn_rxfilter_sysctl, "A", "rxfilter"); 2269 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2270 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2271 hn_rss_hash_sysctl, "A", "RSS hash"); 2272 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2273 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2274 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2275 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2276 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2277 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2278 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2279 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2280 #ifndef RSS 2281 /* 2282 * Don't allow RSS key/indirect table changes, if RSS is defined. 
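*
* In other words, the rss_key and rss_ind sysctl nodes below only
* exist on kernels built without "options RSS"; with RSS compiled in,
* the kernel RSS layer owns the key and the indirection table.  On a
* non-RSS kernel they can typically be inspected with e.g.
* "sysctl -x dev.hn.0.rss_key" (assuming unit 0).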
2283 */ 2284 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2285 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2286 hn_rss_key_sysctl, "IU", "RSS key"); 2287 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2288 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2289 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2290 #endif 2291 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2292 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2293 "RNDIS offered packet transmission aggregation size limit"); 2294 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2295 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2296 "RNDIS offered packet transmission aggregation count limit"); 2297 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2298 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2299 "RNDIS packet transmission aggregation alignment"); 2300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2301 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2302 hn_txagg_size_sysctl, "I", 2303 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2304 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2305 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2306 hn_txagg_pkts_sysctl, "I", 2307 "Packet transmission aggregation packets, " 2308 "0 -- disable, -1 -- auto"); 2309 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2310 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2311 hn_polling_sysctl, "I", 2312 "Polling frequency: [100,1000000], 0 disable polling"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2314 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2315 hn_vf_sysctl, "A", "Virtual Function's name"); 2316 if (!hn_xpnt_vf) { 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2320 } else { 2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2322 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2323 hn_xpnt_vf_enabled_sysctl, "I", 2324 "Transparent VF enabled"); 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2326 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2327 hn_xpnt_vf_accbpf_sysctl, "I", 2328 "Accurate BPF for transparent VF"); 2329 } 2330 2331 /* 2332 * Setup the ifmedia, which has been initialized earlier. 2333 */ 2334 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2335 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2336 /* XXX ifmedia_set really should do this for us */ 2337 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2338 2339 /* 2340 * Setup the ifnet for this interface. 2341 */ 2342 2343 ifp->if_baudrate = IF_Gbps(10); 2344 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2345 ifp->if_ioctl = hn_ioctl; 2346 ifp->if_init = hn_init; 2347 #ifdef HN_IFSTART_SUPPORT 2348 if (hn_use_if_start) { 2349 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2350 2351 ifp->if_start = hn_start; 2352 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2353 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2354 IFQ_SET_READY(&ifp->if_snd); 2355 } else 2356 #endif 2357 { 2358 ifp->if_transmit = hn_transmit; 2359 ifp->if_qflush = hn_xmit_qflush; 2360 } 2361 2362 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2363 #ifdef foo 2364 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2365 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2366 #endif 2367 if (sc->hn_caps & HN_CAP_VLAN) { 2368 /* XXX not sure about VLAN_MTU. 
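*
* (For reference: IFCAP_VLAN_MTU merely advertises that full-sized
* frames plus the 4-byte 802.1Q tag are accepted; whether the
* synthetic path really guarantees that is the open question here.)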
*/ 2369 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2370 } 2371 2372 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2373 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2374 ifp->if_capabilities |= IFCAP_TXCSUM; 2375 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2376 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2377 if (sc->hn_caps & HN_CAP_TSO4) { 2378 ifp->if_capabilities |= IFCAP_TSO4; 2379 ifp->if_hwassist |= CSUM_IP_TSO; 2380 } 2381 if (sc->hn_caps & HN_CAP_TSO6) { 2382 ifp->if_capabilities |= IFCAP_TSO6; 2383 ifp->if_hwassist |= CSUM_IP6_TSO; 2384 } 2385 2386 /* Enable all available capabilities by default. */ 2387 ifp->if_capenable = ifp->if_capabilities; 2388 2389 /* 2390 * Disable IPv6 TSO and TXCSUM by default, they still can 2391 * be enabled through SIOCSIFCAP. 2392 */ 2393 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2394 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2395 2396 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2397 /* 2398 * Lock hn_set_tso_maxsize() to simplify its 2399 * internal logic. 2400 */ 2401 HN_LOCK(sc); 2402 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2403 HN_UNLOCK(sc); 2404 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2405 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2406 } 2407 2408 ether_ifattach(ifp, eaddr); 2409 2410 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2411 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2412 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2413 } 2414 if (mtu < ETHERMTU) { 2415 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2416 ifp->if_mtu = mtu; 2417 } 2418 2419 /* Inform the upper layer about the long frame support. */ 2420 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2421 2422 /* 2423 * Kick off link status check. 2424 */ 2425 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2426 hn_update_link_status(sc); 2427 2428 if (!hn_xpnt_vf) { 2429 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2430 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2431 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2432 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2433 } else { 2434 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2435 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2436 } 2437 2438 /* 2439 * NOTE: 2440 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2441 * since interface's LLADDR is needed; interface LLADDR is not 2442 * available when ifnet_arrival event is triggered. 2443 */ 2444 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2445 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2446 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2447 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2448 2449 return (0); 2450 failed: 2451 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2452 hn_synth_detach(sc); 2453 hn_detach(dev); 2454 return (error); 2455 } 2456 2457 static int 2458 hn_detach(device_t dev) 2459 { 2460 struct hn_softc *sc = device_get_softc(dev); 2461 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2462 2463 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2464 /* 2465 * In case that the vmbus missed the orphan handler 2466 * installation. 
2467 */ 2468 vmbus_xact_ctx_orphan(sc->hn_xact); 2469 } 2470 2471 if (sc->hn_ifaddr_evthand != NULL) 2472 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2473 if (sc->hn_ifnet_evthand != NULL) 2474 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2475 if (sc->hn_ifnet_atthand != NULL) { 2476 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2477 sc->hn_ifnet_atthand); 2478 } 2479 if (sc->hn_ifnet_dethand != NULL) { 2480 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2481 sc->hn_ifnet_dethand); 2482 } 2483 if (sc->hn_ifnet_lnkhand != NULL) 2484 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2485 2486 vf_ifp = sc->hn_vf_ifp; 2487 __compiler_membar(); 2488 if (vf_ifp != NULL) 2489 hn_ifnet_detevent(sc, vf_ifp); 2490 2491 if (device_is_attached(dev)) { 2492 HN_LOCK(sc); 2493 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2494 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2495 hn_stop(sc, true); 2496 /* 2497 * NOTE: 2498 * hn_stop() only suspends data, so managment 2499 * stuffs have to be suspended manually here. 2500 */ 2501 hn_suspend_mgmt(sc); 2502 hn_synth_detach(sc); 2503 } 2504 HN_UNLOCK(sc); 2505 ether_ifdetach(ifp); 2506 } 2507 2508 ifmedia_removeall(&sc->hn_media); 2509 hn_destroy_rx_data(sc); 2510 hn_destroy_tx_data(sc); 2511 2512 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2513 int i; 2514 2515 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2516 taskqueue_free(sc->hn_tx_taskqs[i]); 2517 free(sc->hn_tx_taskqs, M_DEVBUF); 2518 } 2519 taskqueue_free(sc->hn_mgmt_taskq0); 2520 if (sc->hn_vf_taskq != NULL) 2521 taskqueue_free(sc->hn_vf_taskq); 2522 2523 if (sc->hn_xact != NULL) { 2524 /* 2525 * Uninstall the orphan handler _before_ the xact is 2526 * destructed. 2527 */ 2528 vmbus_chan_unset_orphan(sc->hn_prichan); 2529 vmbus_xact_ctx_destroy(sc->hn_xact); 2530 } 2531 2532 if_free(ifp); 2533 2534 HN_LOCK_DESTROY(sc); 2535 rm_destroy(&sc->hn_vf_lock); 2536 return (0); 2537 } 2538 2539 static int 2540 hn_shutdown(device_t dev) 2541 { 2542 2543 return (0); 2544 } 2545 2546 static void 2547 hn_link_status(struct hn_softc *sc) 2548 { 2549 uint32_t link_status; 2550 int error; 2551 2552 error = hn_rndis_get_linkstatus(sc, &link_status); 2553 if (error) { 2554 /* XXX what to do? */ 2555 return; 2556 } 2557 2558 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2559 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2560 else 2561 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2562 if_link_state_change(sc->hn_ifp, 2563 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2564 LINK_STATE_UP : LINK_STATE_DOWN); 2565 } 2566 2567 static void 2568 hn_link_taskfunc(void *xsc, int pending __unused) 2569 { 2570 struct hn_softc *sc = xsc; 2571 2572 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2573 return; 2574 hn_link_status(sc); 2575 } 2576 2577 static void 2578 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2579 { 2580 struct hn_softc *sc = xsc; 2581 2582 /* Prevent any link status checks from running. */ 2583 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2584 2585 /* 2586 * Fake up a [link down --> link up] state change; 5 seconds 2587 * delay is used, which closely simulates miibus reaction 2588 * upon link down event. 
2589 */ 2590 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2591 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2592 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2593 &sc->hn_netchg_status, 5 * hz); 2594 } 2595 2596 static void 2597 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2598 { 2599 struct hn_softc *sc = xsc; 2600 2601 /* Re-allow link status checks. */ 2602 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2603 hn_link_status(sc); 2604 } 2605 2606 static void 2607 hn_update_link_status(struct hn_softc *sc) 2608 { 2609 2610 if (sc->hn_mgmt_taskq != NULL) 2611 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2612 } 2613 2614 static void 2615 hn_change_network(struct hn_softc *sc) 2616 { 2617 2618 if (sc->hn_mgmt_taskq != NULL) 2619 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2620 } 2621 2622 static __inline int 2623 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2624 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2625 { 2626 struct mbuf *m = *m_head; 2627 int error; 2628 2629 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2630 2631 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2632 m, segs, nsegs, BUS_DMA_NOWAIT); 2633 if (error == EFBIG) { 2634 struct mbuf *m_new; 2635 2636 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2637 if (m_new == NULL) 2638 return ENOBUFS; 2639 else 2640 *m_head = m = m_new; 2641 txr->hn_tx_collapsed++; 2642 2643 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2644 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2645 } 2646 if (!error) { 2647 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2648 BUS_DMASYNC_PREWRITE); 2649 txd->flags |= HN_TXD_FLAG_DMAMAP; 2650 } 2651 return error; 2652 } 2653 2654 static __inline int 2655 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2656 { 2657 2658 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2659 ("put an onlist txd %#x", txd->flags)); 2660 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2661 ("put an onagg txd %#x", txd->flags)); 2662 2663 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2664 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2665 return 0; 2666 2667 if (!STAILQ_EMPTY(&txd->agg_list)) { 2668 struct hn_txdesc *tmp_txd; 2669 2670 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2671 int freed; 2672 2673 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2674 ("resursive aggregation on aggregated txdesc")); 2675 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2676 ("not aggregated txdesc")); 2677 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2678 ("aggregated txdesc uses dmamap")); 2679 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2680 ("aggregated txdesc consumes " 2681 "chimney sending buffer")); 2682 KASSERT(tmp_txd->chim_size == 0, 2683 ("aggregated txdesc has non-zero " 2684 "chimney sending size")); 2685 2686 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2687 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2688 freed = hn_txdesc_put(txr, tmp_txd); 2689 KASSERT(freed, ("failed to free aggregated txdesc")); 2690 } 2691 } 2692 2693 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2694 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2695 ("chim txd uses dmamap")); 2696 hn_chim_free(txr->hn_sc, txd->chim_index); 2697 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2698 txd->chim_size = 0; 2699 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2700 bus_dmamap_sync(txr->hn_tx_data_dtag, 2701 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2702 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2703 txd->data_dmap); 2704 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2705 } 2706 2707 if (txd->m != NULL) { 2708 m_freem(txd->m); 2709 txd->m = NULL; 2710 } 2711 2712 txd->flags |= HN_TXD_FLAG_ONLIST; 2713 #ifndef HN_USE_TXDESC_BUFRING 2714 mtx_lock_spin(&txr->hn_txlist_spin); 2715 KASSERT(txr->hn_txdesc_avail >= 0 && 2716 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2717 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2718 txr->hn_txdesc_avail++; 2719 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2720 mtx_unlock_spin(&txr->hn_txlist_spin); 2721 #else /* HN_USE_TXDESC_BUFRING */ 2722 #ifdef HN_DEBUG 2723 atomic_add_int(&txr->hn_txdesc_avail, 1); 2724 #endif 2725 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2726 #endif /* !HN_USE_TXDESC_BUFRING */ 2727 2728 return 1; 2729 } 2730 2731 static __inline struct hn_txdesc * 2732 hn_txdesc_get(struct hn_tx_ring *txr) 2733 { 2734 struct hn_txdesc *txd; 2735 2736 #ifndef HN_USE_TXDESC_BUFRING 2737 mtx_lock_spin(&txr->hn_txlist_spin); 2738 txd = SLIST_FIRST(&txr->hn_txlist); 2739 if (txd != NULL) { 2740 KASSERT(txr->hn_txdesc_avail > 0, 2741 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2742 txr->hn_txdesc_avail--; 2743 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2744 } 2745 mtx_unlock_spin(&txr->hn_txlist_spin); 2746 #else 2747 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2748 #endif 2749 2750 if (txd != NULL) { 2751 #ifdef HN_USE_TXDESC_BUFRING 2752 #ifdef HN_DEBUG 2753 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2754 #endif 2755 #endif /* HN_USE_TXDESC_BUFRING */ 2756 KASSERT(txd->m == NULL && txd->refs == 0 && 2757 STAILQ_EMPTY(&txd->agg_list) && 2758 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2759 txd->chim_size == 0 && 2760 (txd->flags & HN_TXD_FLAG_ONLIST) && 2761 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2762 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2763 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2764 txd->refs = 1; 2765 } 2766 return txd; 2767 } 2768 2769 static __inline void 2770 hn_txdesc_hold(struct hn_txdesc *txd) 2771 { 2772 2773 /* 0->1 transition will never work */ 2774 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2775 atomic_add_int(&txd->refs, 1); 2776 } 2777 2778 static __inline void 2779 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2780 { 2781 2782 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2783 ("recursive aggregation on aggregating txdesc")); 2784 2785 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2786 ("already aggregated")); 2787 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2788 ("recursive aggregation on to-be-aggregated txdesc")); 2789 2790 txd->flags |= HN_TXD_FLAG_ONAGG; 2791 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2792 } 2793 2794 static bool 2795 hn_tx_ring_pending(struct hn_tx_ring *txr) 2796 { 2797 bool pending = false; 2798 2799 #ifndef HN_USE_TXDESC_BUFRING 2800 mtx_lock_spin(&txr->hn_txlist_spin); 2801 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2802 pending = true; 2803 mtx_unlock_spin(&txr->hn_txlist_spin); 2804 #else 2805 if (!buf_ring_full(txr->hn_txdesc_br)) 2806 pending = true; 2807 #endif 2808 return (pending); 2809 } 2810 2811 static __inline void 2812 hn_txeof(struct hn_tx_ring *txr) 2813 { 2814 txr->hn_has_txeof = 0; 2815 txr->hn_txeof(txr); 2816 } 2817 2818 static void 2819 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2820 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2821 { 2822 struct hn_txdesc *txd = sndc->hn_cbarg; 2823 struct 
hn_tx_ring *txr; 2824 2825 txr = txd->txr; 2826 KASSERT(txr->hn_chan == chan, 2827 ("channel mismatch, on chan%u, should be chan%u", 2828 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2829 2830 txr->hn_has_txeof = 1; 2831 hn_txdesc_put(txr, txd); 2832 2833 ++txr->hn_txdone_cnt; 2834 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2835 txr->hn_txdone_cnt = 0; 2836 if (txr->hn_oactive) 2837 hn_txeof(txr); 2838 } 2839 } 2840 2841 static void 2842 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2843 { 2844 #if defined(INET) || defined(INET6) 2845 tcp_lro_flush_all(&rxr->hn_lro); 2846 #endif 2847 2848 /* 2849 * NOTE: 2850 * 'txr' could be NULL, if multiple channels and 2851 * ifnet.if_start method are enabled. 2852 */ 2853 if (txr == NULL || !txr->hn_has_txeof) 2854 return; 2855 2856 txr->hn_txdone_cnt = 0; 2857 hn_txeof(txr); 2858 } 2859 2860 static __inline uint32_t 2861 hn_rndis_pktmsg_offset(uint32_t ofs) 2862 { 2863 2864 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2865 ("invalid RNDIS packet msg offset %u", ofs)); 2866 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2867 } 2868 2869 static __inline void * 2870 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2871 size_t pi_dlen, uint32_t pi_type) 2872 { 2873 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2874 struct rndis_pktinfo *pi; 2875 2876 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2877 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2878 2879 /* 2880 * Per-packet-info does not move; it only grows. 2881 * 2882 * NOTE: 2883 * rm_pktinfooffset in this phase counts from the beginning 2884 * of rndis_packet_msg. 2885 */ 2886 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2887 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2888 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2889 pkt->rm_pktinfolen); 2890 pkt->rm_pktinfolen += pi_size; 2891 2892 pi->rm_size = pi_size; 2893 pi->rm_type = pi_type; 2894 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2895 2896 return (pi->rm_data); 2897 } 2898 2899 static __inline int 2900 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2901 { 2902 struct hn_txdesc *txd; 2903 struct mbuf *m; 2904 int error, pkts; 2905 2906 txd = txr->hn_agg_txd; 2907 KASSERT(txd != NULL, ("no aggregate txdesc")); 2908 2909 /* 2910 * Since hn_txpkt() will reset this temporary stat, save 2911 * it now, so that oerrors can be updated properly, if 2912 * hn_txpkt() ever fails. 2913 */ 2914 pkts = txr->hn_stat_pkts; 2915 2916 /* 2917 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2918 * failure, save it for later freeing, if hn_txpkt() ever 2919 * fails. 2920 */ 2921 m = txd->m; 2922 error = hn_txpkt(ifp, txr, txd); 2923 if (__predict_false(error)) { 2924 /* txd is freed, but m is not. */ 2925 m_freem(m); 2926 2927 txr->hn_flush_failed++; 2928 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2929 } 2930 2931 /* Reset all aggregation states. 
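*
* This happens even if hn_txpkt() failed above, so that the next
* hn_try_txagg() call starts from a clean slate.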
*/ 2932 txr->hn_agg_txd = NULL; 2933 txr->hn_agg_szleft = 0; 2934 txr->hn_agg_pktleft = 0; 2935 txr->hn_agg_prevpkt = NULL; 2936 2937 return (error); 2938 } 2939 2940 static void * 2941 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2942 int pktsize) 2943 { 2944 void *chim; 2945 2946 if (txr->hn_agg_txd != NULL) { 2947 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2948 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2949 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2950 int olen; 2951 2952 /* 2953 * Update the previous RNDIS packet's total length, 2954 * it can be increased due to the mandatory alignment 2955 * padding for this RNDIS packet. And update the 2956 * aggregating txdesc's chimney sending buffer size 2957 * accordingly. 2958 * 2959 * XXX 2960 * Zero-out the padding, as required by the RNDIS spec. 2961 */ 2962 olen = pkt->rm_len; 2963 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2964 agg_txd->chim_size += pkt->rm_len - olen; 2965 2966 /* Link this txdesc to the parent. */ 2967 hn_txdesc_agg(agg_txd, txd); 2968 2969 chim = (uint8_t *)pkt + pkt->rm_len; 2970 /* Save the current packet for later fixup. */ 2971 txr->hn_agg_prevpkt = chim; 2972 2973 txr->hn_agg_pktleft--; 2974 txr->hn_agg_szleft -= pktsize; 2975 if (txr->hn_agg_szleft <= 2976 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 2977 /* 2978 * Probably can't aggregate more packets, 2979 * flush this aggregating txdesc proactively. 2980 */ 2981 txr->hn_agg_pktleft = 0; 2982 } 2983 /* Done! */ 2984 return (chim); 2985 } 2986 hn_flush_txagg(ifp, txr); 2987 } 2988 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 2989 2990 txr->hn_tx_chimney_tried++; 2991 txd->chim_index = hn_chim_alloc(txr->hn_sc); 2992 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 2993 return (NULL); 2994 txr->hn_tx_chimney++; 2995 2996 chim = txr->hn_sc->hn_chim + 2997 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 2998 2999 if (txr->hn_agg_pktmax > 1 && 3000 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3001 txr->hn_agg_txd = txd; 3002 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3003 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3004 txr->hn_agg_prevpkt = chim; 3005 } 3006 return (chim); 3007 } 3008 3009 /* 3010 * NOTE: 3011 * If this function fails, then both txd and m_head0 will be freed. 
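*
* Overview: hn_encap() builds the RNDIS packet message and then picks
* one of two send paths.  If the packet fits into a chimney sending
* buffer it is copied there (possibly aggregated with earlier
* packets) and sent via hn_txpkt_chim(); otherwise the mbuf chain is
* DMA-mapped and described by a vmbus_gpa page list for
* hn_txpkt_sglist().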
3012 */ 3013 static int 3014 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3015 struct mbuf **m_head0) 3016 { 3017 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3018 int error, nsegs, i; 3019 struct mbuf *m_head = *m_head0; 3020 struct rndis_packet_msg *pkt; 3021 uint32_t *pi_data; 3022 void *chim = NULL; 3023 int pkt_hlen, pkt_size; 3024 3025 pkt = txd->rndis_pkt; 3026 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3027 if (pkt_size < txr->hn_chim_size) { 3028 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3029 if (chim != NULL) 3030 pkt = chim; 3031 } else { 3032 if (txr->hn_agg_txd != NULL) 3033 hn_flush_txagg(ifp, txr); 3034 } 3035 3036 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3037 pkt->rm_len = m_head->m_pkthdr.len; 3038 pkt->rm_dataoffset = 0; 3039 pkt->rm_datalen = m_head->m_pkthdr.len; 3040 pkt->rm_oobdataoffset = 0; 3041 pkt->rm_oobdatalen = 0; 3042 pkt->rm_oobdataelements = 0; 3043 pkt->rm_pktinfooffset = sizeof(*pkt); 3044 pkt->rm_pktinfolen = 0; 3045 pkt->rm_vchandle = 0; 3046 pkt->rm_reserved = 0; 3047 3048 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3049 /* 3050 * Set the hash value for this packet, so that the host could 3051 * dispatch the TX done event for this packet back to this TX 3052 * ring's channel. 3053 */ 3054 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3055 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3056 *pi_data = txr->hn_tx_idx; 3057 } 3058 3059 if (m_head->m_flags & M_VLANTAG) { 3060 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3061 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3062 *pi_data = NDIS_VLAN_INFO_MAKE( 3063 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3064 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3065 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3066 } 3067 3068 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3069 #if defined(INET6) || defined(INET) 3070 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3071 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3072 #ifdef INET 3073 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3074 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3075 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3076 m_head->m_pkthdr.tso_segsz); 3077 } 3078 #endif 3079 #if defined(INET6) && defined(INET) 3080 else 3081 #endif 3082 #ifdef INET6 3083 { 3084 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3085 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3086 m_head->m_pkthdr.tso_segsz); 3087 } 3088 #endif 3089 #endif /* INET6 || INET */ 3090 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3091 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3092 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3093 if (m_head->m_pkthdr.csum_flags & 3094 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3095 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3096 } else { 3097 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3098 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3099 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3100 } 3101 3102 if (m_head->m_pkthdr.csum_flags & 3103 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3104 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3105 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3106 } else if (m_head->m_pkthdr.csum_flags & 3107 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3108 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3109 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3110 } 3111 } 3112 3113 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3114 /* Fixup RNDIS packet message total length */ 3115 pkt->rm_len += pkt_hlen; 3116 /* Convert RNDIS packet message offsets */ 3117 pkt->rm_dataoffset = 
hn_rndis_pktmsg_offset(pkt_hlen); 3118 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3119 3120 /* 3121 * Fast path: Chimney sending. 3122 */ 3123 if (chim != NULL) { 3124 struct hn_txdesc *tgt_txd = txd; 3125 3126 if (txr->hn_agg_txd != NULL) { 3127 tgt_txd = txr->hn_agg_txd; 3128 #ifdef INVARIANTS 3129 *m_head0 = NULL; 3130 #endif 3131 } 3132 3133 KASSERT(pkt == chim, 3134 ("RNDIS pkt not in chimney sending buffer")); 3135 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3136 ("chimney sending buffer is not used")); 3137 tgt_txd->chim_size += pkt->rm_len; 3138 3139 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3140 ((uint8_t *)chim) + pkt_hlen); 3141 3142 txr->hn_gpa_cnt = 0; 3143 txr->hn_sendpkt = hn_txpkt_chim; 3144 goto done; 3145 } 3146 3147 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3148 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3149 ("chimney buffer is used")); 3150 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3151 3152 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3153 if (__predict_false(error)) { 3154 int freed; 3155 3156 /* 3157 * This mbuf is not linked w/ the txd yet, so free it now. 3158 */ 3159 m_freem(m_head); 3160 *m_head0 = NULL; 3161 3162 freed = hn_txdesc_put(txr, txd); 3163 KASSERT(freed != 0, 3164 ("fail to free txd upon txdma error")); 3165 3166 txr->hn_txdma_failed++; 3167 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3168 return error; 3169 } 3170 *m_head0 = m_head; 3171 3172 /* +1 RNDIS packet message */ 3173 txr->hn_gpa_cnt = nsegs + 1; 3174 3175 /* send packet with page buffer */ 3176 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3177 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3178 txr->hn_gpa[0].gpa_len = pkt_hlen; 3179 3180 /* 3181 * Fill the page buffers with mbuf info after the page 3182 * buffer for RNDIS packet message. 3183 */ 3184 for (i = 0; i < nsegs; ++i) { 3185 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3186 3187 gpa->gpa_page = atop(segs[i].ds_addr); 3188 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3189 gpa->gpa_len = segs[i].ds_len; 3190 } 3191 3192 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3193 txd->chim_size = 0; 3194 txr->hn_sendpkt = hn_txpkt_sglist; 3195 done: 3196 txd->m = m_head; 3197 3198 /* Set the completion routine */ 3199 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3200 3201 /* Update temporary stats for later use. */ 3202 txr->hn_stat_pkts++; 3203 txr->hn_stat_size += m_head->m_pkthdr.len; 3204 if (m_head->m_flags & M_MCAST) 3205 txr->hn_stat_mcasts++; 3206 3207 return 0; 3208 } 3209 3210 /* 3211 * NOTE: 3212 * If this function fails, then txd will be freed, but the mbuf 3213 * associated w/ the txd will _not_ be freed. 3214 */ 3215 static int 3216 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3217 { 3218 int error, send_failed = 0, has_bpf; 3219 3220 again: 3221 has_bpf = bpf_peers_present(ifp->if_bpf); 3222 if (has_bpf) { 3223 /* 3224 * Make sure that this txd and any aggregated txds are not 3225 * freed before ETHER_BPF_MTAP. 
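*
* The extra reference taken by hn_txdesc_hold() below is dropped by
* the hn_txdesc_put() call after the BPF taps; without it a fast TX
* completion could free txd->m while bpf is still looking at it.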
3226 */ 3227 hn_txdesc_hold(txd); 3228 } 3229 error = txr->hn_sendpkt(txr, txd); 3230 if (!error) { 3231 if (has_bpf) { 3232 const struct hn_txdesc *tmp_txd; 3233 3234 ETHER_BPF_MTAP(ifp, txd->m); 3235 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3236 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3237 } 3238 3239 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3240 #ifdef HN_IFSTART_SUPPORT 3241 if (!hn_use_if_start) 3242 #endif 3243 { 3244 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3245 txr->hn_stat_size); 3246 if (txr->hn_stat_mcasts != 0) { 3247 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3248 txr->hn_stat_mcasts); 3249 } 3250 } 3251 txr->hn_pkts += txr->hn_stat_pkts; 3252 txr->hn_sends++; 3253 } 3254 if (has_bpf) 3255 hn_txdesc_put(txr, txd); 3256 3257 if (__predict_false(error)) { 3258 int freed; 3259 3260 /* 3261 * This should "really rarely" happen. 3262 * 3263 * XXX Too many RX to be acked or too many sideband 3264 * commands to run? Ask netvsc_channel_rollup() 3265 * to kick start later. 3266 */ 3267 txr->hn_has_txeof = 1; 3268 if (!send_failed) { 3269 txr->hn_send_failed++; 3270 send_failed = 1; 3271 /* 3272 * Try sending again after set hn_has_txeof; 3273 * in case that we missed the last 3274 * netvsc_channel_rollup(). 3275 */ 3276 goto again; 3277 } 3278 if_printf(ifp, "send failed\n"); 3279 3280 /* 3281 * Caller will perform further processing on the 3282 * associated mbuf, so don't free it in hn_txdesc_put(); 3283 * only unload it from the DMA map in hn_txdesc_put(), 3284 * if it was loaded. 3285 */ 3286 txd->m = NULL; 3287 freed = hn_txdesc_put(txr, txd); 3288 KASSERT(freed != 0, 3289 ("fail to free txd upon send error")); 3290 3291 txr->hn_send_failed++; 3292 } 3293 3294 /* Reset temporary stats, after this sending is done. */ 3295 txr->hn_stat_size = 0; 3296 txr->hn_stat_pkts = 0; 3297 txr->hn_stat_mcasts = 0; 3298 3299 return (error); 3300 } 3301 3302 /* 3303 * Append the specified data to the indicated mbuf chain, 3304 * Extend the mbuf chain if the new data does not fit in 3305 * existing space. 3306 * 3307 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3308 * There should be an equivalent in the kernel mbuf code, 3309 * but there does not appear to be one yet. 3310 * 3311 * Differs from m_append() in that additional mbufs are 3312 * allocated with cluster size MJUMPAGESIZE, and filled 3313 * accordingly. 3314 * 3315 * Return 1 if able to complete the job; otherwise 0. 3316 */ 3317 static int 3318 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3319 { 3320 struct mbuf *m, *n; 3321 int remainder, space; 3322 3323 for (m = m0; m->m_next != NULL; m = m->m_next) 3324 ; 3325 remainder = len; 3326 space = M_TRAILINGSPACE(m); 3327 if (space > 0) { 3328 /* 3329 * Copy into available space. 3330 */ 3331 if (space > remainder) 3332 space = remainder; 3333 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3334 m->m_len += space; 3335 cp += space; 3336 remainder -= space; 3337 } 3338 while (remainder > 0) { 3339 /* 3340 * Allocate a new mbuf; could check space 3341 * and allocate a cluster instead. 
3342 */ 3343 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3344 if (n == NULL) 3345 break; 3346 n->m_len = min(MJUMPAGESIZE, remainder); 3347 bcopy(cp, mtod(n, caddr_t), n->m_len); 3348 cp += n->m_len; 3349 remainder -= n->m_len; 3350 m->m_next = n; 3351 m = n; 3352 } 3353 if (m0->m_flags & M_PKTHDR) 3354 m0->m_pkthdr.len += len - remainder; 3355 3356 return (remainder == 0); 3357 } 3358 3359 #if defined(INET) || defined(INET6) 3360 static __inline int 3361 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3362 { 3363 #if __FreeBSD_version >= 1100095 3364 if (hn_lro_mbufq_depth) { 3365 tcp_lro_queue_mbuf(lc, m); 3366 return 0; 3367 } 3368 #endif 3369 return tcp_lro_rx(lc, m, 0); 3370 } 3371 #endif 3372 3373 static int 3374 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, 3375 const struct hn_rxinfo *info) 3376 { 3377 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3378 struct mbuf *m_new; 3379 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3380 int hash_type = M_HASHTYPE_NONE; 3381 3382 ifp = hn_ifp; 3383 if (rxr->hn_rxvf_ifp != NULL) { 3384 /* 3385 * Non-transparent mode VF; pretend this packet is from 3386 * the VF. 3387 */ 3388 ifp = rxr->hn_rxvf_ifp; 3389 is_vf = 1; 3390 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3391 /* Transparent mode VF. */ 3392 is_vf = 1; 3393 } 3394 3395 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3396 /* 3397 * NOTE: 3398 * See the NOTE of hn_rndis_init_fixat(). This 3399 * function can be reached, immediately after the 3400 * RNDIS is initialized but before the ifnet is 3401 * setup on the hn_attach() path; drop the unexpected 3402 * packets. 3403 */ 3404 return (0); 3405 } 3406 3407 if (__predict_false(dlen < ETHER_HDR_LEN)) { 3408 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3409 return (0); 3410 } 3411 3412 if (dlen <= MHLEN) { 3413 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3414 if (m_new == NULL) { 3415 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3416 return (0); 3417 } 3418 memcpy(mtod(m_new, void *), data, dlen); 3419 m_new->m_pkthdr.len = m_new->m_len = dlen; 3420 rxr->hn_small_pkts++; 3421 } else { 3422 /* 3423 * Get an mbuf with a cluster. For packets 2K or less, 3424 * get a standard 2K cluster. For anything larger, get a 3425 * 4K cluster. Any buffers larger than 4K can cause problems 3426 * if looped around to the Hyper-V TX channel, so avoid them. 
3427 */ 3428 size = MCLBYTES; 3429 if (dlen > MCLBYTES) { 3430 /* 4096 */ 3431 size = MJUMPAGESIZE; 3432 } 3433 3434 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3435 if (m_new == NULL) { 3436 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3437 return (0); 3438 } 3439 3440 hv_m_append(m_new, dlen, data); 3441 } 3442 m_new->m_pkthdr.rcvif = ifp; 3443 3444 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3445 do_csum = 0; 3446 3447 /* receive side checksum offload */ 3448 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { 3449 /* IP csum offload */ 3450 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3451 m_new->m_pkthdr.csum_flags |= 3452 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3453 rxr->hn_csum_ip++; 3454 } 3455 3456 /* TCP/UDP csum offload */ 3457 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | 3458 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3459 m_new->m_pkthdr.csum_flags |= 3460 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3461 m_new->m_pkthdr.csum_data = 0xffff; 3462 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) 3463 rxr->hn_csum_tcp++; 3464 else 3465 rxr->hn_csum_udp++; 3466 } 3467 3468 /* 3469 * XXX 3470 * As of this write (Oct 28th, 2016), host side will turn 3471 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3472 * the do_lro setting here is actually _not_ accurate. We 3473 * depend on the RSS hash type check to reset do_lro. 3474 */ 3475 if ((info->csum_info & 3476 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3477 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3478 do_lro = 1; 3479 } else { 3480 const struct ether_header *eh; 3481 uint16_t etype; 3482 int hoff; 3483 3484 hoff = sizeof(*eh); 3485 /* Checked at the beginning of this function. */ 3486 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 3487 3488 eh = mtod(m_new, struct ether_header *); 3489 etype = ntohs(eh->ether_type); 3490 if (etype == ETHERTYPE_VLAN) { 3491 const struct ether_vlan_header *evl; 3492 3493 hoff = sizeof(*evl); 3494 if (m_new->m_len < hoff) 3495 goto skip; 3496 evl = mtod(m_new, struct ether_vlan_header *); 3497 etype = ntohs(evl->evl_proto); 3498 } 3499 3500 if (etype == ETHERTYPE_IP) { 3501 int pr; 3502 3503 pr = hn_check_iplen(m_new, hoff); 3504 if (pr == IPPROTO_TCP) { 3505 if (do_csum && 3506 (rxr->hn_trust_hcsum & 3507 HN_TRUST_HCSUM_TCP)) { 3508 rxr->hn_csum_trusted++; 3509 m_new->m_pkthdr.csum_flags |= 3510 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3511 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3512 m_new->m_pkthdr.csum_data = 0xffff; 3513 } 3514 do_lro = 1; 3515 } else if (pr == IPPROTO_UDP) { 3516 if (do_csum && 3517 (rxr->hn_trust_hcsum & 3518 HN_TRUST_HCSUM_UDP)) { 3519 rxr->hn_csum_trusted++; 3520 m_new->m_pkthdr.csum_flags |= 3521 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3522 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3523 m_new->m_pkthdr.csum_data = 0xffff; 3524 } 3525 } else if (pr != IPPROTO_DONE && do_csum && 3526 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3527 rxr->hn_csum_trusted++; 3528 m_new->m_pkthdr.csum_flags |= 3529 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3530 } 3531 } 3532 } 3533 skip: 3534 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { 3535 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3536 NDIS_VLAN_INFO_ID(info->vlan_info), 3537 NDIS_VLAN_INFO_PRI(info->vlan_info), 3538 NDIS_VLAN_INFO_CFI(info->vlan_info)); 3539 m_new->m_flags |= M_VLANTAG; 3540 } 3541 3542 /* 3543 * If VF is activated (tranparent/non-transparent mode does not 3544 * matter here). 
3545 * 3546 * - Disable LRO 3547 * 3548 * hn(4) will only receive broadcast packets, multicast packets, 3549 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3550 * packet types. 3551 * 3552 * For non-transparent, we definitely _cannot_ enable LRO at 3553 * all, since the LRO flush will use hn(4) as the receiving 3554 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3555 */ 3556 if (is_vf) 3557 do_lro = 0; 3558 3559 /* 3560 * If VF is activated (tranparent/non-transparent mode does not 3561 * matter here), do _not_ mess with unsupported hash types or 3562 * functions. 3563 */ 3564 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { 3565 rxr->hn_rss_pkts++; 3566 m_new->m_pkthdr.flowid = info->hash_value; 3567 if (!is_vf) 3568 hash_type = M_HASHTYPE_OPAQUE_HASH; 3569 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == 3570 NDIS_HASH_FUNCTION_TOEPLITZ) { 3571 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK & 3572 rxr->hn_mbuf_hash); 3573 3574 /* 3575 * NOTE: 3576 * do_lro is resetted, if the hash types are not TCP 3577 * related. See the comment in the above csum_flags 3578 * setup section. 3579 */ 3580 switch (type) { 3581 case NDIS_HASH_IPV4: 3582 hash_type = M_HASHTYPE_RSS_IPV4; 3583 do_lro = 0; 3584 break; 3585 3586 case NDIS_HASH_TCP_IPV4: 3587 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3588 break; 3589 3590 case NDIS_HASH_IPV6: 3591 hash_type = M_HASHTYPE_RSS_IPV6; 3592 do_lro = 0; 3593 break; 3594 3595 case NDIS_HASH_IPV6_EX: 3596 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3597 do_lro = 0; 3598 break; 3599 3600 case NDIS_HASH_TCP_IPV6: 3601 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3602 break; 3603 3604 case NDIS_HASH_TCP_IPV6_EX: 3605 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3606 break; 3607 } 3608 } 3609 } else if (!is_vf) { 3610 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3611 hash_type = M_HASHTYPE_OPAQUE; 3612 } 3613 M_HASHTYPE_SET(m_new, hash_type); 3614 3615 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3616 if (hn_ifp != ifp) { 3617 const struct ether_header *eh; 3618 3619 /* 3620 * Non-transparent mode VF is activated. 3621 */ 3622 3623 /* 3624 * Allow tapping on hn(4). 3625 */ 3626 ETHER_BPF_MTAP(hn_ifp, m_new); 3627 3628 /* 3629 * Update hn(4)'s stats. 3630 */ 3631 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3632 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3633 /* Checked at the beginning of this function. */ 3634 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3635 eh = mtod(m_new, struct ether_header *); 3636 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3637 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3638 } 3639 rxr->hn_pkts++; 3640 3641 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3642 #if defined(INET) || defined(INET6) 3643 struct lro_ctrl *lro = &rxr->hn_lro; 3644 3645 if (lro->lro_cnt) { 3646 rxr->hn_lro_tried++; 3647 if (hn_lro_rx(lro, m_new) == 0) { 3648 /* DONE! 
*/ 3649 return 0; 3650 } 3651 } 3652 #endif 3653 } 3654 ifp->if_input(ifp, m_new); 3655 3656 return (0); 3657 } 3658 3659 static int 3660 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3661 { 3662 struct hn_softc *sc = ifp->if_softc; 3663 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3664 struct ifnet *vf_ifp; 3665 int mask, error = 0; 3666 struct ifrsskey *ifrk; 3667 struct ifrsshash *ifrh; 3668 uint32_t mtu; 3669 3670 switch (cmd) { 3671 case SIOCSIFMTU: 3672 if (ifr->ifr_mtu > HN_MTU_MAX) { 3673 error = EINVAL; 3674 break; 3675 } 3676 3677 HN_LOCK(sc); 3678 3679 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3680 HN_UNLOCK(sc); 3681 break; 3682 } 3683 3684 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3685 /* Can't change MTU */ 3686 HN_UNLOCK(sc); 3687 error = EOPNOTSUPP; 3688 break; 3689 } 3690 3691 if (ifp->if_mtu == ifr->ifr_mtu) { 3692 HN_UNLOCK(sc); 3693 break; 3694 } 3695 3696 if (hn_xpnt_vf_isready(sc)) { 3697 vf_ifp = sc->hn_vf_ifp; 3698 ifr_vf = *ifr; 3699 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3700 sizeof(ifr_vf.ifr_name)); 3701 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3702 (caddr_t)&ifr_vf); 3703 if (error) { 3704 HN_UNLOCK(sc); 3705 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3706 vf_ifp->if_xname, ifr->ifr_mtu, error); 3707 break; 3708 } 3709 } 3710 3711 /* 3712 * Suspend this interface before the synthetic parts 3713 * are ripped out. 3714 */ 3715 hn_suspend(sc); 3716 3717 /* 3718 * Detach the synthetic parts, i.e. NVS and RNDIS. 3719 */ 3720 hn_synth_detach(sc); 3721 3722 /* 3723 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3724 * with the new MTU setting. 3725 */ 3726 error = hn_synth_attach(sc, ifr->ifr_mtu); 3727 if (error) { 3728 HN_UNLOCK(sc); 3729 break; 3730 } 3731 3732 error = hn_rndis_get_mtu(sc, &mtu); 3733 if (error) 3734 mtu = ifr->ifr_mtu; 3735 else if (bootverbose) 3736 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3737 3738 /* 3739 * Commit the requested MTU, after the synthetic parts 3740 * have been successfully attached. 3741 */ 3742 if (mtu >= ifr->ifr_mtu) { 3743 mtu = ifr->ifr_mtu; 3744 } else { 3745 if_printf(ifp, "fixup mtu %d -> %u\n", 3746 ifr->ifr_mtu, mtu); 3747 } 3748 ifp->if_mtu = mtu; 3749 3750 /* 3751 * Synthetic parts' reattach may change the chimney 3752 * sending size; update it. 3753 */ 3754 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3755 hn_set_chim_size(sc, sc->hn_chim_szmax); 3756 3757 /* 3758 * Make sure that various parameters based on MTU are 3759 * still valid, after the MTU change. 3760 */ 3761 hn_mtu_change_fixup(sc); 3762 3763 /* 3764 * All done! Resume the interface now. 3765 */ 3766 hn_resume(sc); 3767 3768 if ((sc->hn_flags & HN_FLAG_RXVF) || 3769 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3770 /* 3771 * Since we have reattached the NVS part, 3772 * change the datapath to VF again, in case 3773 * it was lost when the NVS was detached. 3774 */ 3775 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3776 } 3777 3778 HN_UNLOCK(sc); 3779 break; 3780 3781 case SIOCSIFFLAGS: 3782 HN_LOCK(sc); 3783 3784 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3785 HN_UNLOCK(sc); 3786 break; 3787 } 3788 3789 if (hn_xpnt_vf_isready(sc)) 3790 hn_xpnt_vf_saveifflags(sc); 3791 3792 if (ifp->if_flags & IFF_UP) { 3793 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3794 /* 3795 * Caller might hold a mutex, e.g. 3796 * bpf; use busy-wait for the RNDIS 3797 * reply.
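*
* HN_NO_SLEEPING()/HN_SLEEPING_OK() below bracket the filter update,
* so the RNDIS request/response exchange polls for completion instead
* of sleeping while that mutex may be held.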
3798 */ 3799 HN_NO_SLEEPING(sc); 3800 hn_rxfilter_config(sc); 3801 HN_SLEEPING_OK(sc); 3802 3803 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3804 error = hn_xpnt_vf_iocsetflags(sc); 3805 } else { 3806 hn_init_locked(sc); 3807 } 3808 } else { 3809 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3810 hn_stop(sc, false); 3811 } 3812 sc->hn_if_flags = ifp->if_flags; 3813 3814 HN_UNLOCK(sc); 3815 break; 3816 3817 case SIOCSIFCAP: 3818 HN_LOCK(sc); 3819 3820 if (hn_xpnt_vf_isready(sc)) { 3821 ifr_vf = *ifr; 3822 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3823 sizeof(ifr_vf.ifr_name)); 3824 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3825 HN_UNLOCK(sc); 3826 break; 3827 } 3828 3829 /* 3830 * Fix up requested capabilities w/ supported capabilities, 3831 * since the supported capabilities could have been changed. 3832 */ 3833 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3834 ifp->if_capenable; 3835 3836 if (mask & IFCAP_TXCSUM) { 3837 ifp->if_capenable ^= IFCAP_TXCSUM; 3838 if (ifp->if_capenable & IFCAP_TXCSUM) 3839 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3840 else 3841 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3842 } 3843 if (mask & IFCAP_TXCSUM_IPV6) { 3844 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3845 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3846 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3847 else 3848 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3849 } 3850 3851 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3852 if (mask & IFCAP_RXCSUM) 3853 ifp->if_capenable ^= IFCAP_RXCSUM; 3854 #ifdef foo 3855 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3856 if (mask & IFCAP_RXCSUM_IPV6) 3857 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3858 #endif 3859 3860 if (mask & IFCAP_LRO) 3861 ifp->if_capenable ^= IFCAP_LRO; 3862 3863 if (mask & IFCAP_TSO4) { 3864 ifp->if_capenable ^= IFCAP_TSO4; 3865 if (ifp->if_capenable & IFCAP_TSO4) 3866 ifp->if_hwassist |= CSUM_IP_TSO; 3867 else 3868 ifp->if_hwassist &= ~CSUM_IP_TSO; 3869 } 3870 if (mask & IFCAP_TSO6) { 3871 ifp->if_capenable ^= IFCAP_TSO6; 3872 if (ifp->if_capenable & IFCAP_TSO6) 3873 ifp->if_hwassist |= CSUM_IP6_TSO; 3874 else 3875 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3876 } 3877 3878 HN_UNLOCK(sc); 3879 break; 3880 3881 case SIOCADDMULTI: 3882 case SIOCDELMULTI: 3883 HN_LOCK(sc); 3884 3885 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3886 HN_UNLOCK(sc); 3887 break; 3888 } 3889 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3890 /* 3891 * Multicast uses mutex; use busy-wait for 3892 * the RNDIS reply. 3893 */ 3894 HN_NO_SLEEPING(sc); 3895 hn_rxfilter_config(sc); 3896 HN_SLEEPING_OK(sc); 3897 } 3898 3899 /* XXX vlan(4) style mcast addr maintenance */ 3900 if (hn_xpnt_vf_isready(sc)) { 3901 int old_if_flags; 3902 3903 old_if_flags = sc->hn_vf_ifp->if_flags; 3904 hn_xpnt_vf_saveifflags(sc); 3905 3906 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3907 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3908 IFF_ALLMULTI)) 3909 error = hn_xpnt_vf_iocsetflags(sc); 3910 } 3911 3912 HN_UNLOCK(sc); 3913 break; 3914 3915 case SIOCSIFMEDIA: 3916 case SIOCGIFMEDIA: 3917 HN_LOCK(sc); 3918 if (hn_xpnt_vf_isready(sc)) { 3919 /* 3920 * SIOCGIFMEDIA expects ifmediareq, so don't 3921 * create and pass ifr_vf to the VF here; just 3922 * replace the ifr_name. 3923 */ 3924 vf_ifp = sc->hn_vf_ifp; 3925 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 3926 sizeof(ifr->ifr_name)); 3927 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 3928 /* Restore the ifr_name. 
*/ 3929 strlcpy(ifr->ifr_name, ifp->if_xname, 3930 sizeof(ifr->ifr_name)); 3931 HN_UNLOCK(sc); 3932 break; 3933 } 3934 HN_UNLOCK(sc); 3935 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 3936 break; 3937 3938 case SIOCGIFRSSHASH: 3939 ifrh = (struct ifrsshash *)data; 3940 HN_LOCK(sc); 3941 if (sc->hn_rx_ring_inuse == 1) { 3942 HN_UNLOCK(sc); 3943 ifrh->ifrh_func = RSS_FUNC_NONE; 3944 ifrh->ifrh_types = 0; 3945 break; 3946 } 3947 3948 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3949 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 3950 else 3951 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 3952 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 3953 HN_UNLOCK(sc); 3954 break; 3955 3956 case SIOCGIFRSSKEY: 3957 ifrk = (struct ifrsskey *)data; 3958 HN_LOCK(sc); 3959 if (sc->hn_rx_ring_inuse == 1) { 3960 HN_UNLOCK(sc); 3961 ifrk->ifrk_func = RSS_FUNC_NONE; 3962 ifrk->ifrk_keylen = 0; 3963 break; 3964 } 3965 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3966 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 3967 else 3968 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 3969 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 3970 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 3971 NDIS_HASH_KEYSIZE_TOEPLITZ); 3972 HN_UNLOCK(sc); 3973 break; 3974 3975 default: 3976 error = ether_ioctl(ifp, cmd, data); 3977 break; 3978 } 3979 return (error); 3980 } 3981 3982 static void 3983 hn_stop(struct hn_softc *sc, bool detaching) 3984 { 3985 struct ifnet *ifp = sc->hn_ifp; 3986 int i; 3987 3988 HN_LOCK_ASSERT(sc); 3989 3990 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 3991 ("synthetic parts were not attached")); 3992 3993 /* Clear RUNNING bit ASAP. */ 3994 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 3995 3996 /* Disable polling. */ 3997 hn_polling(sc, 0); 3998 3999 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4000 KASSERT(sc->hn_vf_ifp != NULL, 4001 ("%s: VF is not attached", ifp->if_xname)); 4002 4003 /* Mark transparent mode VF as disabled. */ 4004 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4005 4006 /* 4007 * NOTE: 4008 * Datapath setting must happen _before_ bringing 4009 * the VF down. 4010 */ 4011 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4012 4013 /* 4014 * Bring the VF down. 4015 */ 4016 hn_xpnt_vf_saveifflags(sc); 4017 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4018 hn_xpnt_vf_iocsetflags(sc); 4019 } 4020 4021 /* Suspend data transfers. */ 4022 hn_suspend_data(sc); 4023 4024 /* Clear OACTIVE bit. */ 4025 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4026 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4027 sc->hn_tx_ring[i].hn_oactive = 0; 4028 4029 /* 4030 * If the non-transparent mode VF is active, make sure 4031 * that the RX filter still allows packet reception. 4032 */ 4033 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4034 hn_rxfilter_config(sc); 4035 } 4036 4037 static void 4038 hn_init_locked(struct hn_softc *sc) 4039 { 4040 struct ifnet *ifp = sc->hn_ifp; 4041 int i; 4042 4043 HN_LOCK_ASSERT(sc); 4044 4045 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4046 return; 4047 4048 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4049 return; 4050 4051 /* Configure RX filter */ 4052 hn_rxfilter_config(sc); 4053 4054 /* Clear OACTIVE bit. */ 4055 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4056 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4057 sc->hn_tx_ring[i].hn_oactive = 0; 4058 4059 /* Clear TX 'suspended' bit. */ 4060 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4061 4062 if (hn_xpnt_vf_isready(sc)) { 4063 /* Initialize transparent VF. 
*/ 4064 hn_xpnt_vf_init(sc); 4065 } 4066 4067 /* Everything is ready; unleash! */ 4068 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4069 4070 /* Re-enable polling if requested. */ 4071 if (sc->hn_pollhz > 0) 4072 hn_polling(sc, sc->hn_pollhz); 4073 } 4074 4075 static void 4076 hn_init(void *xsc) 4077 { 4078 struct hn_softc *sc = xsc; 4079 4080 HN_LOCK(sc); 4081 hn_init_locked(sc); 4082 HN_UNLOCK(sc); 4083 } 4084 4085 #if __FreeBSD_version >= 1100099 4086 4087 static int 4088 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4089 { 4090 struct hn_softc *sc = arg1; 4091 unsigned int lenlim; 4092 int error; 4093 4094 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4095 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4096 if (error || req->newptr == NULL) 4097 return error; 4098 4099 HN_LOCK(sc); 4100 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4101 lenlim > TCP_LRO_LENGTH_MAX) { 4102 HN_UNLOCK(sc); 4103 return EINVAL; 4104 } 4105 hn_set_lro_lenlim(sc, lenlim); 4106 HN_UNLOCK(sc); 4107 4108 return 0; 4109 } 4110 4111 static int 4112 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4113 { 4114 struct hn_softc *sc = arg1; 4115 int ackcnt, error, i; 4116 4117 /* 4118 * lro_ackcnt_lim is append count limit, 4119 * +1 to turn it into aggregation limit. 4120 */ 4121 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4122 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4123 if (error || req->newptr == NULL) 4124 return error; 4125 4126 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4127 return EINVAL; 4128 4129 /* 4130 * Convert aggregation limit back to append 4131 * count limit. 4132 */ 4133 --ackcnt; 4134 HN_LOCK(sc); 4135 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4136 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4137 HN_UNLOCK(sc); 4138 return 0; 4139 } 4140 4141 #endif 4142 4143 static int 4144 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4145 { 4146 struct hn_softc *sc = arg1; 4147 int hcsum = arg2; 4148 int on, error, i; 4149 4150 on = 0; 4151 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4152 on = 1; 4153 4154 error = sysctl_handle_int(oidp, &on, 0, req); 4155 if (error || req->newptr == NULL) 4156 return error; 4157 4158 HN_LOCK(sc); 4159 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4160 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4161 4162 if (on) 4163 rxr->hn_trust_hcsum |= hcsum; 4164 else 4165 rxr->hn_trust_hcsum &= ~hcsum; 4166 } 4167 HN_UNLOCK(sc); 4168 return 0; 4169 } 4170 4171 static int 4172 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4173 { 4174 struct hn_softc *sc = arg1; 4175 int chim_size, error; 4176 4177 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4178 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4179 if (error || req->newptr == NULL) 4180 return error; 4181 4182 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4183 return EINVAL; 4184 4185 HN_LOCK(sc); 4186 hn_set_chim_size(sc, chim_size); 4187 HN_UNLOCK(sc); 4188 return 0; 4189 } 4190 4191 #if __FreeBSD_version < 1100095 4192 static int 4193 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4194 { 4195 struct hn_softc *sc = arg1; 4196 int ofs = arg2, i, error; 4197 struct hn_rx_ring *rxr; 4198 uint64_t stat; 4199 4200 stat = 0; 4201 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4202 rxr = &sc->hn_rx_ring[i]; 4203 stat += *((int *)((uint8_t *)rxr + ofs)); 4204 } 4205 4206 error = sysctl_handle_64(oidp, &stat, 0, req); 4207 if (error || req->newptr == NULL) 4208 return error; 4209 4210 /* Zero out this stat. 
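 * This only happens on a sysctl write (req->newptr != NULL),
 * e.g. "sysctl dev.hn.0.lro_queued=0" (the unit number is
 * illustrative); a plain read just returns the sum.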
*/ 4211 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4212 rxr = &sc->hn_rx_ring[i]; 4213 *((int *)((uint8_t *)rxr + ofs)) = 0; 4214 } 4215 return 0; 4216 } 4217 #else 4218 static int 4219 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4220 { 4221 struct hn_softc *sc = arg1; 4222 int ofs = arg2, i, error; 4223 struct hn_rx_ring *rxr; 4224 uint64_t stat; 4225 4226 stat = 0; 4227 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4228 rxr = &sc->hn_rx_ring[i]; 4229 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4230 } 4231 4232 error = sysctl_handle_64(oidp, &stat, 0, req); 4233 if (error || req->newptr == NULL) 4234 return error; 4235 4236 /* Zero out this stat. */ 4237 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4238 rxr = &sc->hn_rx_ring[i]; 4239 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4240 } 4241 return 0; 4242 } 4243 4244 #endif 4245 4246 static int 4247 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4248 { 4249 struct hn_softc *sc = arg1; 4250 int ofs = arg2, i, error; 4251 struct hn_rx_ring *rxr; 4252 u_long stat; 4253 4254 stat = 0; 4255 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4256 rxr = &sc->hn_rx_ring[i]; 4257 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4258 } 4259 4260 error = sysctl_handle_long(oidp, &stat, 0, req); 4261 if (error || req->newptr == NULL) 4262 return error; 4263 4264 /* Zero out this stat. */ 4265 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4266 rxr = &sc->hn_rx_ring[i]; 4267 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4268 } 4269 return 0; 4270 } 4271 4272 static int 4273 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4274 { 4275 struct hn_softc *sc = arg1; 4276 int ofs = arg2, i, error; 4277 struct hn_tx_ring *txr; 4278 u_long stat; 4279 4280 stat = 0; 4281 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4282 txr = &sc->hn_tx_ring[i]; 4283 stat += *((u_long *)((uint8_t *)txr + ofs)); 4284 } 4285 4286 error = sysctl_handle_long(oidp, &stat, 0, req); 4287 if (error || req->newptr == NULL) 4288 return error; 4289 4290 /* Zero out this stat. 
*/ 4291 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4292 txr = &sc->hn_tx_ring[i]; 4293 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4294 } 4295 return 0; 4296 } 4297 4298 static int 4299 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4300 { 4301 struct hn_softc *sc = arg1; 4302 int ofs = arg2, i, error, conf; 4303 struct hn_tx_ring *txr; 4304 4305 txr = &sc->hn_tx_ring[0]; 4306 conf = *((int *)((uint8_t *)txr + ofs)); 4307 4308 error = sysctl_handle_int(oidp, &conf, 0, req); 4309 if (error || req->newptr == NULL) 4310 return error; 4311 4312 HN_LOCK(sc); 4313 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4314 txr = &sc->hn_tx_ring[i]; 4315 *((int *)((uint8_t *)txr + ofs)) = conf; 4316 } 4317 HN_UNLOCK(sc); 4318 4319 return 0; 4320 } 4321 4322 static int 4323 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4324 { 4325 struct hn_softc *sc = arg1; 4326 int error, size; 4327 4328 size = sc->hn_agg_size; 4329 error = sysctl_handle_int(oidp, &size, 0, req); 4330 if (error || req->newptr == NULL) 4331 return (error); 4332 4333 HN_LOCK(sc); 4334 sc->hn_agg_size = size; 4335 hn_set_txagg(sc); 4336 HN_UNLOCK(sc); 4337 4338 return (0); 4339 } 4340 4341 static int 4342 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4343 { 4344 struct hn_softc *sc = arg1; 4345 int error, pkts; 4346 4347 pkts = sc->hn_agg_pkts; 4348 error = sysctl_handle_int(oidp, &pkts, 0, req); 4349 if (error || req->newptr == NULL) 4350 return (error); 4351 4352 HN_LOCK(sc); 4353 sc->hn_agg_pkts = pkts; 4354 hn_set_txagg(sc); 4355 HN_UNLOCK(sc); 4356 4357 return (0); 4358 } 4359 4360 static int 4361 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4362 { 4363 struct hn_softc *sc = arg1; 4364 int pkts; 4365 4366 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4367 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4368 } 4369 4370 static int 4371 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4372 { 4373 struct hn_softc *sc = arg1; 4374 int align; 4375 4376 align = sc->hn_tx_ring[0].hn_agg_align; 4377 return (sysctl_handle_int(oidp, &align, 0, req)); 4378 } 4379 4380 static void 4381 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4382 { 4383 if (pollhz == 0) 4384 vmbus_chan_poll_disable(chan); 4385 else 4386 vmbus_chan_poll_enable(chan, pollhz); 4387 } 4388 4389 static void 4390 hn_polling(struct hn_softc *sc, u_int pollhz) 4391 { 4392 int nsubch = sc->hn_rx_ring_inuse - 1; 4393 4394 HN_LOCK_ASSERT(sc); 4395 4396 if (nsubch > 0) { 4397 struct vmbus_channel **subch; 4398 int i; 4399 4400 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4401 for (i = 0; i < nsubch; ++i) 4402 hn_chan_polling(subch[i], pollhz); 4403 vmbus_subchan_rel(subch, nsubch); 4404 } 4405 hn_chan_polling(sc->hn_prichan, pollhz); 4406 } 4407 4408 static int 4409 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4410 { 4411 struct hn_softc *sc = arg1; 4412 int pollhz, error; 4413 4414 pollhz = sc->hn_pollhz; 4415 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4416 if (error || req->newptr == NULL) 4417 return (error); 4418 4419 if (pollhz != 0 && 4420 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4421 return (EINVAL); 4422 4423 HN_LOCK(sc); 4424 if (sc->hn_pollhz != pollhz) { 4425 sc->hn_pollhz = pollhz; 4426 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4427 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4428 hn_polling(sc, sc->hn_pollhz); 4429 } 4430 HN_UNLOCK(sc); 4431 4432 return (0); 4433 } 4434 4435 static int 4436 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4437 { 4438 struct hn_softc *sc = arg1; 4439 char verstr[16]; 4440 4441 snprintf(verstr, sizeof(verstr), "%u.%u", 4442 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4443 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4444 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4445 } 4446 4447 static int 4448 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4449 { 4450 struct hn_softc *sc = arg1; 4451 char caps_str[128]; 4452 uint32_t caps; 4453 4454 HN_LOCK(sc); 4455 caps = sc->hn_caps; 4456 HN_UNLOCK(sc); 4457 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4458 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4459 } 4460 4461 static int 4462 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4463 { 4464 struct hn_softc *sc = arg1; 4465 char assist_str[128]; 4466 uint32_t hwassist; 4467 4468 HN_LOCK(sc); 4469 hwassist = sc->hn_ifp->if_hwassist; 4470 HN_UNLOCK(sc); 4471 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4472 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4473 } 4474 4475 static int 4476 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4477 { 4478 struct hn_softc *sc = arg1; 4479 char filter_str[128]; 4480 uint32_t filter; 4481 4482 HN_LOCK(sc); 4483 filter = sc->hn_rx_filter; 4484 HN_UNLOCK(sc); 4485 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4486 NDIS_PACKET_TYPES); 4487 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4488 } 4489 4490 #ifndef RSS 4491 4492 static int 4493 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4494 { 4495 struct hn_softc *sc = arg1; 4496 int error; 4497 4498 HN_LOCK(sc); 4499 4500 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4501 if (error || req->newptr == NULL) 4502 goto back; 4503 4504 if ((sc->hn_flags & HN_FLAG_RXVF) || 4505 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4506 /* 4507 * RSS key is synchronized w/ VF's, don't allow users 4508 * to change it. 4509 */ 4510 error = EBUSY; 4511 goto back; 4512 } 4513 4514 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4515 if (error) 4516 goto back; 4517 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4518 4519 if (sc->hn_rx_ring_inuse > 1) { 4520 error = hn_rss_reconfig(sc); 4521 } else { 4522 /* Not RSS capable, at least for now; just save the RSS key. */ 4523 error = 0; 4524 } 4525 back: 4526 HN_UNLOCK(sc); 4527 return (error); 4528 } 4529 4530 static int 4531 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4532 { 4533 struct hn_softc *sc = arg1; 4534 int error; 4535 4536 HN_LOCK(sc); 4537 4538 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4539 if (error || req->newptr == NULL) 4540 goto back; 4541 4542 /* 4543 * Don't allow RSS indirect table change, if this interface is not 4544 * RSS capable currently. 
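 * A single in-use RX ring (checked below) means RSS is
 * effectively off, so there would be no consumer for a new
 * indirection table.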
4545 */ 4546 if (sc->hn_rx_ring_inuse == 1) { 4547 error = EOPNOTSUPP; 4548 goto back; 4549 } 4550 4551 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4552 if (error) 4553 goto back; 4554 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4555 4556 hn_rss_ind_fixup(sc); 4557 error = hn_rss_reconfig(sc); 4558 back: 4559 HN_UNLOCK(sc); 4560 return (error); 4561 } 4562 4563 #endif /* !RSS */ 4564 4565 static int 4566 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4567 { 4568 struct hn_softc *sc = arg1; 4569 char hash_str[128]; 4570 uint32_t hash; 4571 4572 HN_LOCK(sc); 4573 hash = sc->hn_rss_hash; 4574 HN_UNLOCK(sc); 4575 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4576 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4577 } 4578 4579 static int 4580 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4581 { 4582 struct hn_softc *sc = arg1; 4583 char hash_str[128]; 4584 uint32_t hash; 4585 4586 HN_LOCK(sc); 4587 hash = sc->hn_rss_hcap; 4588 HN_UNLOCK(sc); 4589 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4590 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4591 } 4592 4593 static int 4594 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4595 { 4596 struct hn_softc *sc = arg1; 4597 char hash_str[128]; 4598 uint32_t hash; 4599 4600 HN_LOCK(sc); 4601 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4602 HN_UNLOCK(sc); 4603 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4604 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4605 } 4606 4607 static int 4608 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4609 { 4610 struct hn_softc *sc = arg1; 4611 char vf_name[IFNAMSIZ + 1]; 4612 struct ifnet *vf_ifp; 4613 4614 HN_LOCK(sc); 4615 vf_name[0] = '\0'; 4616 vf_ifp = sc->hn_vf_ifp; 4617 if (vf_ifp != NULL) 4618 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4619 HN_UNLOCK(sc); 4620 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4621 } 4622 4623 static int 4624 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4625 { 4626 struct hn_softc *sc = arg1; 4627 char vf_name[IFNAMSIZ + 1]; 4628 struct ifnet *vf_ifp; 4629 4630 HN_LOCK(sc); 4631 vf_name[0] = '\0'; 4632 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4633 if (vf_ifp != NULL) 4634 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4635 HN_UNLOCK(sc); 4636 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4637 } 4638 4639 static int 4640 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4641 { 4642 struct rm_priotracker pt; 4643 struct sbuf *sb; 4644 int error, i; 4645 bool first; 4646 4647 error = sysctl_wire_old_buffer(req, 0); 4648 if (error != 0) 4649 return (error); 4650 4651 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4652 if (sb == NULL) 4653 return (ENOMEM); 4654 4655 rm_rlock(&hn_vfmap_lock, &pt); 4656 4657 first = true; 4658 for (i = 0; i < hn_vfmap_size; ++i) { 4659 struct ifnet *ifp; 4660 4661 if (hn_vfmap[i] == NULL) 4662 continue; 4663 4664 ifp = ifnet_byindex(i); 4665 if (ifp != NULL) { 4666 if (first) 4667 sbuf_printf(sb, "%s", ifp->if_xname); 4668 else 4669 sbuf_printf(sb, " %s", ifp->if_xname); 4670 first = false; 4671 } 4672 } 4673 4674 rm_runlock(&hn_vfmap_lock, &pt); 4675 4676 error = sbuf_finish(sb); 4677 sbuf_delete(sb); 4678 return (error); 4679 } 4680 4681 static int 4682 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4683 { 4684 struct rm_priotracker pt; 4685 struct sbuf *sb; 4686 int error, i; 4687 bool first; 4688 4689 error = sysctl_wire_old_buffer(req, 0); 4690 if (error != 0) 4691 return (error); 4692 4693 sb = 
sbuf_new_for_sysctl(NULL, NULL, 128, req); 4694 if (sb == NULL) 4695 return (ENOMEM); 4696 4697 rm_rlock(&hn_vfmap_lock, &pt); 4698 4699 first = true; 4700 for (i = 0; i < hn_vfmap_size; ++i) { 4701 struct ifnet *ifp, *hn_ifp; 4702 4703 hn_ifp = hn_vfmap[i]; 4704 if (hn_ifp == NULL) 4705 continue; 4706 4707 ifp = ifnet_byindex(i); 4708 if (ifp != NULL) { 4709 if (first) { 4710 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4711 hn_ifp->if_xname); 4712 } else { 4713 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4714 hn_ifp->if_xname); 4715 } 4716 first = false; 4717 } 4718 } 4719 4720 rm_runlock(&hn_vfmap_lock, &pt); 4721 4722 error = sbuf_finish(sb); 4723 sbuf_delete(sb); 4724 return (error); 4725 } 4726 4727 static int 4728 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4729 { 4730 struct hn_softc *sc = arg1; 4731 int error, onoff = 0; 4732 4733 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4734 onoff = 1; 4735 error = sysctl_handle_int(oidp, &onoff, 0, req); 4736 if (error || req->newptr == NULL) 4737 return (error); 4738 4739 HN_LOCK(sc); 4740 /* NOTE: hn_vf_lock for hn_transmit() */ 4741 rm_wlock(&sc->hn_vf_lock); 4742 if (onoff) 4743 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4744 else 4745 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4746 rm_wunlock(&sc->hn_vf_lock); 4747 HN_UNLOCK(sc); 4748 4749 return (0); 4750 } 4751 4752 static int 4753 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4754 { 4755 struct hn_softc *sc = arg1; 4756 int enabled = 0; 4757 4758 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4759 enabled = 1; 4760 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4761 } 4762 4763 static int 4764 hn_check_iplen(const struct mbuf *m, int hoff) 4765 { 4766 const struct ip *ip; 4767 int len, iphlen, iplen; 4768 const struct tcphdr *th; 4769 int thoff; /* TCP data offset */ 4770 4771 len = hoff + sizeof(struct ip); 4772 4773 /* The packet must be at least the size of an IP header. */ 4774 if (m->m_pkthdr.len < len) 4775 return IPPROTO_DONE; 4776 4777 /* The fixed IP header must reside completely in the first mbuf. */ 4778 if (m->m_len < len) 4779 return IPPROTO_DONE; 4780 4781 ip = mtodo(m, hoff); 4782 4783 /* Bound check the packet's stated IP header length. */ 4784 iphlen = ip->ip_hl << 2; 4785 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4786 return IPPROTO_DONE; 4787 4788 /* The full IP header must reside completely in the one mbuf. */ 4789 if (m->m_len < hoff + iphlen) 4790 return IPPROTO_DONE; 4791 4792 iplen = ntohs(ip->ip_len); 4793 4794 /* 4795 * Check that the amount of data in the buffers is as 4796 * at least much as the IP header would have us expect. 4797 */ 4798 if (m->m_pkthdr.len < hoff + iplen) 4799 return IPPROTO_DONE; 4800 4801 /* 4802 * Ignore IP fragments. 4803 */ 4804 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4805 return IPPROTO_DONE; 4806 4807 /* 4808 * The TCP/IP or UDP/IP header must be entirely contained within 4809 * the first fragment of a packet. 
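 * Each check below bails out with IPPROTO_DONE; only a packet
 * that passes them all has its protocol number (ip->ip_p)
 * returned to the caller.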
4810 */ 4811 switch (ip->ip_p) { 4812 case IPPROTO_TCP: 4813 if (iplen < iphlen + sizeof(struct tcphdr)) 4814 return IPPROTO_DONE; 4815 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4816 return IPPROTO_DONE; 4817 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4818 thoff = th->th_off << 2; 4819 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4820 return IPPROTO_DONE; 4821 if (m->m_len < hoff + iphlen + thoff) 4822 return IPPROTO_DONE; 4823 break; 4824 case IPPROTO_UDP: 4825 if (iplen < iphlen + sizeof(struct udphdr)) 4826 return IPPROTO_DONE; 4827 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4828 return IPPROTO_DONE; 4829 break; 4830 default: 4831 if (iplen < iphlen) 4832 return IPPROTO_DONE; 4833 break; 4834 } 4835 return ip->ip_p; 4836 } 4837 4838 static int 4839 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4840 { 4841 struct sysctl_oid_list *child; 4842 struct sysctl_ctx_list *ctx; 4843 device_t dev = sc->hn_dev; 4844 #if defined(INET) || defined(INET6) 4845 #if __FreeBSD_version >= 1100095 4846 int lroent_cnt; 4847 #endif 4848 #endif 4849 int i; 4850 4851 /* 4852 * Create RXBUF for reception. 4853 * 4854 * NOTE: 4855 * - It is shared by all channels. 4856 * - A large enough buffer is allocated, certain version of NVSes 4857 * may further limit the usable space. 4858 */ 4859 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4860 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4861 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4862 if (sc->hn_rxbuf == NULL) { 4863 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4864 return (ENOMEM); 4865 } 4866 4867 sc->hn_rx_ring_cnt = ring_cnt; 4868 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4869 4870 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4871 M_DEVBUF, M_WAITOK | M_ZERO); 4872 4873 #if defined(INET) || defined(INET6) 4874 #if __FreeBSD_version >= 1100095 4875 lroent_cnt = hn_lro_entry_count; 4876 if (lroent_cnt < TCP_LRO_ENTRIES) 4877 lroent_cnt = TCP_LRO_ENTRIES; 4878 if (bootverbose) 4879 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4880 #endif 4881 #endif /* INET || INET6 */ 4882 4883 ctx = device_get_sysctl_ctx(dev); 4884 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4885 4886 /* Create dev.hn.UNIT.rx sysctl tree */ 4887 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4888 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4889 4890 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4891 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4892 4893 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4894 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4895 &rxr->hn_br_dma, BUS_DMA_WAITOK); 4896 if (rxr->hn_br == NULL) { 4897 device_printf(dev, "allocate bufring failed\n"); 4898 return (ENOMEM); 4899 } 4900 4901 if (hn_trust_hosttcp) 4902 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4903 if (hn_trust_hostudp) 4904 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4905 if (hn_trust_hostip) 4906 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4907 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4908 rxr->hn_ifp = sc->hn_ifp; 4909 if (i < sc->hn_tx_ring_cnt) 4910 rxr->hn_txr = &sc->hn_tx_ring[i]; 4911 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4912 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4913 rxr->hn_rx_idx = i; 4914 rxr->hn_rxbuf = sc->hn_rxbuf; 4915 4916 /* 4917 * Initialize LRO. 
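 * Kernels older than 1100095 lack tcp_lro_init_args(), so the
 * entry count and mbuf queue depth tunables only take effect
 * on the newer path below.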
4918 */ 4919 #if defined(INET) || defined(INET6) 4920 #if __FreeBSD_version >= 1100095 4921 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 4922 hn_lro_mbufq_depth); 4923 #else 4924 tcp_lro_init(&rxr->hn_lro); 4925 rxr->hn_lro.ifp = sc->hn_ifp; 4926 #endif 4927 #if __FreeBSD_version >= 1100099 4928 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 4929 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 4930 #endif 4931 #endif /* INET || INET6 */ 4932 4933 if (sc->hn_rx_sysctl_tree != NULL) { 4934 char name[16]; 4935 4936 /* 4937 * Create per RX ring sysctl tree: 4938 * dev.hn.UNIT.rx.RINGID 4939 */ 4940 snprintf(name, sizeof(name), "%d", i); 4941 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 4942 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 4943 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4944 4945 if (rxr->hn_rx_sysctl_tree != NULL) { 4946 SYSCTL_ADD_ULONG(ctx, 4947 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4948 OID_AUTO, "packets", CTLFLAG_RW, 4949 &rxr->hn_pkts, "# of packets received"); 4950 SYSCTL_ADD_ULONG(ctx, 4951 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4952 OID_AUTO, "rss_pkts", CTLFLAG_RW, 4953 &rxr->hn_rss_pkts, 4954 "# of packets w/ RSS info received"); 4955 SYSCTL_ADD_INT(ctx, 4956 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 4957 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 4958 &rxr->hn_pktbuf_len, 0, 4959 "Temporary channel packet buffer length"); 4960 } 4961 } 4962 } 4963 4964 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 4965 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4966 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 4967 #if __FreeBSD_version < 1100095 4968 hn_rx_stat_int_sysctl, 4969 #else 4970 hn_rx_stat_u64_sysctl, 4971 #endif 4972 "LU", "LRO queued"); 4973 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 4974 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4975 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 4976 #if __FreeBSD_version < 1100095 4977 hn_rx_stat_int_sysctl, 4978 #else 4979 hn_rx_stat_u64_sysctl, 4980 #endif 4981 "LU", "LRO flushed"); 4982 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 4983 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 4984 __offsetof(struct hn_rx_ring, hn_lro_tried), 4985 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 4986 #if __FreeBSD_version >= 1100099 4987 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 4988 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4989 hn_lro_lenlim_sysctl, "IU", 4990 "Max # of data bytes to be aggregated by LRO"); 4991 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 4992 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 4993 hn_lro_ackcnt_sysctl, "I", 4994 "Max # of ACKs to be aggregated by LRO"); 4995 #endif 4996 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 4997 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 4998 hn_trust_hcsum_sysctl, "I", 4999 "Trust tcp segement verification on host side, " 5000 "when csum info is missing"); 5001 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5002 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5003 hn_trust_hcsum_sysctl, "I", 5004 "Trust udp datagram verification on host side, " 5005 "when csum info is missing"); 5006 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5007 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5008 hn_trust_hcsum_sysctl, "I", 5009 "Trust ip packet verification on host side, " 5010 "when csum info is missing"); 5011 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5012 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5013 
__offsetof(struct hn_rx_ring, hn_csum_ip), 5014 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5015 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5016 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5017 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5018 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5019 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5020 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5021 __offsetof(struct hn_rx_ring, hn_csum_udp), 5022 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5023 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5024 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5025 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5026 hn_rx_stat_ulong_sysctl, "LU", 5027 "# of packets that we trust host's csum verification"); 5028 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5029 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5030 __offsetof(struct hn_rx_ring, hn_small_pkts), 5031 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5032 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5033 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5034 __offsetof(struct hn_rx_ring, hn_ack_failed), 5035 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5036 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5037 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5038 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5039 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5040 5041 return (0); 5042 } 5043 5044 static void 5045 hn_destroy_rx_data(struct hn_softc *sc) 5046 { 5047 int i; 5048 5049 if (sc->hn_rxbuf != NULL) { 5050 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5051 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5052 else 5053 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5054 sc->hn_rxbuf = NULL; 5055 } 5056 5057 if (sc->hn_rx_ring_cnt == 0) 5058 return; 5059 5060 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5061 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5062 5063 if (rxr->hn_br == NULL) 5064 continue; 5065 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5066 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5067 } else { 5068 device_printf(sc->hn_dev, 5069 "%dth channel bufring is referenced", i); 5070 } 5071 rxr->hn_br = NULL; 5072 5073 #if defined(INET) || defined(INET6) 5074 tcp_lro_free(&rxr->hn_lro); 5075 #endif 5076 free(rxr->hn_pktbuf, M_DEVBUF); 5077 } 5078 free(sc->hn_rx_ring, M_DEVBUF); 5079 sc->hn_rx_ring = NULL; 5080 5081 sc->hn_rx_ring_cnt = 0; 5082 sc->hn_rx_ring_inuse = 0; 5083 } 5084 5085 static int 5086 hn_tx_ring_create(struct hn_softc *sc, int id) 5087 { 5088 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5089 device_t dev = sc->hn_dev; 5090 bus_dma_tag_t parent_dtag; 5091 int error, i; 5092 5093 txr->hn_sc = sc; 5094 txr->hn_tx_idx = id; 5095 5096 #ifndef HN_USE_TXDESC_BUFRING 5097 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5098 #endif 5099 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5100 5101 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5102 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5103 M_DEVBUF, M_WAITOK | M_ZERO); 5104 #ifndef HN_USE_TXDESC_BUFRING 5105 SLIST_INIT(&txr->hn_txlist); 5106 #else 5107 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5108 M_WAITOK, &txr->hn_tx_lock); 5109 #endif 5110 5111 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5112 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5113 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5114 } else { 5115 txr->hn_tx_taskq = 
sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5116 } 5117 5118 #ifdef HN_IFSTART_SUPPORT 5119 if (hn_use_if_start) { 5120 txr->hn_txeof = hn_start_txeof; 5121 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5122 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5123 } else 5124 #endif 5125 { 5126 int br_depth; 5127 5128 txr->hn_txeof = hn_xmit_txeof; 5129 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5130 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5131 5132 br_depth = hn_get_txswq_depth(txr); 5133 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5134 M_WAITOK, &txr->hn_tx_lock); 5135 } 5136 5137 txr->hn_direct_tx_size = hn_direct_tx_size; 5138 5139 /* 5140 * Always schedule transmission instead of trying to do direct 5141 * transmission. This one gives the best performance so far. 5142 */ 5143 txr->hn_sched_tx = 1; 5144 5145 parent_dtag = bus_get_dma_tag(dev); 5146 5147 /* DMA tag for RNDIS packet messages. */ 5148 error = bus_dma_tag_create(parent_dtag, /* parent */ 5149 HN_RNDIS_PKT_ALIGN, /* alignment */ 5150 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5151 BUS_SPACE_MAXADDR, /* lowaddr */ 5152 BUS_SPACE_MAXADDR, /* highaddr */ 5153 NULL, NULL, /* filter, filterarg */ 5154 HN_RNDIS_PKT_LEN, /* maxsize */ 5155 1, /* nsegments */ 5156 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5157 0, /* flags */ 5158 NULL, /* lockfunc */ 5159 NULL, /* lockfuncarg */ 5160 &txr->hn_tx_rndis_dtag); 5161 if (error) { 5162 device_printf(dev, "failed to create rndis dmatag\n"); 5163 return error; 5164 } 5165 5166 /* DMA tag for data. */ 5167 error = bus_dma_tag_create(parent_dtag, /* parent */ 5168 1, /* alignment */ 5169 HN_TX_DATA_BOUNDARY, /* boundary */ 5170 BUS_SPACE_MAXADDR, /* lowaddr */ 5171 BUS_SPACE_MAXADDR, /* highaddr */ 5172 NULL, NULL, /* filter, filterarg */ 5173 HN_TX_DATA_MAXSIZE, /* maxsize */ 5174 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5175 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5176 0, /* flags */ 5177 NULL, /* lockfunc */ 5178 NULL, /* lockfuncarg */ 5179 &txr->hn_tx_data_dtag); 5180 if (error) { 5181 device_printf(dev, "failed to create data dmatag\n"); 5182 return error; 5183 } 5184 5185 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5186 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5187 5188 txd->txr = txr; 5189 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5190 STAILQ_INIT(&txd->agg_list); 5191 5192 /* 5193 * Allocate and load RNDIS packet message. 5194 */ 5195 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5196 (void **)&txd->rndis_pkt, 5197 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5198 &txd->rndis_pkt_dmap); 5199 if (error) { 5200 device_printf(dev, 5201 "failed to allocate rndis_packet_msg, %d\n", i); 5202 return error; 5203 } 5204 5205 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5206 txd->rndis_pkt_dmap, 5207 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5208 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5209 BUS_DMA_NOWAIT); 5210 if (error) { 5211 device_printf(dev, 5212 "failed to load rndis_packet_msg, %d\n", i); 5213 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5214 txd->rndis_pkt, txd->rndis_pkt_dmap); 5215 return error; 5216 } 5217 5218 /* DMA map for TX data. 
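 * Unlike the RNDIS packet message above, which is allocated
 * and loaded up front, this map is only created here.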
*/ 5219 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5220 &txd->data_dmap); 5221 if (error) { 5222 device_printf(dev, 5223 "failed to allocate tx data dmamap\n"); 5224 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5225 txd->rndis_pkt_dmap); 5226 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5227 txd->rndis_pkt, txd->rndis_pkt_dmap); 5228 return error; 5229 } 5230 5231 /* All set, put it to list */ 5232 txd->flags |= HN_TXD_FLAG_ONLIST; 5233 #ifndef HN_USE_TXDESC_BUFRING 5234 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5235 #else 5236 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5237 #endif 5238 } 5239 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5240 5241 if (sc->hn_tx_sysctl_tree != NULL) { 5242 struct sysctl_oid_list *child; 5243 struct sysctl_ctx_list *ctx; 5244 char name[16]; 5245 5246 /* 5247 * Create per TX ring sysctl tree: 5248 * dev.hn.UNIT.tx.RINGID 5249 */ 5250 ctx = device_get_sysctl_ctx(dev); 5251 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5252 5253 snprintf(name, sizeof(name), "%d", id); 5254 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5255 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5256 5257 if (txr->hn_tx_sysctl_tree != NULL) { 5258 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5259 5260 #ifdef HN_DEBUG 5261 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5262 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5263 "# of available TX descs"); 5264 #endif 5265 #ifdef HN_IFSTART_SUPPORT 5266 if (!hn_use_if_start) 5267 #endif 5268 { 5269 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5270 CTLFLAG_RD, &txr->hn_oactive, 0, 5271 "over active"); 5272 } 5273 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5274 CTLFLAG_RW, &txr->hn_pkts, 5275 "# of packets transmitted"); 5276 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5277 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5278 } 5279 } 5280 5281 return 0; 5282 } 5283 5284 static void 5285 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5286 { 5287 struct hn_tx_ring *txr = txd->txr; 5288 5289 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5290 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5291 5292 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5293 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5294 txd->rndis_pkt_dmap); 5295 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5296 } 5297 5298 static void 5299 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5300 { 5301 5302 KASSERT(txd->refs == 0 || txd->refs == 1, 5303 ("invalid txd refs %d", txd->refs)); 5304 5305 /* Aggregated txds will be freed by their aggregating txd. */ 5306 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5307 int freed; 5308 5309 freed = hn_txdesc_put(txr, txd); 5310 KASSERT(freed, ("can't free txdesc")); 5311 } 5312 } 5313 5314 static void 5315 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5316 { 5317 int i; 5318 5319 if (txr->hn_txdesc == NULL) 5320 return; 5321 5322 /* 5323 * NOTE: 5324 * Because the freeing of aggregated txds will be deferred 5325 * to the aggregating txd, two passes are used here: 5326 * - The first pass GCes any pending txds. This GC is necessary, 5327 * since if the channels are revoked, hypervisor will not 5328 * deliver send-done for all pending txds. 5329 * - The second pass frees the busdma stuffs, i.e. after all txds 5330 * were freed. 
5331 */ 5332 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5333 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5334 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5335 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5336 5337 if (txr->hn_tx_data_dtag != NULL) 5338 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5339 if (txr->hn_tx_rndis_dtag != NULL) 5340 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5341 5342 #ifdef HN_USE_TXDESC_BUFRING 5343 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5344 #endif 5345 5346 free(txr->hn_txdesc, M_DEVBUF); 5347 txr->hn_txdesc = NULL; 5348 5349 if (txr->hn_mbuf_br != NULL) 5350 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5351 5352 #ifndef HN_USE_TXDESC_BUFRING 5353 mtx_destroy(&txr->hn_txlist_spin); 5354 #endif 5355 mtx_destroy(&txr->hn_tx_lock); 5356 } 5357 5358 static int 5359 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5360 { 5361 struct sysctl_oid_list *child; 5362 struct sysctl_ctx_list *ctx; 5363 int i; 5364 5365 /* 5366 * Create TXBUF for chimney sending. 5367 * 5368 * NOTE: It is shared by all channels. 5369 */ 5370 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5371 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5372 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5373 if (sc->hn_chim == NULL) { 5374 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5375 return (ENOMEM); 5376 } 5377 5378 sc->hn_tx_ring_cnt = ring_cnt; 5379 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5380 5381 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5382 M_DEVBUF, M_WAITOK | M_ZERO); 5383 5384 ctx = device_get_sysctl_ctx(sc->hn_dev); 5385 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5386 5387 /* Create dev.hn.UNIT.tx sysctl tree */ 5388 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5389 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5390 5391 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5392 int error; 5393 5394 error = hn_tx_ring_create(sc, i); 5395 if (error) 5396 return error; 5397 } 5398 5399 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5400 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5401 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5402 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5403 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5404 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5405 __offsetof(struct hn_tx_ring, hn_send_failed), 5406 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5407 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5408 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5409 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5410 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5411 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5412 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5413 __offsetof(struct hn_tx_ring, hn_flush_failed), 5414 hn_tx_stat_ulong_sysctl, "LU", 5415 "# of packet transmission aggregation flush failure"); 5416 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5417 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5418 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5419 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5420 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5421 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5422 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5423 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5424 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5425 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5426 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5427 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5428 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5429 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5430 "# of total TX descs"); 5431 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5432 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5433 "Chimney send packet size upper boundary"); 5434 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5435 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5436 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5437 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5438 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5439 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5440 hn_tx_conf_int_sysctl, "I", 5441 "Size of the packet for direct transmission"); 5442 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5443 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5444 __offsetof(struct hn_tx_ring, hn_sched_tx), 5445 hn_tx_conf_int_sysctl, "I", 5446 "Always schedule transmission " 5447 "instead of doing direct transmission"); 5448 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5449 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5450 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5451 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5452 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5453 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5454 "Applied packet transmission aggregation size"); 5455 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5456 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5457 hn_txagg_pktmax_sysctl, "I", 5458 "Applied packet transmission aggregation packets"); 5459 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5460 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5461 hn_txagg_align_sysctl, "I", 5462 "Applied packet transmission aggregation alignment"); 5463 5464 return 0; 5465 } 5466 5467 static void 5468 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5469 { 5470 int i; 5471 5472 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5473 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5474 } 5475 5476 static void 5477 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5478 { 5479 struct ifnet *ifp = sc->hn_ifp; 5480 u_int hw_tsomax; 5481 int tso_minlen; 5482 5483 HN_LOCK_ASSERT(sc); 5484 5485 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5486 return; 5487 5488 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5489 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5490 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5491 5492 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5493 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5494 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5495 5496 if (tso_maxlen < tso_minlen) 5497 tso_maxlen = tso_minlen; 5498 else if (tso_maxlen > IP_MAXPACKET) 5499 tso_maxlen = IP_MAXPACKET; 5500 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5501 tso_maxlen = sc->hn_ndis_tso_szmax; 5502 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5503 5504 if (hn_xpnt_vf_isready(sc)) { 5505 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5506 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5507 } 5508 ifp->if_hw_tsomax = hw_tsomax; 5509 if (bootverbose) 5510 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5511 } 5512 5513 static void 5514 hn_fixup_tx_data(struct hn_softc *sc) 5515 { 5516 uint64_t csum_assist; 5517 int i; 5518 5519 hn_set_chim_size(sc, sc->hn_chim_szmax); 5520 if (hn_tx_chimney_size > 0 && 5521 hn_tx_chimney_size < sc->hn_chim_szmax) 5522 hn_set_chim_size(sc, 
hn_tx_chimney_size); 5523 5524 csum_assist = 0; 5525 if (sc->hn_caps & HN_CAP_IPCS) 5526 csum_assist |= CSUM_IP; 5527 if (sc->hn_caps & HN_CAP_TCP4CS) 5528 csum_assist |= CSUM_IP_TCP; 5529 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5530 csum_assist |= CSUM_IP_UDP; 5531 if (sc->hn_caps & HN_CAP_TCP6CS) 5532 csum_assist |= CSUM_IP6_TCP; 5533 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5534 csum_assist |= CSUM_IP6_UDP; 5535 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5536 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5537 5538 if (sc->hn_caps & HN_CAP_HASHVAL) { 5539 /* 5540 * Support HASHVAL pktinfo on TX path. 5541 */ 5542 if (bootverbose) 5543 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5544 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5545 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5546 } 5547 } 5548 5549 static void 5550 hn_destroy_tx_data(struct hn_softc *sc) 5551 { 5552 int i; 5553 5554 if (sc->hn_chim != NULL) { 5555 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5556 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5557 } else { 5558 device_printf(sc->hn_dev, 5559 "chimney sending buffer is referenced"); 5560 } 5561 sc->hn_chim = NULL; 5562 } 5563 5564 if (sc->hn_tx_ring_cnt == 0) 5565 return; 5566 5567 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5568 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5569 5570 free(sc->hn_tx_ring, M_DEVBUF); 5571 sc->hn_tx_ring = NULL; 5572 5573 sc->hn_tx_ring_cnt = 0; 5574 sc->hn_tx_ring_inuse = 0; 5575 } 5576 5577 #ifdef HN_IFSTART_SUPPORT 5578 5579 static void 5580 hn_start_taskfunc(void *xtxr, int pending __unused) 5581 { 5582 struct hn_tx_ring *txr = xtxr; 5583 5584 mtx_lock(&txr->hn_tx_lock); 5585 hn_start_locked(txr, 0); 5586 mtx_unlock(&txr->hn_tx_lock); 5587 } 5588 5589 static int 5590 hn_start_locked(struct hn_tx_ring *txr, int len) 5591 { 5592 struct hn_softc *sc = txr->hn_sc; 5593 struct ifnet *ifp = sc->hn_ifp; 5594 int sched = 0; 5595 5596 KASSERT(hn_use_if_start, 5597 ("hn_start_locked is called, when if_start is disabled")); 5598 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5599 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5600 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5601 5602 if (__predict_false(txr->hn_suspended)) 5603 return (0); 5604 5605 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5606 IFF_DRV_RUNNING) 5607 return (0); 5608 5609 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5610 struct hn_txdesc *txd; 5611 struct mbuf *m_head; 5612 int error; 5613 5614 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5615 if (m_head == NULL) 5616 break; 5617 5618 if (len > 0 && m_head->m_pkthdr.len > len) { 5619 /* 5620 * This sending could be time consuming; let callers 5621 * dispatch this packet sending (and sending of any 5622 * following up packets) to tx taskqueue. 
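 * The mbuf is prepended back onto if_snd and sched is set, so
 * the caller re-queues the TX task instead of finishing the
 * send inline.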
5623 */ 5624 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5625 sched = 1; 5626 break; 5627 } 5628 5629 #if defined(INET6) || defined(INET) 5630 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5631 m_head = hn_tso_fixup(m_head); 5632 if (__predict_false(m_head == NULL)) { 5633 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5634 continue; 5635 } 5636 } else if (m_head->m_pkthdr.csum_flags & 5637 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5638 m_head = hn_set_hlen(m_head); 5639 if (__predict_false(m_head == NULL)) { 5640 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5641 continue; 5642 } 5643 } 5644 #endif 5645 5646 txd = hn_txdesc_get(txr); 5647 if (txd == NULL) { 5648 txr->hn_no_txdescs++; 5649 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5650 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5651 break; 5652 } 5653 5654 error = hn_encap(ifp, txr, txd, &m_head); 5655 if (error) { 5656 /* Both txd and m_head are freed */ 5657 KASSERT(txr->hn_agg_txd == NULL, 5658 ("encap failed w/ pending aggregating txdesc")); 5659 continue; 5660 } 5661 5662 if (txr->hn_agg_pktleft == 0) { 5663 if (txr->hn_agg_txd != NULL) { 5664 KASSERT(m_head == NULL, 5665 ("pending mbuf for aggregating txdesc")); 5666 error = hn_flush_txagg(ifp, txr); 5667 if (__predict_false(error)) { 5668 atomic_set_int(&ifp->if_drv_flags, 5669 IFF_DRV_OACTIVE); 5670 break; 5671 } 5672 } else { 5673 KASSERT(m_head != NULL, ("mbuf was freed")); 5674 error = hn_txpkt(ifp, txr, txd); 5675 if (__predict_false(error)) { 5676 /* txd is freed, but m_head is not */ 5677 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5678 atomic_set_int(&ifp->if_drv_flags, 5679 IFF_DRV_OACTIVE); 5680 break; 5681 } 5682 } 5683 } 5684 #ifdef INVARIANTS 5685 else { 5686 KASSERT(txr->hn_agg_txd != NULL, 5687 ("no aggregating txdesc")); 5688 KASSERT(m_head == NULL, 5689 ("pending mbuf for aggregating txdesc")); 5690 } 5691 #endif 5692 } 5693 5694 /* Flush pending aggerated transmission. */ 5695 if (txr->hn_agg_txd != NULL) 5696 hn_flush_txagg(ifp, txr); 5697 return (sched); 5698 } 5699 5700 static void 5701 hn_start(struct ifnet *ifp) 5702 { 5703 struct hn_softc *sc = ifp->if_softc; 5704 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5705 5706 if (txr->hn_sched_tx) 5707 goto do_sched; 5708 5709 if (mtx_trylock(&txr->hn_tx_lock)) { 5710 int sched; 5711 5712 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5713 mtx_unlock(&txr->hn_tx_lock); 5714 if (!sched) 5715 return; 5716 } 5717 do_sched: 5718 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5719 } 5720 5721 static void 5722 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5723 { 5724 struct hn_tx_ring *txr = xtxr; 5725 5726 mtx_lock(&txr->hn_tx_lock); 5727 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5728 hn_start_locked(txr, 0); 5729 mtx_unlock(&txr->hn_tx_lock); 5730 } 5731 5732 static void 5733 hn_start_txeof(struct hn_tx_ring *txr) 5734 { 5735 struct hn_softc *sc = txr->hn_sc; 5736 struct ifnet *ifp = sc->hn_ifp; 5737 5738 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5739 5740 if (txr->hn_sched_tx) 5741 goto do_sched; 5742 5743 if (mtx_trylock(&txr->hn_tx_lock)) { 5744 int sched; 5745 5746 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5747 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5748 mtx_unlock(&txr->hn_tx_lock); 5749 if (sched) { 5750 taskqueue_enqueue(txr->hn_tx_taskq, 5751 &txr->hn_tx_task); 5752 } 5753 } else { 5754 do_sched: 5755 /* 5756 * Release the OACTIVE earlier, with the hope, that 5757 * others could catch up. 
The task will clear the 5758 * flag again with the hn_tx_lock to avoid possible 5759 * races. 5760 */ 5761 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5762 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5763 } 5764 } 5765 5766 #endif /* HN_IFSTART_SUPPORT */ 5767 5768 static int 5769 hn_xmit(struct hn_tx_ring *txr, int len) 5770 { 5771 struct hn_softc *sc = txr->hn_sc; 5772 struct ifnet *ifp = sc->hn_ifp; 5773 struct mbuf *m_head; 5774 int sched = 0; 5775 5776 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5777 #ifdef HN_IFSTART_SUPPORT 5778 KASSERT(hn_use_if_start == 0, 5779 ("hn_xmit is called, when if_start is enabled")); 5780 #endif 5781 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5782 5783 if (__predict_false(txr->hn_suspended)) 5784 return (0); 5785 5786 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5787 return (0); 5788 5789 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5790 struct hn_txdesc *txd; 5791 int error; 5792 5793 if (len > 0 && m_head->m_pkthdr.len > len) { 5794 /* 5795 * This sending could be time consuming; let callers 5796 * dispatch this packet sending (and sending of any 5797 * following up packets) to tx taskqueue. 5798 */ 5799 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5800 sched = 1; 5801 break; 5802 } 5803 5804 txd = hn_txdesc_get(txr); 5805 if (txd == NULL) { 5806 txr->hn_no_txdescs++; 5807 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5808 txr->hn_oactive = 1; 5809 break; 5810 } 5811 5812 error = hn_encap(ifp, txr, txd, &m_head); 5813 if (error) { 5814 /* Both txd and m_head are freed; discard */ 5815 KASSERT(txr->hn_agg_txd == NULL, 5816 ("encap failed w/ pending aggregating txdesc")); 5817 drbr_advance(ifp, txr->hn_mbuf_br); 5818 continue; 5819 } 5820 5821 if (txr->hn_agg_pktleft == 0) { 5822 if (txr->hn_agg_txd != NULL) { 5823 KASSERT(m_head == NULL, 5824 ("pending mbuf for aggregating txdesc")); 5825 error = hn_flush_txagg(ifp, txr); 5826 if (__predict_false(error)) { 5827 txr->hn_oactive = 1; 5828 break; 5829 } 5830 } else { 5831 KASSERT(m_head != NULL, ("mbuf was freed")); 5832 error = hn_txpkt(ifp, txr, txd); 5833 if (__predict_false(error)) { 5834 /* txd is freed, but m_head is not */ 5835 drbr_putback(ifp, txr->hn_mbuf_br, 5836 m_head); 5837 txr->hn_oactive = 1; 5838 break; 5839 } 5840 } 5841 } 5842 #ifdef INVARIANTS 5843 else { 5844 KASSERT(txr->hn_agg_txd != NULL, 5845 ("no aggregating txdesc")); 5846 KASSERT(m_head == NULL, 5847 ("pending mbuf for aggregating txdesc")); 5848 } 5849 #endif 5850 5851 /* Sent */ 5852 drbr_advance(ifp, txr->hn_mbuf_br); 5853 } 5854 5855 /* Flush pending aggerated transmission. */ 5856 if (txr->hn_agg_txd != NULL) 5857 hn_flush_txagg(ifp, txr); 5858 return (sched); 5859 } 5860 5861 static int 5862 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5863 { 5864 struct hn_softc *sc = ifp->if_softc; 5865 struct hn_tx_ring *txr; 5866 int error, idx = 0; 5867 5868 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5869 struct rm_priotracker pt; 5870 5871 rm_rlock(&sc->hn_vf_lock, &pt); 5872 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5873 struct mbuf *m_bpf = NULL; 5874 int obytes, omcast; 5875 5876 obytes = m->m_pkthdr.len; 5877 if (m->m_flags & M_MCAST) 5878 omcast = 1; 5879 5880 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5881 if (bpf_peers_present(ifp->if_bpf)) { 5882 m_bpf = m_copypacket(m, M_NOWAIT); 5883 if (m_bpf == NULL) { 5884 /* 5885 * Failed to grab a shallow 5886 * copy; tap now. 
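 * Otherwise the copy is tapped only after the VF
 * transmit below succeeds, then freed.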
5887 */ 5888 ETHER_BPF_MTAP(ifp, m); 5889 } 5890 } 5891 } else { 5892 ETHER_BPF_MTAP(ifp, m); 5893 } 5894 5895 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 5896 rm_runlock(&sc->hn_vf_lock, &pt); 5897 5898 if (m_bpf != NULL) { 5899 if (!error) 5900 ETHER_BPF_MTAP(ifp, m_bpf); 5901 m_freem(m_bpf); 5902 } 5903 5904 if (error == ENOBUFS) { 5905 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5906 } else if (error) { 5907 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5908 } else { 5909 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 5910 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 5911 if (omcast) { 5912 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 5913 omcast); 5914 } 5915 } 5916 return (error); 5917 } 5918 rm_runlock(&sc->hn_vf_lock, &pt); 5919 } 5920 5921 #if defined(INET6) || defined(INET) 5922 /* 5923 * Perform TSO packet header fixup or get l2/l3 header length now, 5924 * since packet headers should be cache-hot. 5925 */ 5926 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 5927 m = hn_tso_fixup(m); 5928 if (__predict_false(m == NULL)) { 5929 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5930 return EIO; 5931 } 5932 } else if (m->m_pkthdr.csum_flags & 5933 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5934 m = hn_set_hlen(m); 5935 if (__predict_false(m == NULL)) { 5936 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5937 return EIO; 5938 } 5939 } 5940 #endif 5941 5942 /* 5943 * Select the TX ring based on flowid 5944 */ 5945 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 5946 #ifdef RSS 5947 uint32_t bid; 5948 5949 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 5950 &bid) == 0) 5951 idx = bid % sc->hn_tx_ring_inuse; 5952 else 5953 #endif 5954 { 5955 #if defined(INET6) || defined(INET) 5956 int tcpsyn = 0; 5957 5958 if (m->m_pkthdr.len < 128 && 5959 (m->m_pkthdr.csum_flags & 5960 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 5961 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 5962 m = hn_check_tcpsyn(m, &tcpsyn); 5963 if (__predict_false(m == NULL)) { 5964 if_inc_counter(ifp, 5965 IFCOUNTER_OERRORS, 1); 5966 return (EIO); 5967 } 5968 } 5969 #else 5970 const int tcpsyn = 0; 5971 #endif 5972 if (tcpsyn) 5973 idx = 0; 5974 else 5975 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 5976 } 5977 } 5978 txr = &sc->hn_tx_ring[idx]; 5979 5980 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 5981 if (error) { 5982 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5983 return error; 5984 } 5985 5986 if (txr->hn_oactive) 5987 return 0; 5988 5989 if (txr->hn_sched_tx) 5990 goto do_sched; 5991 5992 if (mtx_trylock(&txr->hn_tx_lock)) { 5993 int sched; 5994 5995 sched = hn_xmit(txr, txr->hn_direct_tx_size); 5996 mtx_unlock(&txr->hn_tx_lock); 5997 if (!sched) 5998 return 0; 5999 } 6000 do_sched: 6001 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6002 return 0; 6003 } 6004 6005 static void 6006 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6007 { 6008 struct mbuf *m; 6009 6010 mtx_lock(&txr->hn_tx_lock); 6011 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6012 m_freem(m); 6013 mtx_unlock(&txr->hn_tx_lock); 6014 } 6015 6016 static void 6017 hn_xmit_qflush(struct ifnet *ifp) 6018 { 6019 struct hn_softc *sc = ifp->if_softc; 6020 struct rm_priotracker pt; 6021 int i; 6022 6023 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6024 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6025 if_qflush(ifp); 6026 6027 rm_rlock(&sc->hn_vf_lock, &pt); 6028 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6029 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6030 rm_runlock(&sc->hn_vf_lock, &pt); 6031 } 6032 6033 static void 6034 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6035 { 6036 6037 if (txr->hn_sched_tx) 6038 goto do_sched; 6039 6040 if (mtx_trylock(&txr->hn_tx_lock)) { 6041 int sched; 6042 6043 txr->hn_oactive = 0; 6044 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6045 mtx_unlock(&txr->hn_tx_lock); 6046 if (sched) { 6047 taskqueue_enqueue(txr->hn_tx_taskq, 6048 &txr->hn_tx_task); 6049 } 6050 } else { 6051 do_sched: 6052 /* 6053 * Release the oactive earlier, in the hope that 6054 * others could catch up. The task will clear the 6055 * oactive again with the hn_tx_lock to avoid possible 6056 * races. 6057 */ 6058 txr->hn_oactive = 0; 6059 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6060 } 6061 } 6062 6063 static void 6064 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6065 { 6066 struct hn_tx_ring *txr = xtxr; 6067 6068 mtx_lock(&txr->hn_tx_lock); 6069 hn_xmit(txr, 0); 6070 mtx_unlock(&txr->hn_tx_lock); 6071 } 6072 6073 static void 6074 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6075 { 6076 struct hn_tx_ring *txr = xtxr; 6077 6078 mtx_lock(&txr->hn_tx_lock); 6079 txr->hn_oactive = 0; 6080 hn_xmit(txr, 0); 6081 mtx_unlock(&txr->hn_tx_lock); 6082 } 6083 6084 static int 6085 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6086 { 6087 struct vmbus_chan_br cbr; 6088 struct hn_rx_ring *rxr; 6089 struct hn_tx_ring *txr = NULL; 6090 int idx, error; 6091 6092 idx = vmbus_chan_subidx(chan); 6093 6094 /* 6095 * Link this channel to RX/TX ring. 6096 */ 6097 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6098 ("invalid channel index %d, should be >= 0 && < %d", 6099 idx, sc->hn_rx_ring_inuse)); 6100 rxr = &sc->hn_rx_ring[idx]; 6101 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6102 ("RX ring %d already attached", idx)); 6103 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6104 rxr->hn_chan = chan; 6105 6106 if (bootverbose) { 6107 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6108 idx, vmbus_chan_id(chan)); 6109 } 6110 6111 if (idx < sc->hn_tx_ring_inuse) { 6112 txr = &sc->hn_tx_ring[idx]; 6113 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6114 ("TX ring %d already attached", idx)); 6115 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6116 6117 txr->hn_chan = chan; 6118 if (bootverbose) { 6119 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6120 idx, vmbus_chan_id(chan)); 6121 } 6122 } 6123 6124 /* Bind this channel to a proper CPU. */ 6125 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6126 6127 /* 6128 * Open this channel. 6129 */ 6130 cbr.cbr = rxr->hn_br; 6131 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6132 cbr.cbr_txsz = HN_TXBR_SIZE; 6133 cbr.cbr_rxsz = HN_RXBR_SIZE; 6134 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6135 if (error) { 6136 if (error == EISCONN) { 6137 if_printf(sc->hn_ifp, "bufring is connected after " 6138 "chan%u open failure\n", vmbus_chan_id(chan)); 6139 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6140 } else { 6141 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6142 vmbus_chan_id(chan), error); 6143 } 6144 } 6145 return (error); 6146 } 6147 6148 static void 6149 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6150 { 6151 struct hn_rx_ring *rxr; 6152 int idx, error; 6153 6154 idx = vmbus_chan_subidx(chan); 6155 6156 /* 6157 * Unlink this channel from the RX/TX ring. 6158 */ 6159 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6160 ("invalid channel index %d, should be >= 0 && < %d", 6161 idx, sc->hn_rx_ring_inuse)); 6162 rxr = &sc->hn_rx_ring[idx]; 6163 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6164 ("RX ring %d is not attached", idx)); 6165 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6166 6167 if (idx < sc->hn_tx_ring_inuse) { 6168 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6169 6170 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6171 ("TX ring %d is not attached", idx)); 6172 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6173 } 6174 6175 /* 6176 * Close this channel. 6177 * 6178 * NOTE: 6179 * Channel closing does _not_ destroy the target channel. 6180 */ 6181 error = vmbus_chan_close_direct(chan); 6182 if (error == EISCONN) { 6183 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6184 "after being closed\n", vmbus_chan_id(chan)); 6185 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6186 } else if (error) { 6187 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6188 vmbus_chan_id(chan), error); 6189 } 6190 } 6191 6192 static int 6193 hn_attach_subchans(struct hn_softc *sc) 6194 { 6195 struct vmbus_channel **subchans; 6196 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6197 int i, error = 0; 6198 6199 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6200 6201 /* Attach the sub-channels. */ 6202 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6203 for (i = 0; i < subchan_cnt; ++i) { 6204 int error1; 6205 6206 error1 = hn_chan_attach(sc, subchans[i]); 6207 if (error1) { 6208 error = error1; 6209 /* Move on; all channels will be detached later. */ 6210 } 6211 } 6212 vmbus_subchan_rel(subchans, subchan_cnt); 6213 6214 if (error) { 6215 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6216 } else { 6217 if (bootverbose) { 6218 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6219 subchan_cnt); 6220 } 6221 } 6222 return (error); 6223 } 6224 6225 static void 6226 hn_detach_allchans(struct hn_softc *sc) 6227 { 6228 struct vmbus_channel **subchans; 6229 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6230 int i; 6231 6232 if (subchan_cnt == 0) 6233 goto back; 6234 6235 /* Detach the sub-channels. */ 6236 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6237 for (i = 0; i < subchan_cnt; ++i) 6238 hn_chan_detach(sc, subchans[i]); 6239 vmbus_subchan_rel(subchans, subchan_cnt); 6240 6241 back: 6242 /* 6243 * Detach the primary channel, _after_ all sub-channels 6244 * are detached. 6245 */ 6246 hn_chan_detach(sc, sc->hn_prichan); 6247 6248 /* Wait for sub-channels to be destroyed, if any. */ 6249 vmbus_subchan_drain(sc->hn_prichan); 6250 6251 #ifdef INVARIANTS 6252 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6253 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6254 HN_RX_FLAG_ATTACHED) == 0, 6255 ("%dth RX ring is still attached", i)); 6256 } 6257 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6258 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6259 HN_TX_FLAG_ATTACHED) == 0, 6260 ("%dth TX ring is still attached", i)); 6261 } 6262 #endif 6263 } 6264 6265 static int 6266 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6267 { 6268 struct vmbus_channel **subchans; 6269 int nchan, rxr_cnt, error; 6270 6271 nchan = *nsubch + 1; 6272 if (nchan == 1) { 6273 /* 6274 * Multiple RX/TX rings are not requested. 6275 */ 6276 *nsubch = 0; 6277 return (0); 6278 } 6279 6280 /* 6281 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6282 * table entries.
6283 */ 6284 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6285 if (error) { 6286 /* No RSS; this is benign. */ 6287 *nsubch = 0; 6288 return (0); 6289 } 6290 if (bootverbose) { 6291 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6292 rxr_cnt, nchan); 6293 } 6294 6295 if (nchan > rxr_cnt) 6296 nchan = rxr_cnt; 6297 if (nchan == 1) { 6298 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6299 *nsubch = 0; 6300 return (0); 6301 } 6302 6303 /* 6304 * Allocate sub-channels from NVS. 6305 */ 6306 *nsubch = nchan - 1; 6307 error = hn_nvs_alloc_subchans(sc, nsubch); 6308 if (error || *nsubch == 0) { 6309 /* Failed to allocate sub-channels. */ 6310 *nsubch = 0; 6311 return (0); 6312 } 6313 6314 /* 6315 * Wait for all sub-channels to become ready before moving on. 6316 */ 6317 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6318 vmbus_subchan_rel(subchans, *nsubch); 6319 return (0); 6320 } 6321 6322 static bool 6323 hn_synth_attachable(const struct hn_softc *sc) 6324 { 6325 int i; 6326 6327 if (sc->hn_flags & HN_FLAG_ERRORS) 6328 return (false); 6329 6330 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6331 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6332 6333 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6334 return (false); 6335 } 6336 return (true); 6337 } 6338 6339 /* 6340 * Make sure that the RX filter is zero after the successful 6341 * RNDIS initialization. 6342 * 6343 * NOTE: 6344 * Under certain conditions on certain versions of Hyper-V, 6345 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6346 * after the successful RNDIS initialization, which breaks 6347 * the assumption of any following code (well, it breaks the 6348 * RNDIS API contract actually). Clear the RNDIS rxfilter 6349 * explicitly, drain packets sneaking through, and drain the 6350 * interrupt taskqueues scheduled due to the stealth packets. 6351 */ 6352 static void 6353 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6354 { 6355 6356 hn_disable_rx(sc); 6357 hn_drain_rxtx(sc, nchan); 6358 } 6359 6360 static int 6361 hn_synth_attach(struct hn_softc *sc, int mtu) 6362 { 6363 #define ATTACHED_NVS 0x0002 6364 #define ATTACHED_RNDIS 0x0004 6365 6366 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6367 int error, nsubch, nchan = 1, i, rndis_inited; 6368 uint32_t old_caps, attached = 0; 6369 6370 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6371 ("synthetic parts were attached")); 6372 6373 if (!hn_synth_attachable(sc)) 6374 return (ENXIO); 6375 6376 /* Save capabilities for later verification. */ 6377 old_caps = sc->hn_caps; 6378 sc->hn_caps = 0; 6379 6380 /* Clear RSS stuffs. */ 6381 sc->hn_rss_ind_size = 0; 6382 sc->hn_rss_hash = 0; 6383 sc->hn_rss_hcap = 0; 6384 6385 /* 6386 * Attach the primary channel _before_ attaching NVS and RNDIS. 6387 */ 6388 error = hn_chan_attach(sc, sc->hn_prichan); 6389 if (error) 6390 goto failed; 6391 6392 /* 6393 * Attach NVS. 6394 */ 6395 error = hn_nvs_attach(sc, mtu); 6396 if (error) 6397 goto failed; 6398 attached |= ATTACHED_NVS; 6399 6400 /* 6401 * Attach RNDIS _after_ NVS is attached. 6402 */ 6403 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6404 if (rndis_inited) 6405 attached |= ATTACHED_RNDIS; 6406 if (error) 6407 goto failed; 6408 6409 /* 6410 * Make sure capabilities are not changed. 
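* (hn_caps was cleared and re-derived during the NVS/RNDIS attach above;
* on a re-attach it must match the capabilities saved in old_caps,
* otherwise the synthetic parts attach is aborted with ENXIO below.)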
6411 */ 6412 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6413 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6414 old_caps, sc->hn_caps); 6415 error = ENXIO; 6416 goto failed; 6417 } 6418 6419 /* 6420 * Allocate sub-channels for multi-TX/RX rings. 6421 * 6422 * NOTE: 6423 * The # of RX rings that can be used is equivalent to the # of 6424 * channels to be requested. 6425 */ 6426 nsubch = sc->hn_rx_ring_cnt - 1; 6427 error = hn_synth_alloc_subchans(sc, &nsubch); 6428 if (error) 6429 goto failed; 6430 /* NOTE: _Full_ synthetic parts detach is required now. */ 6431 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6432 6433 /* 6434 * Set the # of TX/RX rings that could be used according to 6435 * the # of channels that NVS offered. 6436 */ 6437 nchan = nsubch + 1; 6438 hn_set_ring_inuse(sc, nchan); 6439 if (nchan == 1) { 6440 /* Only the primary channel can be used; done */ 6441 goto back; 6442 } 6443 6444 /* 6445 * Attach the sub-channels. 6446 * 6447 * NOTE: hn_set_ring_inuse() _must_ have been called. 6448 */ 6449 error = hn_attach_subchans(sc); 6450 if (error) 6451 goto failed; 6452 6453 /* 6454 * Configure RSS key and indirect table _after_ all sub-channels 6455 * are attached. 6456 */ 6457 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6458 /* 6459 * RSS key is not set yet; set it to the default RSS key. 6460 */ 6461 if (bootverbose) 6462 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6463 #ifdef RSS 6464 rss_getkey(rss->rss_key); 6465 #else 6466 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6467 #endif 6468 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6469 } 6470 6471 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6472 /* 6473 * RSS indirect table is not set yet; set it up in round- 6474 * robin fashion. 6475 */ 6476 if (bootverbose) { 6477 if_printf(sc->hn_ifp, "setup default RSS indirect " 6478 "table\n"); 6479 } 6480 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6481 uint32_t subidx; 6482 6483 #ifdef RSS 6484 subidx = rss_get_indirection_to_bucket(i); 6485 #else 6486 subidx = i; 6487 #endif 6488 rss->rss_ind[i] = subidx % nchan; 6489 } 6490 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6491 } else { 6492 /* 6493 * # of usable channels may be changed, so we have to 6494 * make sure that all entries in RSS indirect table 6495 * are valid. 6496 * 6497 * NOTE: hn_set_ring_inuse() _must_ have been called. 6498 */ 6499 hn_rss_ind_fixup(sc); 6500 } 6501 6502 sc->hn_rss_hash = sc->hn_rss_hcap; 6503 if ((sc->hn_flags & HN_FLAG_RXVF) || 6504 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6505 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6506 hn_vf_rss_fixup(sc, false); 6507 } 6508 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6509 if (error) 6510 goto failed; 6511 back: 6512 /* 6513 * Fixup transmission aggregation setup. 6514 */ 6515 hn_set_txagg(sc); 6516 hn_rndis_init_fixat(sc, nchan); 6517 return (0); 6518 6519 failed: 6520 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6521 hn_rndis_init_fixat(sc, nchan); 6522 hn_synth_detach(sc); 6523 } else { 6524 if (attached & ATTACHED_RNDIS) { 6525 hn_rndis_init_fixat(sc, nchan); 6526 hn_rndis_detach(sc); 6527 } 6528 if (attached & ATTACHED_NVS) 6529 hn_nvs_detach(sc); 6530 hn_chan_detach(sc, sc->hn_prichan); 6531 /* Restore old capabilities. */ 6532 sc->hn_caps = old_caps; 6533 } 6534 return (error); 6535 6536 #undef ATTACHED_RNDIS 6537 #undef ATTACHED_NVS 6538 } 6539 6540 /* 6541 * NOTE: 6542 * The interface must have been suspended though hn_suspend(), before 6543 * this function get called. 
6544 */ 6545 static void 6546 hn_synth_detach(struct hn_softc *sc) 6547 { 6548 6549 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6550 ("synthetic parts were not attached")); 6551 6552 /* Detach the RNDIS first. */ 6553 hn_rndis_detach(sc); 6554 6555 /* Detach NVS. */ 6556 hn_nvs_detach(sc); 6557 6558 /* Detach all of the channels. */ 6559 hn_detach_allchans(sc); 6560 6561 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6562 } 6563 6564 static void 6565 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6566 { 6567 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6568 ("invalid ring count %d", ring_cnt)); 6569 6570 if (sc->hn_tx_ring_cnt > ring_cnt) 6571 sc->hn_tx_ring_inuse = ring_cnt; 6572 else 6573 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6574 sc->hn_rx_ring_inuse = ring_cnt; 6575 6576 #ifdef RSS 6577 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6578 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6579 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6580 rss_getnumbuckets()); 6581 } 6582 #endif 6583 6584 if (bootverbose) { 6585 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6586 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6587 } 6588 } 6589 6590 static void 6591 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6592 { 6593 6594 /* 6595 * NOTE: 6596 * The TX bufring will not be drained by the hypervisor, 6597 * if the primary channel is revoked. 6598 */ 6599 while (!vmbus_chan_rx_empty(chan) || 6600 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6601 !vmbus_chan_tx_empty(chan))) 6602 pause("waitch", 1); 6603 vmbus_chan_intr_drain(chan); 6604 } 6605 6606 static void 6607 hn_disable_rx(struct hn_softc *sc) 6608 { 6609 6610 /* 6611 * Disable RX by clearing RX filter forcefully. 6612 */ 6613 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6614 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6615 6616 /* 6617 * Give RNDIS enough time to flush all pending data packets. 6618 */ 6619 pause("waitrx", (200 * hz) / 1000); 6620 } 6621 6622 /* 6623 * NOTE: 6624 * RX/TX _must_ have been suspended/disabled, before this function 6625 * is called. 6626 */ 6627 static void 6628 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6629 { 6630 struct vmbus_channel **subch = NULL; 6631 int nsubch; 6632 6633 /* 6634 * Drain RX/TX bufrings and interrupts. 6635 */ 6636 nsubch = nchan - 1; 6637 if (nsubch > 0) 6638 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6639 6640 if (subch != NULL) { 6641 int i; 6642 6643 for (i = 0; i < nsubch; ++i) 6644 hn_chan_drain(sc, subch[i]); 6645 } 6646 hn_chan_drain(sc, sc->hn_prichan); 6647 6648 if (subch != NULL) 6649 vmbus_subchan_rel(subch, nsubch); 6650 } 6651 6652 static void 6653 hn_suspend_data(struct hn_softc *sc) 6654 { 6655 struct hn_tx_ring *txr; 6656 int i; 6657 6658 HN_LOCK_ASSERT(sc); 6659 6660 /* 6661 * Suspend TX. 6662 */ 6663 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6664 txr = &sc->hn_tx_ring[i]; 6665 6666 mtx_lock(&txr->hn_tx_lock); 6667 txr->hn_suspended = 1; 6668 mtx_unlock(&txr->hn_tx_lock); 6669 /* No one is able send more packets now. */ 6670 6671 /* 6672 * Wait for all pending sends to finish. 6673 * 6674 * NOTE: 6675 * We will _not_ receive all pending send-done, if the 6676 * primary channel is revoked. 6677 */ 6678 while (hn_tx_ring_pending(txr) && 6679 !vmbus_chan_is_revoked(sc->hn_prichan)) 6680 pause("hnwtx", 1 /* 1 tick */); 6681 } 6682 6683 /* 6684 * Disable RX. 6685 */ 6686 hn_disable_rx(sc); 6687 6688 /* 6689 * Drain RX/TX. 
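* (hn_drain_rxtx() below waits for the RX/TX bufrings of the primary
* channel and of all in-use sub-channels to empty, then drains their
* interrupt taskqueues; see hn_chan_drain().)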
6690 */ 6691 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6692 6693 /* 6694 * Drain any pending TX tasks. 6695 * 6696 * NOTE: 6697 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6698 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6699 */ 6700 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6701 txr = &sc->hn_tx_ring[i]; 6702 6703 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6704 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6705 } 6706 } 6707 6708 static void 6709 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6710 { 6711 6712 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6713 } 6714 6715 static void 6716 hn_suspend_mgmt(struct hn_softc *sc) 6717 { 6718 struct task task; 6719 6720 HN_LOCK_ASSERT(sc); 6721 6722 /* 6723 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6724 * through hn_mgmt_taskq. 6725 */ 6726 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6727 vmbus_chan_run_task(sc->hn_prichan, &task); 6728 6729 /* 6730 * Make sure that all pending management tasks are completed. 6731 */ 6732 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6733 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6734 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6735 } 6736 6737 static void 6738 hn_suspend(struct hn_softc *sc) 6739 { 6740 6741 /* Disable polling. */ 6742 hn_polling(sc, 0); 6743 6744 /* 6745 * If the non-transparent mode VF is activated, the synthetic 6746 * device is receiving packets, so the data path of the 6747 * synthetic device must be suspended. 6748 */ 6749 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6750 (sc->hn_flags & HN_FLAG_RXVF)) 6751 hn_suspend_data(sc); 6752 hn_suspend_mgmt(sc); 6753 } 6754 6755 static void 6756 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6757 { 6758 int i; 6759 6760 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6761 ("invalid TX ring count %d", tx_ring_cnt)); 6762 6763 for (i = 0; i < tx_ring_cnt; ++i) { 6764 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6765 6766 mtx_lock(&txr->hn_tx_lock); 6767 txr->hn_suspended = 0; 6768 mtx_unlock(&txr->hn_tx_lock); 6769 } 6770 } 6771 6772 static void 6773 hn_resume_data(struct hn_softc *sc) 6774 { 6775 int i; 6776 6777 HN_LOCK_ASSERT(sc); 6778 6779 /* 6780 * Re-enable RX. 6781 */ 6782 hn_rxfilter_config(sc); 6783 6784 /* 6785 * Make sure to clear suspend status on "all" TX rings, 6786 * since hn_tx_ring_inuse can be changed after 6787 * hn_suspend_data(). 6788 */ 6789 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6790 6791 #ifdef HN_IFSTART_SUPPORT 6792 if (!hn_use_if_start) 6793 #endif 6794 { 6795 /* 6796 * Flush unused drbrs, since hn_tx_ring_inuse may be 6797 * reduced. 6798 */ 6799 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6800 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6801 } 6802 6803 /* 6804 * Kick start TX. 6805 */ 6806 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6807 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6808 6809 /* 6810 * Use txeof task, so that any pending oactive can be 6811 * cleared properly. 6812 */ 6813 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6814 } 6815 } 6816 6817 static void 6818 hn_resume_mgmt(struct hn_softc *sc) 6819 { 6820 6821 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6822 6823 /* 6824 * Kick off network change detection, if it was pending. 6825 * If no network change was pending, start link status 6826 * checks, which is more lightweight than network change 6827 * detection. 
6828 */ 6829 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6830 hn_change_network(sc); 6831 else 6832 hn_update_link_status(sc); 6833 } 6834 6835 static void 6836 hn_resume(struct hn_softc *sc) 6837 { 6838 6839 /* 6840 * If the non-transparent mode VF is activated, the synthetic 6841 * device have to receive packets, so the data path of the 6842 * synthetic device must be resumed. 6843 */ 6844 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6845 (sc->hn_flags & HN_FLAG_RXVF)) 6846 hn_resume_data(sc); 6847 6848 /* 6849 * Don't resume link status change if VF is attached/activated. 6850 * - In the non-transparent VF mode, the synthetic device marks 6851 * link down until the VF is deactivated; i.e. VF is down. 6852 * - In transparent VF mode, VF's media status is used until 6853 * the VF is detached. 6854 */ 6855 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 6856 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 6857 hn_resume_mgmt(sc); 6858 6859 /* 6860 * Re-enable polling if this interface is running and 6861 * the polling is requested. 6862 */ 6863 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 6864 hn_polling(sc, sc->hn_pollhz); 6865 } 6866 6867 static void 6868 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 6869 { 6870 const struct rndis_status_msg *msg; 6871 int ofs; 6872 6873 if (dlen < sizeof(*msg)) { 6874 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 6875 return; 6876 } 6877 msg = data; 6878 6879 switch (msg->rm_status) { 6880 case RNDIS_STATUS_MEDIA_CONNECT: 6881 case RNDIS_STATUS_MEDIA_DISCONNECT: 6882 hn_update_link_status(sc); 6883 break; 6884 6885 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 6886 case RNDIS_STATUS_LINK_SPEED_CHANGE: 6887 /* Not really useful; ignore. */ 6888 break; 6889 6890 case RNDIS_STATUS_NETWORK_CHANGE: 6891 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 6892 if (dlen < ofs + msg->rm_stbuflen || 6893 msg->rm_stbuflen < sizeof(uint32_t)) { 6894 if_printf(sc->hn_ifp, "network changed\n"); 6895 } else { 6896 uint32_t change; 6897 6898 memcpy(&change, ((const uint8_t *)msg) + ofs, 6899 sizeof(change)); 6900 if_printf(sc->hn_ifp, "network changed, change %u\n", 6901 change); 6902 } 6903 hn_change_network(sc); 6904 break; 6905 6906 default: 6907 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 6908 msg->rm_status); 6909 break; 6910 } 6911 } 6912 6913 static int 6914 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 6915 { 6916 const struct rndis_pktinfo *pi = info_data; 6917 uint32_t mask = 0; 6918 6919 while (info_dlen != 0) { 6920 const void *data; 6921 uint32_t dlen; 6922 6923 if (__predict_false(info_dlen < sizeof(*pi))) 6924 return (EINVAL); 6925 if (__predict_false(info_dlen < pi->rm_size)) 6926 return (EINVAL); 6927 info_dlen -= pi->rm_size; 6928 6929 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 6930 return (EINVAL); 6931 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 6932 return (EINVAL); 6933 dlen = pi->rm_size - pi->rm_pktinfooffset; 6934 data = pi->rm_data; 6935 6936 switch (pi->rm_type) { 6937 case NDIS_PKTINFO_TYPE_VLAN: 6938 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) 6939 return (EINVAL); 6940 info->vlan_info = *((const uint32_t *)data); 6941 mask |= HN_RXINFO_VLAN; 6942 break; 6943 6944 case NDIS_PKTINFO_TYPE_CSUM: 6945 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) 6946 return (EINVAL); 6947 info->csum_info = *((const uint32_t *)data); 6948 mask |= HN_RXINFO_CSUM; 6949 break; 6950 6951 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 6952 if 
(__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) 6953 return (EINVAL); 6954 info->hash_value = *((const uint32_t *)data); 6955 mask |= HN_RXINFO_HASHVAL; 6956 break; 6957 6958 case HN_NDIS_PKTINFO_TYPE_HASHINF: 6959 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) 6960 return (EINVAL); 6961 info->hash_info = *((const uint32_t *)data); 6962 mask |= HN_RXINFO_HASHINF; 6963 break; 6964 6965 default: 6966 goto next; 6967 } 6968 6969 if (mask == HN_RXINFO_ALL) { 6970 /* All found; done */ 6971 break; 6972 } 6973 next: 6974 pi = (const struct rndis_pktinfo *) 6975 ((const uint8_t *)pi + pi->rm_size); 6976 } 6977 6978 /* 6979 * Final fixup. 6980 * - If there is no hash value, invalidate the hash info. 6981 */ 6982 if ((mask & HN_RXINFO_HASHVAL) == 0) 6983 info->hash_info = HN_NDIS_HASH_INFO_INVALID; 6984 return (0); 6985 } 6986 6987 static __inline bool 6988 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 6989 { 6990 6991 if (off < check_off) { 6992 if (__predict_true(off + len <= check_off)) 6993 return (false); 6994 } else if (off > check_off) { 6995 if (__predict_true(check_off + check_len <= off)) 6996 return (false); 6997 } 6998 return (true); 6999 } 7000 7001 static void 7002 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7003 { 7004 const struct rndis_packet_msg *pkt; 7005 struct hn_rxinfo info; 7006 int data_off, pktinfo_off, data_len, pktinfo_len; 7007 7008 /* 7009 * Check length. 7010 */ 7011 if (__predict_false(dlen < sizeof(*pkt))) { 7012 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7013 return; 7014 } 7015 pkt = data; 7016 7017 if (__predict_false(dlen < pkt->rm_len)) { 7018 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7019 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7020 return; 7021 } 7022 if (__predict_false(pkt->rm_len < 7023 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7024 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7025 "msglen %u, data %u, oob %u, pktinfo %u\n", 7026 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7027 pkt->rm_pktinfolen); 7028 return; 7029 } 7030 if (__predict_false(pkt->rm_datalen == 0)) { 7031 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7032 return; 7033 } 7034 7035 /* 7036 * Check offsets. 7037 */ 7038 #define IS_OFFSET_INVALID(ofs) \ 7039 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7040 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7041 7042 /* XXX Hyper-V does not meet data offset alignment requirement */ 7043 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7044 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7045 "data offset %u\n", pkt->rm_dataoffset); 7046 return; 7047 } 7048 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7049 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7050 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7051 "oob offset %u\n", pkt->rm_oobdataoffset); 7052 return; 7053 } 7054 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7055 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7056 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7057 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7058 return; 7059 } 7060 7061 #undef IS_OFFSET_INVALID 7062 7063 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7064 data_len = pkt->rm_datalen; 7065 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7066 pktinfo_len = pkt->rm_pktinfolen; 7067 7068 /* 7069 * Check OOB coverage.
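* (Out-of-band data is not expected on the RX data path; if it is
* present it must lie within the message and must not overlap the
* data or pktinfo regions, as checked below.)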
7070 */ 7071 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7072 int oob_off, oob_len; 7073 7074 if_printf(rxr->hn_ifp, "got oobdata\n"); 7075 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7076 oob_len = pkt->rm_oobdatalen; 7077 7078 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7079 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7080 "oob overflow, msglen %u, oob abs %d len %d\n", 7081 pkt->rm_len, oob_off, oob_len); 7082 return; 7083 } 7084 7085 /* 7086 * Check against data. 7087 */ 7088 if (hn_rndis_check_overlap(oob_off, oob_len, 7089 data_off, data_len)) { 7090 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7091 "oob overlaps data, oob abs %d len %d, " 7092 "data abs %d len %d\n", 7093 oob_off, oob_len, data_off, data_len); 7094 return; 7095 } 7096 7097 /* 7098 * Check against pktinfo. 7099 */ 7100 if (pktinfo_len != 0 && 7101 hn_rndis_check_overlap(oob_off, oob_len, 7102 pktinfo_off, pktinfo_len)) { 7103 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7104 "oob overlaps pktinfo, oob abs %d len %d, " 7105 "pktinfo abs %d len %d\n", 7106 oob_off, oob_len, pktinfo_off, pktinfo_len); 7107 return; 7108 } 7109 } 7110 7111 /* 7112 * Check per-packet-info coverage and find useful per-packet-info. 7113 */ 7114 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; 7115 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; 7116 info.hash_info = HN_NDIS_HASH_INFO_INVALID; 7117 if (__predict_true(pktinfo_len != 0)) { 7118 bool overlap; 7119 int error; 7120 7121 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7122 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7123 "pktinfo overflow, msglen %u, " 7124 "pktinfo abs %d len %d\n", 7125 pkt->rm_len, pktinfo_off, pktinfo_len); 7126 return; 7127 } 7128 7129 /* 7130 * Check packet info coverage. 7131 */ 7132 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7133 data_off, data_len); 7134 if (__predict_false(overlap)) { 7135 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7136 "pktinfo overlap data, pktinfo abs %d len %d, " 7137 "data abs %d len %d\n", 7138 pktinfo_off, pktinfo_len, data_off, data_len); 7139 return; 7140 } 7141 7142 /* 7143 * Find useful per-packet-info. 7144 */ 7145 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7146 pktinfo_len, &info); 7147 if (__predict_false(error)) { 7148 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7149 "pktinfo\n"); 7150 return; 7151 } 7152 } 7153 7154 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7155 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7156 "data overflow, msglen %u, data abs %d len %d\n", 7157 pkt->rm_len, data_off, data_len); 7158 return; 7159 } 7160 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); 7161 } 7162 7163 static __inline void 7164 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7165 { 7166 const struct rndis_msghdr *hdr; 7167 7168 if (__predict_false(dlen < sizeof(*hdr))) { 7169 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7170 return; 7171 } 7172 hdr = data; 7173 7174 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7175 /* Hot data path. */ 7176 hn_rndis_rx_data(rxr, data, dlen); 7177 /* Done! 
*/ 7178 return; 7179 } 7180 7181 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7182 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7183 else 7184 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7185 } 7186 7187 static void 7188 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7189 { 7190 const struct hn_nvs_hdr *hdr; 7191 7192 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7193 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7194 return; 7195 } 7196 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7197 7198 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7199 /* Useless; ignore */ 7200 return; 7201 } 7202 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7203 } 7204 7205 static void 7206 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7207 const struct vmbus_chanpkt_hdr *pkt) 7208 { 7209 struct hn_nvs_sendctx *sndc; 7210 7211 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7212 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7213 VMBUS_CHANPKT_DATALEN(pkt)); 7214 /* 7215 * NOTE: 7216 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7217 * its callback. 7218 */ 7219 } 7220 7221 static void 7222 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7223 const struct vmbus_chanpkt_hdr *pkthdr) 7224 { 7225 const struct vmbus_chanpkt_rxbuf *pkt; 7226 const struct hn_nvs_hdr *nvs_hdr; 7227 int count, i, hlen; 7228 7229 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7230 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7231 return; 7232 } 7233 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7234 7235 /* Make sure that this is a RNDIS message. */ 7236 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7237 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7238 nvs_hdr->nvs_type); 7239 return; 7240 } 7241 7242 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7243 if (__predict_false(hlen < sizeof(*pkt))) { 7244 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7245 return; 7246 } 7247 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7248 7249 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7250 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7251 pkt->cp_rxbuf_id); 7252 return; 7253 } 7254 7255 count = pkt->cp_rxbuf_cnt; 7256 if (__predict_false(hlen < 7257 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7258 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7259 return; 7260 } 7261 7262 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7263 for (i = 0; i < count; ++i) { 7264 int ofs, len; 7265 7266 ofs = pkt->cp_rxbuf[i].rb_ofs; 7267 len = pkt->cp_rxbuf[i].rb_len; 7268 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7269 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7270 "ofs %d, len %d\n", i, ofs, len); 7271 continue; 7272 } 7273 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7274 } 7275 7276 /* 7277 * Ack the consumed RXBUF associated w/ this channel packet, 7278 * so that this RXBUF can be recycled by the hypervisor. 
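* (The ack is sent as a completion packet on the same channel; see
* hn_nvs_ack_rxbuf() below, which retries a few times with a small
* delay if the channel bufring is temporarily full.)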
7279 */ 7280 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7281 } 7282 7283 static void 7284 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7285 uint64_t tid) 7286 { 7287 struct hn_nvs_rndis_ack ack; 7288 int retries, error; 7289 7290 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7291 ack.nvs_status = HN_NVS_STATUS_OK; 7292 7293 retries = 0; 7294 again: 7295 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7296 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7297 if (__predict_false(error == EAGAIN)) { 7298 /* 7299 * NOTE: 7300 * This should _not_ happen in real world, since the 7301 * consumption of the TX bufring from the TX path is 7302 * controlled. 7303 */ 7304 if (rxr->hn_ack_failed == 0) 7305 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7306 rxr->hn_ack_failed++; 7307 retries++; 7308 if (retries < 10) { 7309 DELAY(100); 7310 goto again; 7311 } 7312 /* RXBUF leaks! */ 7313 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7314 } 7315 } 7316 7317 static void 7318 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7319 { 7320 struct hn_rx_ring *rxr = xrxr; 7321 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7322 7323 for (;;) { 7324 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7325 int error, pktlen; 7326 7327 pktlen = rxr->hn_pktbuf_len; 7328 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7329 if (__predict_false(error == ENOBUFS)) { 7330 void *nbuf; 7331 int nlen; 7332 7333 /* 7334 * Expand channel packet buffer. 7335 * 7336 * XXX 7337 * Use M_WAITOK here, since allocation failure 7338 * is fatal. 7339 */ 7340 nlen = rxr->hn_pktbuf_len * 2; 7341 while (nlen < pktlen) 7342 nlen *= 2; 7343 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7344 7345 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7346 rxr->hn_pktbuf_len, nlen); 7347 7348 free(rxr->hn_pktbuf, M_DEVBUF); 7349 rxr->hn_pktbuf = nbuf; 7350 rxr->hn_pktbuf_len = nlen; 7351 /* Retry! */ 7352 continue; 7353 } else if (__predict_false(error == EAGAIN)) { 7354 /* No more channel packets; done! */ 7355 break; 7356 } 7357 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7358 7359 switch (pkt->cph_type) { 7360 case VMBUS_CHANPKT_TYPE_COMP: 7361 hn_nvs_handle_comp(sc, chan, pkt); 7362 break; 7363 7364 case VMBUS_CHANPKT_TYPE_RXBUF: 7365 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7366 break; 7367 7368 case VMBUS_CHANPKT_TYPE_INBAND: 7369 hn_nvs_handle_notify(sc, pkt); 7370 break; 7371 7372 default: 7373 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7374 pkt->cph_type); 7375 break; 7376 } 7377 } 7378 hn_chan_rollup(rxr, rxr->hn_txr); 7379 } 7380 7381 static void 7382 hn_sysinit(void *arg __unused) 7383 { 7384 int i; 7385 7386 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7387 7388 #ifdef HN_IFSTART_SUPPORT 7389 /* 7390 * Don't use ifnet.if_start if transparent VF mode is requested; 7391 * mainly due to the IFF_DRV_OACTIVE flag. 7392 */ 7393 if (hn_xpnt_vf && hn_use_if_start) { 7394 hn_use_if_start = 0; 7395 printf("hn: tranparent VF mode, if_transmit will be used, " 7396 "instead of if_start\n"); 7397 } 7398 #endif 7399 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7400 printf("hn: invalid transparent VF attach routing " 7401 "wait timeout %d, reset to %d\n", 7402 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7403 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7404 } 7405 7406 /* 7407 * Initialize VF map. 
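* (hn_vfmap is a zeroed array of ifnet pointers, initially
* HN_VFMAP_SIZE_DEF entries long, protected by the sleepable
* rmlock hn_vfmap_lock.)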
7408 */ 7409 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7410 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7411 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7412 M_WAITOK | M_ZERO); 7413 7414 /* 7415 * Fix the # of TX taskqueues. 7416 */ 7417 if (hn_tx_taskq_cnt <= 0) 7418 hn_tx_taskq_cnt = 1; 7419 else if (hn_tx_taskq_cnt > mp_ncpus) 7420 hn_tx_taskq_cnt = mp_ncpus; 7421 7422 /* 7423 * Fix the TX taskqueue mode. 7424 */ 7425 switch (hn_tx_taskq_mode) { 7426 case HN_TX_TASKQ_M_INDEP: 7427 case HN_TX_TASKQ_M_GLOBAL: 7428 case HN_TX_TASKQ_M_EVTTQ: 7429 break; 7430 default: 7431 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7432 break; 7433 } 7434 7435 if (vm_guest != VM_GUEST_HV) 7436 return; 7437 7438 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7439 return; 7440 7441 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7442 M_DEVBUF, M_WAITOK); 7443 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7444 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7445 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7446 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7447 "hn tx%d", i); 7448 } 7449 } 7450 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7451 7452 static void 7453 hn_sysuninit(void *arg __unused) 7454 { 7455 7456 if (hn_tx_taskque != NULL) { 7457 int i; 7458 7459 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7460 taskqueue_free(hn_tx_taskque[i]); 7461 free(hn_tx_taskque, M_DEVBUF); 7462 } 7463 7464 if (hn_vfmap != NULL) 7465 free(hn_vfmap, M_DEVBUF); 7466 rm_destroy(&hn_vfmap_lock); 7467 7468 counter_u64_free(hn_udpcs_fixup); 7469 } 7470 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7471