1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 struct ifnet *vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(struct ifnet *); 276 #endif 277 static int hn_transmit(struct ifnet *, struct mbuf *); 278 static void hn_xmit_qflush(struct ifnet *); 279 static int hn_ifmedia_upd(struct 
ifnet *); 280 static void hn_ifmedia_sts(struct ifnet *, 281 struct ifmediareq *); 282 283 static void hn_ifnet_event(void *, struct ifnet *, int); 284 static void hn_ifaddr_event(void *, struct ifnet *); 285 static void hn_ifnet_attevent(void *, struct ifnet *); 286 static void hn_ifnet_detevent(void *, struct ifnet *); 287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const struct ifnet *); 291 static void hn_rxvf_change(struct hn_softc *, 292 struct ifnet *, bool); 293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 #if __FreeBSD_version >= 1100099 329 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 331 #endif 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 #if __FreeBSD_version < 1100095 335 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 336 #else 337 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 338 #endif 339 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 346 #ifndef RSS 347 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 349 #endif 350 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 357 static int 
hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 363 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 364 365 static void hn_stop(struct hn_softc *, bool); 366 static void hn_init_locked(struct hn_softc *); 367 static int hn_chan_attach(struct hn_softc *, 368 struct vmbus_channel *); 369 static void hn_chan_detach(struct hn_softc *, 370 struct vmbus_channel *); 371 static int hn_attach_subchans(struct hn_softc *); 372 static void hn_detach_allchans(struct hn_softc *); 373 static void hn_chan_rollup(struct hn_rx_ring *, 374 struct hn_tx_ring *); 375 static void hn_set_ring_inuse(struct hn_softc *, int); 376 static int hn_synth_attach(struct hn_softc *, int); 377 static void hn_synth_detach(struct hn_softc *); 378 static int hn_synth_alloc_subchans(struct hn_softc *, 379 int *); 380 static bool hn_synth_attachable(const struct hn_softc *); 381 static void hn_suspend(struct hn_softc *); 382 static void hn_suspend_data(struct hn_softc *); 383 static void hn_suspend_mgmt(struct hn_softc *); 384 static void hn_resume(struct hn_softc *); 385 static void hn_resume_data(struct hn_softc *); 386 static void hn_resume_mgmt(struct hn_softc *); 387 static void hn_suspend_mgmt_taskfunc(void *, int); 388 static void hn_chan_drain(struct hn_softc *, 389 struct vmbus_channel *); 390 static void hn_disable_rx(struct hn_softc *); 391 static void hn_drain_rxtx(struct hn_softc *, int); 392 static void hn_polling(struct hn_softc *, u_int); 393 static void hn_chan_polling(struct vmbus_channel *, u_int); 394 static void hn_mtu_change_fixup(struct hn_softc *); 395 396 static void hn_update_link_status(struct hn_softc *); 397 static void hn_change_network(struct hn_softc *); 398 static void hn_link_taskfunc(void *, int); 399 static void hn_netchg_init_taskfunc(void *, int); 400 static void hn_netchg_status_taskfunc(void *, int); 401 static void hn_link_status(struct hn_softc *); 402 403 static int hn_create_rx_data(struct hn_softc *, int); 404 static void hn_destroy_rx_data(struct hn_softc *); 405 static int hn_check_iplen(const struct mbuf *, int); 406 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 407 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 408 static int hn_rxfilter_config(struct hn_softc *); 409 static int hn_rss_reconfig(struct hn_softc *); 410 static void hn_rss_ind_fixup(struct hn_softc *); 411 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 412 static int hn_rxpkt(struct hn_rx_ring *); 413 static uint32_t hn_rss_type_fromndis(uint32_t); 414 static uint32_t hn_rss_type_tondis(uint32_t); 415 416 static int hn_tx_ring_create(struct hn_softc *, int); 417 static void hn_tx_ring_destroy(struct hn_tx_ring *); 418 static int hn_create_tx_data(struct hn_softc *, int); 419 static void hn_fixup_tx_data(struct hn_softc *); 420 static void hn_fixup_rx_data(struct hn_softc *); 421 static void hn_destroy_tx_data(struct hn_softc *); 422 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 423 static void hn_txdesc_gc(struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 426 struct hn_txdesc *, struct mbuf **); 427 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 428 struct hn_txdesc *); 429 static void hn_set_chim_size(struct hn_softc *, int); 430 
static void hn_set_tso_maxsize(struct hn_softc *, int, int); 431 static bool hn_tx_ring_pending(struct hn_tx_ring *); 432 static void hn_tx_ring_qflush(struct hn_tx_ring *); 433 static void hn_resume_tx(struct hn_softc *, int); 434 static void hn_set_txagg(struct hn_softc *); 435 static void *hn_try_txagg(struct ifnet *, 436 struct hn_tx_ring *, struct hn_txdesc *, 437 int); 438 static int hn_get_txswq_depth(const struct hn_tx_ring *); 439 static void hn_txpkt_done(struct hn_nvs_sendctx *, 440 struct hn_softc *, struct vmbus_channel *, 441 const void *, int); 442 static int hn_txpkt_sglist(struct hn_tx_ring *, 443 struct hn_txdesc *); 444 static int hn_txpkt_chim(struct hn_tx_ring *, 445 struct hn_txdesc *); 446 static int hn_xmit(struct hn_tx_ring *, int); 447 static void hn_xmit_taskfunc(void *, int); 448 static void hn_xmit_txeof(struct hn_tx_ring *); 449 static void hn_xmit_txeof_taskfunc(void *, int); 450 #ifdef HN_IFSTART_SUPPORT 451 static int hn_start_locked(struct hn_tx_ring *, int); 452 static void hn_start_taskfunc(void *, int); 453 static void hn_start_txeof(struct hn_tx_ring *); 454 static void hn_start_txeof_taskfunc(void *, int); 455 #endif 456 457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 458 "Hyper-V network interface"); 459 460 /* Trust tcp segment verification on host side. */ 461 static int hn_trust_hosttcp = 1; 462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 463 &hn_trust_hosttcp, 0, 464 "Trust tcp segment verification on host side, " 465 "when csum info is missing (global setting)"); 466 467 /* Trust udp datagrams verification on host side. */ 468 static int hn_trust_hostudp = 1; 469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 470 &hn_trust_hostudp, 0, 471 "Trust udp datagram verification on host side, " 472 "when csum info is missing (global setting)"); 473 474 /* Trust ip packets verification on host side. */ 475 static int hn_trust_hostip = 1; 476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 477 &hn_trust_hostip, 0, 478 "Trust ip packet verification on host side, " 479 "when csum info is missing (global setting)"); 480 481 /* 482 * Offload UDP/IPv4 checksum. 483 */ 484 static int hn_enable_udp4cs = 1; 485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 486 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 487 488 /* 489 * Offload UDP/IPv6 checksum. 490 */ 491 static int hn_enable_udp6cs = 1; 492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 493 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 494 495 /* Stats. */ 496 static counter_u64_t hn_udpcs_fixup; 497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 498 &hn_udpcs_fixup, "# of UDP checksum fixup"); 499 500 /* 501 * See hn_set_hlen(). 502 * 503 * This value is for Azure. For Hyper-V, set this above 504 * 65536 to disable UDP datagram checksum fixup. 
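 *
 * hn_set_hlen() applies the fixup only to UDP/IPv4 datagrams whose
 * IP length (IP hdr + UDP hdr + payload) exceeds this threshold and
 * whose IP_DF bit is not set; their checksum is then computed in
 * software and the hn_udpcs_fixup counter above is bumped.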
505 */ 506 static int hn_udpcs_fixup_mtu = 1420; 507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 508 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 509 510 /* Limit TSO burst size */ 511 static int hn_tso_maxlen = IP_MAXPACKET; 512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 513 &hn_tso_maxlen, 0, "TSO burst limit"); 514 515 /* Limit chimney send size */ 516 static int hn_tx_chimney_size = 0; 517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 518 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 519 520 /* Limit the size of packet for direct transmission */ 521 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 523 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 524 525 /* # of LRO entries per RX ring */ 526 #if defined(INET) || defined(INET6) 527 #if __FreeBSD_version >= 1100095 528 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 529 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 530 &hn_lro_entry_count, 0, "LRO entry count"); 531 #endif 532 #endif 533 534 static int hn_tx_taskq_cnt = 1; 535 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 536 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 537 538 #define HN_TX_TASKQ_M_INDEP 0 539 #define HN_TX_TASKQ_M_GLOBAL 1 540 #define HN_TX_TASKQ_M_EVTTQ 2 541 542 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 543 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 544 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 545 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 546 547 #ifndef HN_USE_TXDESC_BUFRING 548 static int hn_use_txdesc_bufring = 0; 549 #else 550 static int hn_use_txdesc_bufring = 1; 551 #endif 552 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 553 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 554 555 #ifdef HN_IFSTART_SUPPORT 556 /* Use ifnet.if_start instead of ifnet.if_transmit */ 557 static int hn_use_if_start = 0; 558 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 559 &hn_use_if_start, 0, "Use if_start TX method"); 560 #endif 561 562 /* # of channels to use */ 563 static int hn_chan_cnt = 0; 564 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 565 &hn_chan_cnt, 0, 566 "# of channels to use; each channel has one RX ring and one TX ring"); 567 568 /* # of transmit rings to use */ 569 static int hn_tx_ring_cnt = 0; 570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 571 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 572 573 /* Software TX ring deptch */ 574 static int hn_tx_swq_depth = 0; 575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 576 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 577 578 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 579 #if __FreeBSD_version >= 1100095 580 static u_int hn_lro_mbufq_depth = 0; 581 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 582 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 583 #endif 584 585 /* Packet transmission aggregation size limit */ 586 static int hn_tx_agg_size = -1; 587 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 588 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 589 590 /* Packet transmission aggregation count limit */ 591 static int hn_tx_agg_pkts = -1; 592 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 593 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 594 595 /* VF list */ 596 SYSCTL_PROC(_hw_hn, 
    OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt =
sc->hn_chim_bmap_cnt; 710 u_long *bmap = sc->hn_chim_bmap; 711 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 712 713 for (i = 0; i < bmap_cnt; ++i) { 714 int idx; 715 716 idx = ffsl(~bmap[i]); 717 if (idx == 0) 718 continue; 719 720 --idx; /* ffsl is 1-based */ 721 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 722 ("invalid i %d and idx %d", i, idx)); 723 724 if (atomic_testandset_long(&bmap[i], idx)) 725 continue; 726 727 ret = i * LONG_BIT + idx; 728 break; 729 } 730 return (ret); 731 } 732 733 static __inline void 734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 735 { 736 u_long mask; 737 uint32_t idx; 738 739 idx = chim_idx / LONG_BIT; 740 KASSERT(idx < sc->hn_chim_bmap_cnt, 741 ("invalid chimney index 0x%x", chim_idx)); 742 743 mask = 1UL << (chim_idx % LONG_BIT); 744 KASSERT(sc->hn_chim_bmap[idx] & mask, 745 ("index bitmap 0x%lx, chimney index %u, " 746 "bitmap idx %d, bitmask 0x%lx", 747 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 748 749 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 750 } 751 752 #if defined(INET6) || defined(INET) 753 754 #define PULLUP_HDR(m, len) \ 755 do { \ 756 if (__predict_false((m)->m_len < (len))) { \ 757 (m) = m_pullup((m), (len)); \ 758 if ((m) == NULL) \ 759 return (NULL); \ 760 } \ 761 } while (0) 762 763 /* 764 * NOTE: If this function failed, the m_head would be freed. 765 */ 766 static __inline struct mbuf * 767 hn_tso_fixup(struct mbuf *m_head) 768 { 769 struct ether_vlan_header *evl; 770 struct tcphdr *th; 771 int ehlen; 772 773 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 774 775 PULLUP_HDR(m_head, sizeof(*evl)); 776 evl = mtod(m_head, struct ether_vlan_header *); 777 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 778 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 779 else 780 ehlen = ETHER_HDR_LEN; 781 m_head->m_pkthdr.l2hlen = ehlen; 782 783 #ifdef INET 784 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 785 struct ip *ip; 786 int iphlen; 787 788 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 789 ip = mtodo(m_head, ehlen); 790 iphlen = ip->ip_hl << 2; 791 m_head->m_pkthdr.l3hlen = iphlen; 792 793 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 794 th = mtodo(m_head, ehlen + iphlen); 795 796 ip->ip_len = 0; 797 ip->ip_sum = 0; 798 th->th_sum = in_pseudo(ip->ip_src.s_addr, 799 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 800 } 801 #endif 802 #if defined(INET6) && defined(INET) 803 else 804 #endif 805 #ifdef INET6 806 { 807 struct ip6_hdr *ip6; 808 809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 810 ip6 = mtodo(m_head, ehlen); 811 if (ip6->ip6_nxt != IPPROTO_TCP) { 812 m_freem(m_head); 813 return (NULL); 814 } 815 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 816 817 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 818 th = mtodo(m_head, ehlen + sizeof(*ip6)); 819 820 ip6->ip6_plen = 0; 821 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 822 } 823 #endif 824 return (m_head); 825 } 826 827 /* 828 * NOTE: If this function failed, the m_head would be freed. 
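 *
 * hn_set_hlen() records the Ethernet and IP/IPv6 header lengths in
 * m_pkthdr.l2hlen/l3hlen (used, e.g., by hn_check_tcpsyn() below) and
 * applies the Azure UDP/IPv4 checksum fixup controlled by
 * hn_udpcs_fixup_mtu.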
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
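	 *
	 * Otherwise the filter is built from the interface flags:
	 * directed frames always, broadcast for IFF_BROADCAST, and
	 * all-multicast for IFF_ALLMULTI or any joined multicast group.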
947 */ 948 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 949 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 950 } else { 951 filter = NDIS_PACKET_TYPE_DIRECTED; 952 if (ifp->if_flags & IFF_BROADCAST) 953 filter |= NDIS_PACKET_TYPE_BROADCAST; 954 /* TODO: support multicast list */ 955 if ((ifp->if_flags & IFF_ALLMULTI) || 956 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 957 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 958 } 959 return (hn_set_rxfilter(sc, filter)); 960 } 961 962 static void 963 hn_set_txagg(struct hn_softc *sc) 964 { 965 uint32_t size, pkts; 966 int i; 967 968 /* 969 * Setup aggregation size. 970 */ 971 if (sc->hn_agg_size < 0) 972 size = UINT32_MAX; 973 else 974 size = sc->hn_agg_size; 975 976 if (sc->hn_rndis_agg_size < size) 977 size = sc->hn_rndis_agg_size; 978 979 /* NOTE: We only aggregate packets using chimney sending buffers. */ 980 if (size > (uint32_t)sc->hn_chim_szmax) 981 size = sc->hn_chim_szmax; 982 983 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 984 /* Disable */ 985 size = 0; 986 pkts = 0; 987 goto done; 988 } 989 990 /* NOTE: Type of the per TX ring setting is 'int'. */ 991 if (size > INT_MAX) 992 size = INT_MAX; 993 994 /* 995 * Setup aggregation packet count. 996 */ 997 if (sc->hn_agg_pkts < 0) 998 pkts = UINT32_MAX; 999 else 1000 pkts = sc->hn_agg_pkts; 1001 1002 if (sc->hn_rndis_agg_pkts < pkts) 1003 pkts = sc->hn_rndis_agg_pkts; 1004 1005 if (pkts <= 1) { 1006 /* Disable */ 1007 size = 0; 1008 pkts = 0; 1009 goto done; 1010 } 1011 1012 /* NOTE: Type of the per TX ring setting is 'short'. */ 1013 if (pkts > SHRT_MAX) 1014 pkts = SHRT_MAX; 1015 1016 done: 1017 /* NOTE: Type of the per TX ring setting is 'short'. */ 1018 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1019 /* Disable */ 1020 size = 0; 1021 pkts = 0; 1022 } 1023 1024 if (bootverbose) { 1025 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1026 size, pkts, sc->hn_rndis_agg_align); 1027 } 1028 1029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1030 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1031 1032 mtx_lock(&txr->hn_tx_lock); 1033 txr->hn_agg_szmax = size; 1034 txr->hn_agg_pktmax = pkts; 1035 txr->hn_agg_align = sc->hn_rndis_agg_align; 1036 mtx_unlock(&txr->hn_tx_lock); 1037 } 1038 } 1039 1040 static int 1041 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1042 { 1043 1044 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1045 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1046 return txr->hn_txdesc_cnt; 1047 return hn_tx_swq_depth; 1048 } 1049 1050 static int 1051 hn_rss_reconfig(struct hn_softc *sc) 1052 { 1053 int error; 1054 1055 HN_LOCK_ASSERT(sc); 1056 1057 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1058 return (ENXIO); 1059 1060 /* 1061 * Disable RSS first. 1062 * 1063 * NOTE: 1064 * Direct reconfiguration by setting the UNCHG flags does 1065 * _not_ work properly. 1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "disable RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1072 return (error); 1073 } 1074 1075 /* 1076 * Reenable the RSS w/ the updated RSS key or indirect 1077 * table. 
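	 *
	 * hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE) below reapplies the
	 * RSS configuration from the current contents of sc->hn_rss.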
1078 */ 1079 if (bootverbose) 1080 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1081 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1082 if (error) { 1083 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1084 return (error); 1085 } 1086 return (0); 1087 } 1088 1089 static void 1090 hn_rss_ind_fixup(struct hn_softc *sc) 1091 { 1092 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1093 int i, nchan; 1094 1095 nchan = sc->hn_rx_ring_inuse; 1096 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1097 1098 /* 1099 * Check indirect table to make sure that all channels in it 1100 * can be used. 1101 */ 1102 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1103 if (rss->rss_ind[i] >= nchan) { 1104 if_printf(sc->hn_ifp, 1105 "RSS indirect table %d fixup: %u -> %d\n", 1106 i, rss->rss_ind[i], nchan - 1); 1107 rss->rss_ind[i] = nchan - 1; 1108 } 1109 } 1110 } 1111 1112 static int 1113 hn_ifmedia_upd(struct ifnet *ifp __unused) 1114 { 1115 1116 return EOPNOTSUPP; 1117 } 1118 1119 static void 1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1121 { 1122 struct hn_softc *sc = ifp->if_softc; 1123 1124 ifmr->ifm_status = IFM_AVALID; 1125 ifmr->ifm_active = IFM_ETHER; 1126 1127 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1128 ifmr->ifm_active |= IFM_NONE; 1129 return; 1130 } 1131 ifmr->ifm_status |= IFM_ACTIVE; 1132 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1133 } 1134 1135 static void 1136 hn_rxvf_set_task(void *xarg, int pending __unused) 1137 { 1138 struct hn_rxvf_setarg *arg = xarg; 1139 1140 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1141 } 1142 1143 static void 1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1145 { 1146 struct hn_rx_ring *rxr; 1147 struct hn_rxvf_setarg arg; 1148 struct task task; 1149 int i; 1150 1151 HN_LOCK_ASSERT(sc); 1152 1153 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1154 1155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1156 rxr = &sc->hn_rx_ring[i]; 1157 1158 if (i < sc->hn_rx_ring_inuse) { 1159 arg.rxr = rxr; 1160 arg.vf_ifp = vf_ifp; 1161 vmbus_chan_run_task(rxr->hn_chan, &task); 1162 } else { 1163 rxr->hn_rxvf_ifp = vf_ifp; 1164 } 1165 } 1166 } 1167 1168 static bool 1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1170 { 1171 const struct ifnet *hn_ifp; 1172 1173 hn_ifp = sc->hn_ifp; 1174 1175 if (ifp == hn_ifp) 1176 return (false); 1177 1178 if (ifp->if_alloctype != IFT_ETHER) 1179 return (false); 1180 1181 /* Ignore lagg/vlan interfaces */ 1182 if (strcmp(ifp->if_dname, "lagg") == 0 || 1183 strcmp(ifp->if_dname, "vlan") == 0) 1184 return (false); 1185 1186 /* 1187 * During detach events ifp->if_addr might be NULL. 
1188 * Make sure the bcmp() below doesn't panic on that: 1189 */ 1190 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1191 return (false); 1192 1193 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1194 return (false); 1195 1196 return (true); 1197 } 1198 1199 static void 1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1201 { 1202 struct ifnet *hn_ifp; 1203 1204 HN_LOCK(sc); 1205 1206 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1207 goto out; 1208 1209 if (!hn_ismyvf(sc, ifp)) 1210 goto out; 1211 hn_ifp = sc->hn_ifp; 1212 1213 if (rxvf) { 1214 if (sc->hn_flags & HN_FLAG_RXVF) 1215 goto out; 1216 1217 sc->hn_flags |= HN_FLAG_RXVF; 1218 hn_rxfilter_config(sc); 1219 } else { 1220 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1221 goto out; 1222 1223 sc->hn_flags &= ~HN_FLAG_RXVF; 1224 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1225 hn_rxfilter_config(sc); 1226 else 1227 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1228 } 1229 1230 hn_nvs_set_datapath(sc, 1231 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1232 1233 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1234 1235 if (rxvf) { 1236 hn_vf_rss_fixup(sc, true); 1237 hn_suspend_mgmt(sc); 1238 sc->hn_link_flags &= 1239 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1240 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1241 } else { 1242 hn_vf_rss_restore(sc); 1243 hn_resume_mgmt(sc); 1244 } 1245 1246 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1247 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1248 1249 if (bootverbose) { 1250 if_printf(hn_ifp, "datapath is switched %s %s\n", 1251 rxvf ? "to" : "from", ifp->if_xname); 1252 } 1253 out: 1254 HN_UNLOCK(sc); 1255 } 1256 1257 static void 1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1259 { 1260 1261 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1262 return; 1263 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1264 } 1265 1266 static void 1267 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1268 { 1269 1270 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1271 } 1272 1273 static int 1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1275 { 1276 struct ifnet *ifp, *vf_ifp; 1277 uint64_t tmp; 1278 int error; 1279 1280 HN_LOCK_ASSERT(sc); 1281 ifp = sc->hn_ifp; 1282 vf_ifp = sc->hn_vf_ifp; 1283 1284 /* 1285 * Fix up requested capabilities w/ supported capabilities, 1286 * since the supported capabilities could have been changed. 1287 */ 1288 ifr->ifr_reqcap &= ifp->if_capabilities; 1289 /* Pass SIOCSIFCAP to VF. */ 1290 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1291 1292 /* 1293 * NOTE: 1294 * The error will be propagated to the callers, however, it 1295 * is _not_ useful here. 1296 */ 1297 1298 /* 1299 * Merge VF's enabled capabilities. 
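	 *
	 * The if_hwassist bits below are taken from the VF, but each group
	 * is only kept when the corresponding IFCAP_* bit stays enabled.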
1300 */ 1301 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1302 1303 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1304 if (ifp->if_capenable & IFCAP_TXCSUM) 1305 ifp->if_hwassist |= tmp; 1306 else 1307 ifp->if_hwassist &= ~tmp; 1308 1309 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1310 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1311 ifp->if_hwassist |= tmp; 1312 else 1313 ifp->if_hwassist &= ~tmp; 1314 1315 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1316 if (ifp->if_capenable & IFCAP_TSO4) 1317 ifp->if_hwassist |= tmp; 1318 else 1319 ifp->if_hwassist &= ~tmp; 1320 1321 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1322 if (ifp->if_capenable & IFCAP_TSO6) 1323 ifp->if_hwassist |= tmp; 1324 else 1325 ifp->if_hwassist &= ~tmp; 1326 1327 return (error); 1328 } 1329 1330 static int 1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1332 { 1333 struct ifnet *vf_ifp; 1334 struct ifreq ifr; 1335 1336 HN_LOCK_ASSERT(sc); 1337 vf_ifp = sc->hn_vf_ifp; 1338 1339 memset(&ifr, 0, sizeof(ifr)); 1340 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1341 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1342 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1343 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1344 } 1345 1346 static void 1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1348 { 1349 struct ifnet *ifp = sc->hn_ifp; 1350 int allmulti = 0; 1351 1352 HN_LOCK_ASSERT(sc); 1353 1354 /* XXX vlan(4) style mcast addr maintenance */ 1355 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 1356 allmulti = IFF_ALLMULTI; 1357 1358 /* Always set the VF's if_flags */ 1359 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1360 } 1361 1362 static void 1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1364 { 1365 struct rm_priotracker pt; 1366 struct ifnet *hn_ifp = NULL; 1367 struct mbuf *mn; 1368 1369 /* 1370 * XXX racy, if hn(4) ever detached. 1371 */ 1372 rm_rlock(&hn_vfmap_lock, &pt); 1373 if (vf_ifp->if_index < hn_vfmap_size) 1374 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1375 rm_runlock(&hn_vfmap_lock, &pt); 1376 1377 if (hn_ifp != NULL) { 1378 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1379 /* 1380 * Allow tapping on the VF. 1381 */ 1382 ETHER_BPF_MTAP(vf_ifp, mn); 1383 1384 /* 1385 * Update VF stats. 1386 */ 1387 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1388 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1389 mn->m_pkthdr.len); 1390 } 1391 /* 1392 * XXX IFCOUNTER_IMCAST 1393 * This stat updating is kinda invasive, since it 1394 * requires two checks on the mbuf: the length check 1395 * and the ethernet header check. As of this write, 1396 * all multicast packets go directly to hn(4), which 1397 * makes imcast stat updating in the VF a try in vian. 1398 */ 1399 1400 /* 1401 * Fix up rcvif and increase hn(4)'s ipackets. 1402 */ 1403 mn->m_pkthdr.rcvif = hn_ifp; 1404 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1405 } 1406 /* 1407 * Go through hn(4)'s if_input. 1408 */ 1409 hn_ifp->if_input(hn_ifp, m); 1410 } else { 1411 /* 1412 * In the middle of the transition; free this 1413 * mbuf chain. 
1414 */ 1415 while (m != NULL) { 1416 mn = m->m_nextpkt; 1417 m->m_nextpkt = NULL; 1418 m_freem(m); 1419 m = mn; 1420 } 1421 } 1422 } 1423 1424 static void 1425 hn_mtu_change_fixup(struct hn_softc *sc) 1426 { 1427 struct ifnet *ifp; 1428 1429 HN_LOCK_ASSERT(sc); 1430 ifp = sc->hn_ifp; 1431 1432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1433 #if __FreeBSD_version >= 1100099 1434 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1435 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1436 #endif 1437 } 1438 1439 static uint32_t 1440 hn_rss_type_fromndis(uint32_t rss_hash) 1441 { 1442 uint32_t types = 0; 1443 1444 if (rss_hash & NDIS_HASH_IPV4) 1445 types |= RSS_TYPE_IPV4; 1446 if (rss_hash & NDIS_HASH_TCP_IPV4) 1447 types |= RSS_TYPE_TCP_IPV4; 1448 if (rss_hash & NDIS_HASH_IPV6) 1449 types |= RSS_TYPE_IPV6; 1450 if (rss_hash & NDIS_HASH_IPV6_EX) 1451 types |= RSS_TYPE_IPV6_EX; 1452 if (rss_hash & NDIS_HASH_TCP_IPV6) 1453 types |= RSS_TYPE_TCP_IPV6; 1454 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1455 types |= RSS_TYPE_TCP_IPV6_EX; 1456 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1457 types |= RSS_TYPE_UDP_IPV4; 1458 return (types); 1459 } 1460 1461 static uint32_t 1462 hn_rss_type_tondis(uint32_t types) 1463 { 1464 uint32_t rss_hash = 0; 1465 1466 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1467 ("UDP6 and UDP6EX are not supported")); 1468 1469 if (types & RSS_TYPE_IPV4) 1470 rss_hash |= NDIS_HASH_IPV4; 1471 if (types & RSS_TYPE_TCP_IPV4) 1472 rss_hash |= NDIS_HASH_TCP_IPV4; 1473 if (types & RSS_TYPE_IPV6) 1474 rss_hash |= NDIS_HASH_IPV6; 1475 if (types & RSS_TYPE_IPV6_EX) 1476 rss_hash |= NDIS_HASH_IPV6_EX; 1477 if (types & RSS_TYPE_TCP_IPV6) 1478 rss_hash |= NDIS_HASH_TCP_IPV6; 1479 if (types & RSS_TYPE_TCP_IPV6_EX) 1480 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1481 if (types & RSS_TYPE_UDP_IPV4) 1482 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1483 return (rss_hash); 1484 } 1485 1486 static void 1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1488 { 1489 int i; 1490 1491 HN_LOCK_ASSERT(sc); 1492 1493 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1494 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1495 } 1496 1497 static void 1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1499 { 1500 struct ifnet *ifp, *vf_ifp; 1501 struct ifrsshash ifrh; 1502 struct ifrsskey ifrk; 1503 int error; 1504 uint32_t my_types, diff_types, mbuf_types = 0; 1505 1506 HN_LOCK_ASSERT(sc); 1507 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1508 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1509 1510 if (sc->hn_rx_ring_inuse == 1) { 1511 /* No RSS on synthetic parts; done. */ 1512 return; 1513 } 1514 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1515 /* Synthetic parts do not support Toeplitz; done. */ 1516 return; 1517 } 1518 1519 ifp = sc->hn_ifp; 1520 vf_ifp = sc->hn_vf_ifp; 1521 1522 /* 1523 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1524 * supported. 
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed. "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1617 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1618 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1619 } 1620 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1622 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1623 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1624 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1625 } 1626 if ((my_types & RSS_TYPE_UDP_IPV6) && 1627 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1628 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1629 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1630 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1631 } 1632 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1633 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1634 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1635 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1636 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1637 } 1638 1639 /* 1640 * Indirect table does not matter. 1641 */ 1642 1643 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1644 hn_rss_type_tondis(my_types); 1645 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1646 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1647 1648 if (reconf) { 1649 error = hn_rss_reconfig(sc); 1650 if (error) { 1651 /* XXX roll-back? */ 1652 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1653 /* XXX keep going. */ 1654 } 1655 } 1656 done: 1657 /* Hash deliverability for mbufs. */ 1658 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1659 } 1660 1661 static void 1662 hn_vf_rss_restore(struct hn_softc *sc) 1663 { 1664 1665 HN_LOCK_ASSERT(sc); 1666 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1667 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1668 1669 if (sc->hn_rx_ring_inuse == 1) 1670 goto done; 1671 1672 /* 1673 * Restore hash types. Key does _not_ matter. 1674 */ 1675 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1676 int error; 1677 1678 sc->hn_rss_hash = sc->hn_rss_hcap; 1679 error = hn_rss_reconfig(sc); 1680 if (error) { 1681 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1682 error); 1683 /* XXX keep going. */ 1684 } 1685 } 1686 done: 1687 /* Hash deliverability for mbufs. */ 1688 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1689 } 1690 1691 static void 1692 hn_xpnt_vf_setready(struct hn_softc *sc) 1693 { 1694 struct ifnet *ifp, *vf_ifp; 1695 struct ifreq ifr; 1696 1697 HN_LOCK_ASSERT(sc); 1698 ifp = sc->hn_ifp; 1699 vf_ifp = sc->hn_vf_ifp; 1700 1701 /* 1702 * Mark the VF ready. 1703 */ 1704 sc->hn_vf_rdytick = 0; 1705 1706 /* 1707 * Save information for restoration. 1708 */ 1709 sc->hn_saved_caps = ifp->if_capabilities; 1710 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1711 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1712 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1713 1714 /* 1715 * Intersect supported/enabled capabilities. 1716 * 1717 * NOTE: 1718 * if_hwassist is not changed here. 1719 */ 1720 ifp->if_capabilities &= vf_ifp->if_capabilities; 1721 ifp->if_capenable &= ifp->if_capabilities; 1722 1723 /* 1724 * Fix TSO settings. 1725 */ 1726 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1727 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1728 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1729 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1730 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1731 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1732 1733 /* 1734 * Change VF's enabled capabilities. 
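	 *
	 * The merged if_capenable is pushed down to the VF with SIOCSIFCAP
	 * through hn_xpnt_vf_iocsetcaps() below.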
1735 */ 1736 memset(&ifr, 0, sizeof(ifr)); 1737 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1738 ifr.ifr_reqcap = ifp->if_capenable; 1739 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1740 1741 if (ifp->if_mtu != ETHERMTU) { 1742 int error; 1743 1744 /* 1745 * Change VF's MTU. 1746 */ 1747 memset(&ifr, 0, sizeof(ifr)); 1748 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1749 ifr.ifr_mtu = ifp->if_mtu; 1750 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1751 if (error) { 1752 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1753 vf_ifp->if_xname, ifp->if_mtu); 1754 if (ifp->if_mtu > ETHERMTU) { 1755 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1756 1757 /* 1758 * XXX 1759 * No need to adjust the synthetic parts' MTU; 1760 * failure of the adjustment will cause us 1761 * infinite headache. 1762 */ 1763 ifp->if_mtu = ETHERMTU; 1764 hn_mtu_change_fixup(sc); 1765 } 1766 } 1767 } 1768 } 1769 1770 static bool 1771 hn_xpnt_vf_isready(struct hn_softc *sc) 1772 { 1773 1774 HN_LOCK_ASSERT(sc); 1775 1776 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1777 return (false); 1778 1779 if (sc->hn_vf_rdytick == 0) 1780 return (true); 1781 1782 if (sc->hn_vf_rdytick > ticks) 1783 return (false); 1784 1785 /* Mark VF as ready. */ 1786 hn_xpnt_vf_setready(sc); 1787 return (true); 1788 } 1789 1790 static void 1791 hn_xpnt_vf_setenable(struct hn_softc *sc) 1792 { 1793 int i; 1794 1795 HN_LOCK_ASSERT(sc); 1796 1797 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1798 rm_wlock(&sc->hn_vf_lock); 1799 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1800 rm_wunlock(&sc->hn_vf_lock); 1801 1802 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1803 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1804 } 1805 1806 static void 1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1808 { 1809 int i; 1810 1811 HN_LOCK_ASSERT(sc); 1812 1813 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1814 rm_wlock(&sc->hn_vf_lock); 1815 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1816 if (clear_vf) 1817 sc->hn_vf_ifp = NULL; 1818 rm_wunlock(&sc->hn_vf_lock); 1819 1820 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1821 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1822 } 1823 1824 static void 1825 hn_xpnt_vf_init(struct hn_softc *sc) 1826 { 1827 int error; 1828 1829 HN_LOCK_ASSERT(sc); 1830 1831 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1832 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1833 1834 if (bootverbose) { 1835 if_printf(sc->hn_ifp, "try bringing up %s\n", 1836 sc->hn_vf_ifp->if_xname); 1837 } 1838 1839 /* 1840 * Bring the VF up. 1841 */ 1842 hn_xpnt_vf_saveifflags(sc); 1843 sc->hn_vf_ifp->if_flags |= IFF_UP; 1844 error = hn_xpnt_vf_iocsetflags(sc); 1845 if (error) { 1846 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1847 sc->hn_vf_ifp->if_xname, error); 1848 return; 1849 } 1850 1851 /* 1852 * NOTE: 1853 * Datapath setting must happen _after_ bringing the VF up. 1854 */ 1855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1856 1857 /* 1858 * NOTE: 1859 * Fixup RSS related bits _after_ the VF is brought up, since 1860 * many VFs generate RSS key during it's initialization. 1861 */ 1862 hn_vf_rss_fixup(sc, true); 1863 1864 /* Mark transparent mode VF as enabled. 
*/ 1865 hn_xpnt_vf_setenable(sc); 1866 } 1867 1868 static void 1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1870 { 1871 struct hn_softc *sc = xsc; 1872 1873 HN_LOCK(sc); 1874 1875 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1876 goto done; 1877 if (sc->hn_vf_ifp == NULL) 1878 goto done; 1879 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1880 goto done; 1881 1882 if (sc->hn_vf_rdytick != 0) { 1883 /* Mark VF as ready. */ 1884 hn_xpnt_vf_setready(sc); 1885 } 1886 1887 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1888 /* 1889 * Delayed VF initialization. 1890 */ 1891 if (bootverbose) { 1892 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1893 sc->hn_vf_ifp->if_xname); 1894 } 1895 hn_xpnt_vf_init(sc); 1896 } 1897 done: 1898 HN_UNLOCK(sc); 1899 } 1900 1901 static void 1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1903 { 1904 struct hn_softc *sc = xsc; 1905 1906 HN_LOCK(sc); 1907 1908 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1909 goto done; 1910 1911 if (!hn_ismyvf(sc, ifp)) 1912 goto done; 1913 1914 if (sc->hn_vf_ifp != NULL) { 1915 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1916 sc->hn_vf_ifp->if_xname); 1917 goto done; 1918 } 1919 1920 if (hn_xpnt_vf && ifp->if_start != NULL) { 1921 /* 1922 * ifnet.if_start is _not_ supported by transparent 1923 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1924 */ 1925 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1926 "in transparent VF mode.\n", ifp->if_xname); 1927 goto done; 1928 } 1929 1930 rm_wlock(&hn_vfmap_lock); 1931 1932 if (ifp->if_index >= hn_vfmap_size) { 1933 struct ifnet **newmap; 1934 int newsize; 1935 1936 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1937 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1938 M_WAITOK | M_ZERO); 1939 1940 memcpy(newmap, hn_vfmap, 1941 sizeof(struct ifnet *) * hn_vfmap_size); 1942 free(hn_vfmap, M_DEVBUF); 1943 hn_vfmap = newmap; 1944 hn_vfmap_size = newsize; 1945 } 1946 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1947 ("%s: ifindex %d was mapped to %s", 1948 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1949 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1950 1951 rm_wunlock(&hn_vfmap_lock); 1952 1953 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1954 rm_wlock(&sc->hn_vf_lock); 1955 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1956 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1957 sc->hn_vf_ifp = ifp; 1958 rm_wunlock(&sc->hn_vf_lock); 1959 1960 if (hn_xpnt_vf) { 1961 int wait_ticks; 1962 1963 /* 1964 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1965 * Save vf_ifp's current if_input for later restoration. 1966 */ 1967 sc->hn_vf_input = ifp->if_input; 1968 ifp->if_input = hn_xpnt_vf_input; 1969 1970 /* 1971 * Stop link status management; use the VF's. 1972 */ 1973 hn_suspend_mgmt(sc); 1974 1975 /* 1976 * Give VF sometime to complete its attach routing. 1977 */ 1978 wait_ticks = hn_xpnt_vf_attwait * hz; 1979 sc->hn_vf_rdytick = ticks + wait_ticks; 1980 1981 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1982 wait_ticks); 1983 } 1984 done: 1985 HN_UNLOCK(sc); 1986 } 1987 1988 static void 1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1990 { 1991 struct hn_softc *sc = xsc; 1992 1993 HN_LOCK(sc); 1994 1995 if (sc->hn_vf_ifp == NULL) 1996 goto done; 1997 1998 if (!hn_ismyvf(sc, ifp)) 1999 goto done; 2000 2001 if (hn_xpnt_vf) { 2002 /* 2003 * Make sure that the delayed initialization is not running. 
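 * (i.e. drain the hn_vf_init timeout task that hn_ifnet_attevent() may have scheduled).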
2004 * 2005 * NOTE: 2006 * - This lock _must_ be released, since the hn_vf_init task 2007 * will try holding this lock. 2008 * - It is safe to release this lock here, since the 2009 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2010 * 2011 * XXX racy, if hn(4) ever detached. 2012 */ 2013 HN_UNLOCK(sc); 2014 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2015 HN_LOCK(sc); 2016 2017 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2018 sc->hn_ifp->if_xname)); 2019 ifp->if_input = sc->hn_vf_input; 2020 sc->hn_vf_input = NULL; 2021 2022 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2023 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2024 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2025 2026 if (sc->hn_vf_rdytick == 0) { 2027 /* 2028 * The VF was ready; restore some settings. 2029 */ 2030 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2031 /* 2032 * NOTE: 2033 * There is _no_ need to fixup if_capenable and 2034 * if_hwassist, since the if_capabilities before 2035 * restoration was an intersection of the VF's 2036 * if_capabilites and the synthetic device's 2037 * if_capabilites. 2038 */ 2039 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2040 sc->hn_ifp->if_hw_tsomaxsegcount = 2041 sc->hn_saved_tsosegcnt; 2042 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2043 } 2044 2045 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2046 /* 2047 * Restore RSS settings. 2048 */ 2049 hn_vf_rss_restore(sc); 2050 2051 /* 2052 * Resume link status management, which was suspended 2053 * by hn_ifnet_attevent(). 2054 */ 2055 hn_resume_mgmt(sc); 2056 } 2057 } 2058 2059 /* Mark transparent mode VF as disabled. */ 2060 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2061 2062 rm_wlock(&hn_vfmap_lock); 2063 2064 KASSERT(ifp->if_index < hn_vfmap_size, 2065 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2066 if (hn_vfmap[ifp->if_index] != NULL) { 2067 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2068 ("%s: ifindex %d was mapped to %s", 2069 ifp->if_xname, ifp->if_index, 2070 hn_vfmap[ifp->if_index]->if_xname)); 2071 hn_vfmap[ifp->if_index] = NULL; 2072 } 2073 2074 rm_wunlock(&hn_vfmap_lock); 2075 done: 2076 HN_UNLOCK(sc); 2077 } 2078 2079 static void 2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2081 { 2082 struct hn_softc *sc = xsc; 2083 2084 if (sc->hn_vf_ifp == ifp) 2085 if_link_state_change(sc->hn_ifp, link_state); 2086 } 2087 2088 static int 2089 hn_probe(device_t dev) 2090 { 2091 2092 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2093 device_set_desc(dev, "Hyper-V Network Interface"); 2094 return BUS_PROBE_DEFAULT; 2095 } 2096 return ENXIO; 2097 } 2098 2099 static int 2100 hn_attach(device_t dev) 2101 { 2102 struct hn_softc *sc = device_get_softc(dev); 2103 struct sysctl_oid_list *child; 2104 struct sysctl_ctx_list *ctx; 2105 uint8_t eaddr[ETHER_ADDR_LEN]; 2106 struct ifnet *ifp = NULL; 2107 int error, ring_cnt, tx_ring_cnt; 2108 uint32_t mtu; 2109 2110 sc->hn_dev = dev; 2111 sc->hn_prichan = vmbus_get_channel(dev); 2112 HN_LOCK_INIT(sc); 2113 rm_init(&sc->hn_vf_lock, "hnvf"); 2114 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2115 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2116 2117 /* 2118 * Initialize these tunables once. 2119 */ 2120 sc->hn_agg_size = hn_tx_agg_size; 2121 sc->hn_agg_pkts = hn_tx_agg_pkts; 2122 2123 /* 2124 * Setup taskqueue for transmission. 
*/ 2126 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2127 int i; 2128 2129 sc->hn_tx_taskqs = 2130 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2131 M_DEVBUF, M_WAITOK); 2132 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2133 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2134 M_WAITOK, taskqueue_thread_enqueue, 2135 &sc->hn_tx_taskqs[i]); 2136 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2137 "%s tx%d", device_get_nameunit(dev), i); 2138 } 2139 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2140 sc->hn_tx_taskqs = hn_tx_taskque; 2141 } 2142 2143 /* 2144 * Setup taskqueue for management tasks, e.g. link status. 2145 */ 2146 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2147 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2148 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2149 device_get_nameunit(dev)); 2150 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2151 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2152 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2153 hn_netchg_status_taskfunc, sc); 2154 2155 if (hn_xpnt_vf) { 2156 /* 2157 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2158 */ 2159 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2160 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2161 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2162 device_get_nameunit(dev)); 2163 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2164 hn_xpnt_vf_init_taskfunc, sc); 2165 } 2166 2167 /* 2168 * Allocate ifnet and set up its name earlier, so that if_printf 2169 * can be used by functions that will be called after 2170 * ether_ifattach(). 2171 */ 2172 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2173 ifp->if_softc = sc; 2174 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2175 2176 /* 2177 * Initialize ifmedia earlier so that it can be unconditionally 2178 * destroyed if an error happens later on. 2179 */ 2180 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2181 2182 /* 2183 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2184 * to use (tx_ring_cnt). 2185 * 2186 * NOTE: 2187 * The # of RX rings to use is the same as the # of channels to use. 2188 */ 2189 ring_cnt = hn_chan_cnt; 2190 if (ring_cnt <= 0) { 2191 /* Default */ 2192 ring_cnt = mp_ncpus; 2193 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2194 ring_cnt = HN_RING_CNT_DEF_MAX; 2195 } else if (ring_cnt > mp_ncpus) { 2196 ring_cnt = mp_ncpus; 2197 } 2198 #ifdef RSS 2199 if (ring_cnt > rss_getnumbuckets()) 2200 ring_cnt = rss_getnumbuckets(); 2201 #endif 2202 2203 tx_ring_cnt = hn_tx_ring_cnt; 2204 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2205 tx_ring_cnt = ring_cnt; 2206 #ifdef HN_IFSTART_SUPPORT 2207 if (hn_use_if_start) { 2208 /* ifnet.if_start only needs one TX ring. */ 2209 tx_ring_cnt = 1; 2210 } 2211 #endif 2212 2213 /* 2214 * Set the leader CPU for channels. 2215 */ 2216 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2217 2218 /* 2219 * Create enough TX/RX rings, even if only a limited number of 2220 * channels can be allocated. 2221 */ 2222 error = hn_create_tx_data(sc, tx_ring_cnt); 2223 if (error) 2224 goto failed; 2225 error = hn_create_rx_data(sc, ring_cnt); 2226 if (error) 2227 goto failed; 2228 2229 /* 2230 * Create transaction context for NVS and RNDIS transactions.
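 * The xact context created below is used for the request/response exchanges with the host.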
2231 */ 2232 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2233 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2234 if (sc->hn_xact == NULL) { 2235 error = ENXIO; 2236 goto failed; 2237 } 2238 2239 /* 2240 * Install orphan handler for the revocation of this device's 2241 * primary channel. 2242 * 2243 * NOTE: 2244 * The processing order is critical here: 2245 * Install the orphan handler, _before_ testing whether this 2246 * device's primary channel has been revoked or not. 2247 */ 2248 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2249 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2250 error = ENXIO; 2251 goto failed; 2252 } 2253 2254 /* 2255 * Attach the synthetic parts, i.e. NVS and RNDIS. 2256 */ 2257 error = hn_synth_attach(sc, ETHERMTU); 2258 if (error) 2259 goto failed; 2260 2261 error = hn_rndis_get_eaddr(sc, eaddr); 2262 if (error) 2263 goto failed; 2264 2265 error = hn_rndis_get_mtu(sc, &mtu); 2266 if (error) 2267 mtu = ETHERMTU; 2268 else if (bootverbose) 2269 device_printf(dev, "RNDIS mtu %u\n", mtu); 2270 2271 #if __FreeBSD_version >= 1100099 2272 if (sc->hn_rx_ring_inuse > 1) { 2273 /* 2274 * Reduce TCP segment aggregation limit for multiple 2275 * RX rings to increase ACK timeliness. 2276 */ 2277 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2278 } 2279 #endif 2280 2281 /* 2282 * Fixup TX/RX stuffs after synthetic parts are attached. 2283 */ 2284 hn_fixup_tx_data(sc); 2285 hn_fixup_rx_data(sc); 2286 2287 ctx = device_get_sysctl_ctx(dev); 2288 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2290 &sc->hn_nvs_ver, 0, "NVS version"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2293 hn_ndis_version_sysctl, "A", "NDIS version"); 2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2296 hn_caps_sysctl, "A", "capabilities"); 2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2299 hn_hwassist_sysctl, "A", "hwassist"); 2300 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2301 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2303 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2304 "max # of TSO segments"); 2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2306 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2307 "max size of TSO segment"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2309 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2310 hn_rxfilter_sysctl, "A", "rxfilter"); 2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2312 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2313 hn_rss_hash_sysctl, "A", "RSS hash"); 2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2316 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2320 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2321 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2322 #ifndef RSS 2323 /* 2324 * Don't allow RSS key/indirect table changes, if RSS is defined. 
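 * i.e. the writable rss_key and rss_ind sysctls below are only created when the kernel is built without the RSS option.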
2325 */ 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2327 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_rss_key_sysctl, "IU", "RSS key"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2330 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2332 #endif 2333 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2334 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2335 "RNDIS offered packet transmission aggregation size limit"); 2336 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2337 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2338 "RNDIS offered packet transmission aggregation count limit"); 2339 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2340 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2341 "RNDIS packet transmission aggregation alignment"); 2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2343 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2344 hn_txagg_size_sysctl, "I", 2345 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2348 hn_txagg_pkts_sysctl, "I", 2349 "Packet transmission aggregation packets, " 2350 "0 -- disable, -1 -- auto"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2352 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_polling_sysctl, "I", 2354 "Polling frequency: [100,1000000], 0 disable polling"); 2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2356 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2357 hn_vf_sysctl, "A", "Virtual Function's name"); 2358 if (!hn_xpnt_vf) { 2359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2360 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2361 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2362 } else { 2363 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2364 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2365 hn_xpnt_vf_enabled_sysctl, "I", 2366 "Transparent VF enabled"); 2367 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2368 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2369 hn_xpnt_vf_accbpf_sysctl, "I", 2370 "Accurate BPF for transparent VF"); 2371 } 2372 2373 /* 2374 * Setup the ifmedia, which has been initialized earlier. 2375 */ 2376 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2377 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2378 /* XXX ifmedia_set really should do this for us */ 2379 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2380 2381 /* 2382 * Setup the ifnet for this interface. 2383 */ 2384 2385 ifp->if_baudrate = IF_Gbps(10); 2386 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2387 ifp->if_ioctl = hn_ioctl; 2388 ifp->if_init = hn_init; 2389 #ifdef HN_IFSTART_SUPPORT 2390 if (hn_use_if_start) { 2391 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2392 2393 ifp->if_start = hn_start; 2394 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2395 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2396 IFQ_SET_READY(&ifp->if_snd); 2397 } else 2398 #endif 2399 { 2400 ifp->if_transmit = hn_transmit; 2401 ifp->if_qflush = hn_xmit_qflush; 2402 } 2403 2404 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2405 #ifdef foo 2406 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2407 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2408 #endif 2409 if (sc->hn_caps & HN_CAP_VLAN) { 2410 /* XXX not sure about VLAN_MTU. 
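 * (long frame support is advertised through if_hdrlen later in hn_attach(), though).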
*/ 2411 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2412 } 2413 2414 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2415 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2416 ifp->if_capabilities |= IFCAP_TXCSUM; 2417 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2418 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2419 if (sc->hn_caps & HN_CAP_TSO4) { 2420 ifp->if_capabilities |= IFCAP_TSO4; 2421 ifp->if_hwassist |= CSUM_IP_TSO; 2422 } 2423 if (sc->hn_caps & HN_CAP_TSO6) { 2424 ifp->if_capabilities |= IFCAP_TSO6; 2425 ifp->if_hwassist |= CSUM_IP6_TSO; 2426 } 2427 2428 /* Enable all available capabilities by default. */ 2429 ifp->if_capenable = ifp->if_capabilities; 2430 2431 /* 2432 * Disable IPv6 TSO and TXCSUM by default, they still can 2433 * be enabled through SIOCSIFCAP. 2434 */ 2435 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2436 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2437 2438 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2439 /* 2440 * Lock hn_set_tso_maxsize() to simplify its 2441 * internal logic. 2442 */ 2443 HN_LOCK(sc); 2444 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2445 HN_UNLOCK(sc); 2446 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2447 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2448 } 2449 2450 ether_ifattach(ifp, eaddr); 2451 2452 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2453 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2454 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2455 } 2456 if (mtu < ETHERMTU) { 2457 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2458 ifp->if_mtu = mtu; 2459 } 2460 2461 /* Inform the upper layer about the long frame support. */ 2462 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2463 2464 /* 2465 * Kick off link status check. 2466 */ 2467 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2468 hn_update_link_status(sc); 2469 2470 if (!hn_xpnt_vf) { 2471 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2472 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2473 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2474 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2475 } else { 2476 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2477 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2478 } 2479 2480 /* 2481 * NOTE: 2482 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2483 * since interface's LLADDR is needed; interface LLADDR is not 2484 * available when ifnet_arrival event is triggered. 2485 */ 2486 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2487 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2488 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2489 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2490 2491 return (0); 2492 failed: 2493 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2494 hn_synth_detach(sc); 2495 hn_detach(dev); 2496 return (error); 2497 } 2498 2499 static int 2500 hn_detach(device_t dev) 2501 { 2502 struct hn_softc *sc = device_get_softc(dev); 2503 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2504 2505 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2506 /* 2507 * In case that the vmbus missed the orphan handler 2508 * installation. 
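 * manually orphan the xact context, since a revoked channel will not deliver the pending responses.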
2509 */ 2510 vmbus_xact_ctx_orphan(sc->hn_xact); 2511 } 2512 2513 if (sc->hn_ifaddr_evthand != NULL) 2514 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2515 if (sc->hn_ifnet_evthand != NULL) 2516 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2517 if (sc->hn_ifnet_atthand != NULL) { 2518 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2519 sc->hn_ifnet_atthand); 2520 } 2521 if (sc->hn_ifnet_dethand != NULL) { 2522 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2523 sc->hn_ifnet_dethand); 2524 } 2525 if (sc->hn_ifnet_lnkhand != NULL) 2526 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2527 2528 vf_ifp = sc->hn_vf_ifp; 2529 __compiler_membar(); 2530 if (vf_ifp != NULL) 2531 hn_ifnet_detevent(sc, vf_ifp); 2532 2533 if (device_is_attached(dev)) { 2534 HN_LOCK(sc); 2535 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2536 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2537 hn_stop(sc, true); 2538 /* 2539 * NOTE: 2540 * hn_stop() only suspends data, so managment 2541 * stuffs have to be suspended manually here. 2542 */ 2543 hn_suspend_mgmt(sc); 2544 hn_synth_detach(sc); 2545 } 2546 HN_UNLOCK(sc); 2547 ether_ifdetach(ifp); 2548 } 2549 2550 ifmedia_removeall(&sc->hn_media); 2551 hn_destroy_rx_data(sc); 2552 hn_destroy_tx_data(sc); 2553 2554 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2555 int i; 2556 2557 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2558 taskqueue_free(sc->hn_tx_taskqs[i]); 2559 free(sc->hn_tx_taskqs, M_DEVBUF); 2560 } 2561 taskqueue_free(sc->hn_mgmt_taskq0); 2562 if (sc->hn_vf_taskq != NULL) 2563 taskqueue_free(sc->hn_vf_taskq); 2564 2565 if (sc->hn_xact != NULL) { 2566 /* 2567 * Uninstall the orphan handler _before_ the xact is 2568 * destructed. 2569 */ 2570 vmbus_chan_unset_orphan(sc->hn_prichan); 2571 vmbus_xact_ctx_destroy(sc->hn_xact); 2572 } 2573 2574 if_free(ifp); 2575 2576 HN_LOCK_DESTROY(sc); 2577 rm_destroy(&sc->hn_vf_lock); 2578 return (0); 2579 } 2580 2581 static int 2582 hn_shutdown(device_t dev) 2583 { 2584 2585 return (0); 2586 } 2587 2588 static void 2589 hn_link_status(struct hn_softc *sc) 2590 { 2591 uint32_t link_status; 2592 int error; 2593 2594 error = hn_rndis_get_linkstatus(sc, &link_status); 2595 if (error) { 2596 /* XXX what to do? */ 2597 return; 2598 } 2599 2600 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2601 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2602 else 2603 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2604 if_link_state_change(sc->hn_ifp, 2605 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2606 LINK_STATE_UP : LINK_STATE_DOWN); 2607 } 2608 2609 static void 2610 hn_link_taskfunc(void *xsc, int pending __unused) 2611 { 2612 struct hn_softc *sc = xsc; 2613 2614 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2615 return; 2616 hn_link_status(sc); 2617 } 2618 2619 static void 2620 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2621 { 2622 struct hn_softc *sc = xsc; 2623 2624 /* Prevent any link status checks from running. */ 2625 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2626 2627 /* 2628 * Fake up a [link down --> link up] state change; 5 seconds 2629 * delay is used, which closely simulates miibus reaction 2630 * upon link down event. 
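 * hn_netchg_status_taskfunc() re-runs the link status check once this delay expires.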
2631 */ 2632 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2633 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2634 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2635 &sc->hn_netchg_status, 5 * hz); 2636 } 2637 2638 static void 2639 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2640 { 2641 struct hn_softc *sc = xsc; 2642 2643 /* Re-allow link status checks. */ 2644 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2645 hn_link_status(sc); 2646 } 2647 2648 static void 2649 hn_update_link_status(struct hn_softc *sc) 2650 { 2651 2652 if (sc->hn_mgmt_taskq != NULL) 2653 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2654 } 2655 2656 static void 2657 hn_change_network(struct hn_softc *sc) 2658 { 2659 2660 if (sc->hn_mgmt_taskq != NULL) 2661 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2662 } 2663 2664 static __inline int 2665 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2666 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2667 { 2668 struct mbuf *m = *m_head; 2669 int error; 2670 2671 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2672 2673 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2674 m, segs, nsegs, BUS_DMA_NOWAIT); 2675 if (error == EFBIG) { 2676 struct mbuf *m_new; 2677 2678 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2679 if (m_new == NULL) 2680 return ENOBUFS; 2681 else 2682 *m_head = m = m_new; 2683 txr->hn_tx_collapsed++; 2684 2685 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2686 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2687 } 2688 if (!error) { 2689 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2690 BUS_DMASYNC_PREWRITE); 2691 txd->flags |= HN_TXD_FLAG_DMAMAP; 2692 } 2693 return error; 2694 } 2695 2696 static __inline int 2697 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2698 { 2699 2700 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2701 ("put an onlist txd %#x", txd->flags)); 2702 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2703 ("put an onagg txd %#x", txd->flags)); 2704 2705 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2706 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2707 return 0; 2708 2709 if (!STAILQ_EMPTY(&txd->agg_list)) { 2710 struct hn_txdesc *tmp_txd; 2711 2712 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2713 int freed __diagused; 2714 2715 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2716 ("resursive aggregation on aggregated txdesc")); 2717 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2718 ("not aggregated txdesc")); 2719 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2720 ("aggregated txdesc uses dmamap")); 2721 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2722 ("aggregated txdesc consumes " 2723 "chimney sending buffer")); 2724 KASSERT(tmp_txd->chim_size == 0, 2725 ("aggregated txdesc has non-zero " 2726 "chimney sending size")); 2727 2728 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2729 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2730 freed = hn_txdesc_put(txr, tmp_txd); 2731 KASSERT(freed, ("failed to free aggregated txdesc")); 2732 } 2733 } 2734 2735 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2736 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2737 ("chim txd uses dmamap")); 2738 hn_chim_free(txr->hn_sc, txd->chim_index); 2739 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2740 txd->chim_size = 0; 2741 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2742 bus_dmamap_sync(txr->hn_tx_data_dtag, 2743 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2744 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2745 txd->data_dmap); 2746 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2747 } 2748 2749 if (txd->m != NULL) { 2750 m_freem(txd->m); 2751 txd->m = NULL; 2752 } 2753 2754 txd->flags |= HN_TXD_FLAG_ONLIST; 2755 #ifndef HN_USE_TXDESC_BUFRING 2756 mtx_lock_spin(&txr->hn_txlist_spin); 2757 KASSERT(txr->hn_txdesc_avail >= 0 && 2758 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2759 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2760 txr->hn_txdesc_avail++; 2761 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2762 mtx_unlock_spin(&txr->hn_txlist_spin); 2763 #else /* HN_USE_TXDESC_BUFRING */ 2764 #ifdef HN_DEBUG 2765 atomic_add_int(&txr->hn_txdesc_avail, 1); 2766 #endif 2767 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2768 #endif /* !HN_USE_TXDESC_BUFRING */ 2769 2770 return 1; 2771 } 2772 2773 static __inline struct hn_txdesc * 2774 hn_txdesc_get(struct hn_tx_ring *txr) 2775 { 2776 struct hn_txdesc *txd; 2777 2778 #ifndef HN_USE_TXDESC_BUFRING 2779 mtx_lock_spin(&txr->hn_txlist_spin); 2780 txd = SLIST_FIRST(&txr->hn_txlist); 2781 if (txd != NULL) { 2782 KASSERT(txr->hn_txdesc_avail > 0, 2783 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2784 txr->hn_txdesc_avail--; 2785 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2786 } 2787 mtx_unlock_spin(&txr->hn_txlist_spin); 2788 #else 2789 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2790 #endif 2791 2792 if (txd != NULL) { 2793 #ifdef HN_USE_TXDESC_BUFRING 2794 #ifdef HN_DEBUG 2795 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2796 #endif 2797 #endif /* HN_USE_TXDESC_BUFRING */ 2798 KASSERT(txd->m == NULL && txd->refs == 0 && 2799 STAILQ_EMPTY(&txd->agg_list) && 2800 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2801 txd->chim_size == 0 && 2802 (txd->flags & HN_TXD_FLAG_ONLIST) && 2803 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2804 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2805 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2806 txd->refs = 1; 2807 } 2808 return txd; 2809 } 2810 2811 static __inline void 2812 hn_txdesc_hold(struct hn_txdesc *txd) 2813 { 2814 2815 /* 0->1 transition will never work */ 2816 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2817 atomic_add_int(&txd->refs, 1); 2818 } 2819 2820 static __inline void 2821 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2822 { 2823 2824 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2825 ("recursive aggregation on aggregating txdesc")); 2826 2827 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2828 ("already aggregated")); 2829 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2830 ("recursive aggregation on to-be-aggregated txdesc")); 2831 2832 txd->flags |= HN_TXD_FLAG_ONAGG; 2833 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2834 } 2835 2836 static bool 2837 hn_tx_ring_pending(struct hn_tx_ring *txr) 2838 { 2839 bool pending = false; 2840 2841 #ifndef HN_USE_TXDESC_BUFRING 2842 mtx_lock_spin(&txr->hn_txlist_spin); 2843 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2844 pending = true; 2845 mtx_unlock_spin(&txr->hn_txlist_spin); 2846 #else 2847 if (!buf_ring_full(txr->hn_txdesc_br)) 2848 pending = true; 2849 #endif 2850 return (pending); 2851 } 2852 2853 static __inline void 2854 hn_txeof(struct hn_tx_ring *txr) 2855 { 2856 txr->hn_has_txeof = 0; 2857 txr->hn_txeof(txr); 2858 } 2859 2860 static void 2861 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2862 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2863 { 2864 struct hn_txdesc *txd = sndc->hn_cbarg; 2865 struct 
hn_tx_ring *txr; 2866 2867 txr = txd->txr; 2868 KASSERT(txr->hn_chan == chan, 2869 ("channel mismatch, on chan%u, should be chan%u", 2870 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2871 2872 txr->hn_has_txeof = 1; 2873 hn_txdesc_put(txr, txd); 2874 2875 ++txr->hn_txdone_cnt; 2876 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2877 txr->hn_txdone_cnt = 0; 2878 if (txr->hn_oactive) 2879 hn_txeof(txr); 2880 } 2881 } 2882 2883 static void 2884 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2885 { 2886 #if defined(INET) || defined(INET6) 2887 struct epoch_tracker et; 2888 2889 NET_EPOCH_ENTER(et); 2890 tcp_lro_flush_all(&rxr->hn_lro); 2891 NET_EPOCH_EXIT(et); 2892 #endif 2893 2894 /* 2895 * NOTE: 2896 * 'txr' could be NULL, if multiple channels and 2897 * ifnet.if_start method are enabled. 2898 */ 2899 if (txr == NULL || !txr->hn_has_txeof) 2900 return; 2901 2902 txr->hn_txdone_cnt = 0; 2903 hn_txeof(txr); 2904 } 2905 2906 static __inline uint32_t 2907 hn_rndis_pktmsg_offset(uint32_t ofs) 2908 { 2909 2910 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2911 ("invalid RNDIS packet msg offset %u", ofs)); 2912 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2913 } 2914 2915 static __inline void * 2916 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2917 size_t pi_dlen, uint32_t pi_type) 2918 { 2919 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2920 struct rndis_pktinfo *pi; 2921 2922 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2923 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2924 2925 /* 2926 * Per-packet-info does not move; it only grows. 2927 * 2928 * NOTE: 2929 * rm_pktinfooffset in this phase counts from the beginning 2930 * of rndis_packet_msg. 2931 */ 2932 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2933 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2934 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2935 pkt->rm_pktinfolen); 2936 pkt->rm_pktinfolen += pi_size; 2937 2938 pi->rm_size = pi_size; 2939 pi->rm_type = pi_type; 2940 pi->rm_internal = 0; 2941 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2942 2943 return (pi->rm_data); 2944 } 2945 2946 static __inline int 2947 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2948 { 2949 struct hn_txdesc *txd; 2950 struct mbuf *m; 2951 int error, pkts; 2952 2953 txd = txr->hn_agg_txd; 2954 KASSERT(txd != NULL, ("no aggregate txdesc")); 2955 2956 /* 2957 * Since hn_txpkt() will reset this temporary stat, save 2958 * it now, so that oerrors can be updated properly, if 2959 * hn_txpkt() ever fails. 2960 */ 2961 pkts = txr->hn_stat_pkts; 2962 2963 /* 2964 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2965 * failure, save it for later freeing, if hn_txpkt() ever 2966 * fails. 2967 */ 2968 m = txd->m; 2969 error = hn_txpkt(ifp, txr, txd); 2970 if (__predict_false(error)) { 2971 /* txd is freed, but m is not. */ 2972 m_freem(m); 2973 2974 txr->hn_flush_failed++; 2975 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2976 } 2977 2978 /* Reset all aggregation states. 
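 * so that the next call to hn_try_txagg() starts a fresh aggregation.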
*/ 2979 txr->hn_agg_txd = NULL; 2980 txr->hn_agg_szleft = 0; 2981 txr->hn_agg_pktleft = 0; 2982 txr->hn_agg_prevpkt = NULL; 2983 2984 return (error); 2985 } 2986 2987 static void * 2988 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2989 int pktsize) 2990 { 2991 void *chim; 2992 2993 if (txr->hn_agg_txd != NULL) { 2994 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2995 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2996 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2997 int olen; 2998 2999 /* 3000 * Update the previous RNDIS packet's total length, 3001 * it can be increased due to the mandatory alignment 3002 * padding for this RNDIS packet. And update the 3003 * aggregating txdesc's chimney sending buffer size 3004 * accordingly. 3005 * 3006 * XXX 3007 * Zero-out the padding, as required by the RNDIS spec. 3008 */ 3009 olen = pkt->rm_len; 3010 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3011 agg_txd->chim_size += pkt->rm_len - olen; 3012 3013 /* Link this txdesc to the parent. */ 3014 hn_txdesc_agg(agg_txd, txd); 3015 3016 chim = (uint8_t *)pkt + pkt->rm_len; 3017 /* Save the current packet for later fixup. */ 3018 txr->hn_agg_prevpkt = chim; 3019 3020 txr->hn_agg_pktleft--; 3021 txr->hn_agg_szleft -= pktsize; 3022 if (txr->hn_agg_szleft <= 3023 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3024 /* 3025 * Probably can't aggregate more packets, 3026 * flush this aggregating txdesc proactively. 3027 */ 3028 txr->hn_agg_pktleft = 0; 3029 } 3030 /* Done! */ 3031 return (chim); 3032 } 3033 hn_flush_txagg(ifp, txr); 3034 } 3035 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3036 3037 txr->hn_tx_chimney_tried++; 3038 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3039 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3040 return (NULL); 3041 txr->hn_tx_chimney++; 3042 3043 chim = txr->hn_sc->hn_chim + 3044 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3045 3046 if (txr->hn_agg_pktmax > 1 && 3047 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3048 txr->hn_agg_txd = txd; 3049 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3050 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3051 txr->hn_agg_prevpkt = chim; 3052 } 3053 return (chim); 3054 } 3055 3056 /* 3057 * NOTE: 3058 * If this function fails, then both txd and m_head0 will be freed. 3059 */ 3060 static int 3061 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3062 struct mbuf **m_head0) 3063 { 3064 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3065 int error, nsegs, i; 3066 struct mbuf *m_head = *m_head0; 3067 struct rndis_packet_msg *pkt; 3068 uint32_t *pi_data; 3069 void *chim = NULL; 3070 int pkt_hlen, pkt_size; 3071 3072 pkt = txd->rndis_pkt; 3073 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3074 if (pkt_size < txr->hn_chim_size) { 3075 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3076 if (chim != NULL) 3077 pkt = chim; 3078 } else { 3079 if (txr->hn_agg_txd != NULL) 3080 hn_flush_txagg(ifp, txr); 3081 } 3082 3083 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3084 pkt->rm_len = m_head->m_pkthdr.len; 3085 pkt->rm_dataoffset = 0; 3086 pkt->rm_datalen = m_head->m_pkthdr.len; 3087 pkt->rm_oobdataoffset = 0; 3088 pkt->rm_oobdatalen = 0; 3089 pkt->rm_oobdataelements = 0; 3090 pkt->rm_pktinfooffset = sizeof(*pkt); 3091 pkt->rm_pktinfolen = 0; 3092 pkt->rm_vchandle = 0; 3093 pkt->rm_reserved = 0; 3094 3095 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3096 /* 3097 * Set the hash value for this packet. 
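 * The value is carried in an HN_NDIS_PKTINFO_TYPE_HASHVAL per-packet-info appended to the RNDIS packet message.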
*/ 3099 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3100 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3101 3102 if (M_HASHTYPE_ISHASH(m_head)) 3103 /* 3104 * The flowid field contains the hash value that the host 3105 * set on the RX queue if this is an IP forwarding packet. 3106 * Set the same hash value so the host can send on the 3107 * CPU where the packet was received. 3108 */ 3109 *pi_data = m_head->m_pkthdr.flowid; 3110 else 3111 /* 3112 * Otherwise just put the tx queue index. 3113 */ 3114 *pi_data = txr->hn_tx_idx; 3115 } 3116 3117 if (m_head->m_flags & M_VLANTAG) { 3118 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3119 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3120 *pi_data = NDIS_VLAN_INFO_MAKE( 3121 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3122 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3123 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3124 } 3125 3126 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3127 #if defined(INET6) || defined(INET) 3128 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3129 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3130 #ifdef INET 3131 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3132 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3133 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3134 m_head->m_pkthdr.tso_segsz); 3135 } 3136 #endif 3137 #if defined(INET6) && defined(INET) 3138 else 3139 #endif 3140 #ifdef INET6 3141 { 3142 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3143 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3144 m_head->m_pkthdr.tso_segsz); 3145 } 3146 #endif 3147 #endif /* INET6 || INET */ 3148 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3149 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3150 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3151 if (m_head->m_pkthdr.csum_flags & 3152 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3153 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3154 } else { 3155 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3156 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3157 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3158 } 3159 3160 if (m_head->m_pkthdr.csum_flags & 3161 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3162 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3163 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3164 } else if (m_head->m_pkthdr.csum_flags & 3165 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3166 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3167 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3168 } 3169 } 3170 3171 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3172 /* Fixup RNDIS packet message total length */ 3173 pkt->rm_len += pkt_hlen; 3174 /* Convert RNDIS packet message offsets */ 3175 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3176 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3177 3178 /* 3179 * Fast path: Chimney sending.
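 * Both the RNDIS packet message and the payload are copied into the chimney sending buffer, so no gather list (GPA array) is needed.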
3180 */ 3181 if (chim != NULL) { 3182 struct hn_txdesc *tgt_txd = txd; 3183 3184 if (txr->hn_agg_txd != NULL) { 3185 tgt_txd = txr->hn_agg_txd; 3186 #ifdef INVARIANTS 3187 *m_head0 = NULL; 3188 #endif 3189 } 3190 3191 KASSERT(pkt == chim, 3192 ("RNDIS pkt not in chimney sending buffer")); 3193 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3194 ("chimney sending buffer is not used")); 3195 tgt_txd->chim_size += pkt->rm_len; 3196 3197 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3198 ((uint8_t *)chim) + pkt_hlen); 3199 3200 txr->hn_gpa_cnt = 0; 3201 txr->hn_sendpkt = hn_txpkt_chim; 3202 goto done; 3203 } 3204 3205 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3206 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3207 ("chimney buffer is used")); 3208 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3209 3210 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3211 if (__predict_false(error)) { 3212 int freed __diagused; 3213 3214 /* 3215 * This mbuf is not linked w/ the txd yet, so free it now. 3216 */ 3217 m_freem(m_head); 3218 *m_head0 = NULL; 3219 3220 freed = hn_txdesc_put(txr, txd); 3221 KASSERT(freed != 0, 3222 ("fail to free txd upon txdma error")); 3223 3224 txr->hn_txdma_failed++; 3225 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3226 return error; 3227 } 3228 *m_head0 = m_head; 3229 3230 /* +1 RNDIS packet message */ 3231 txr->hn_gpa_cnt = nsegs + 1; 3232 3233 /* send packet with page buffer */ 3234 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3235 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3236 txr->hn_gpa[0].gpa_len = pkt_hlen; 3237 3238 /* 3239 * Fill the page buffers with mbuf info after the page 3240 * buffer for RNDIS packet message. 3241 */ 3242 for (i = 0; i < nsegs; ++i) { 3243 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3244 3245 gpa->gpa_page = atop(segs[i].ds_addr); 3246 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3247 gpa->gpa_len = segs[i].ds_len; 3248 } 3249 3250 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3251 txd->chim_size = 0; 3252 txr->hn_sendpkt = hn_txpkt_sglist; 3253 done: 3254 txd->m = m_head; 3255 3256 /* Set the completion routine */ 3257 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3258 3259 /* Update temporary stats for later use. */ 3260 txr->hn_stat_pkts++; 3261 txr->hn_stat_size += m_head->m_pkthdr.len; 3262 if (m_head->m_flags & M_MCAST) 3263 txr->hn_stat_mcasts++; 3264 3265 return 0; 3266 } 3267 3268 /* 3269 * NOTE: 3270 * If this function fails, then txd will be freed, but the mbuf 3271 * associated w/ the txd will _not_ be freed. 3272 */ 3273 static int 3274 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3275 { 3276 int error, send_failed = 0, has_bpf; 3277 3278 again: 3279 has_bpf = bpf_peers_present(ifp->if_bpf); 3280 if (has_bpf) { 3281 /* 3282 * Make sure that this txd and any aggregated txds are not 3283 * freed before ETHER_BPF_MTAP. 
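 * An extra reference is taken here and dropped again after the taps.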
3284 */ 3285 hn_txdesc_hold(txd); 3286 } 3287 error = txr->hn_sendpkt(txr, txd); 3288 if (!error) { 3289 if (has_bpf) { 3290 const struct hn_txdesc *tmp_txd; 3291 3292 ETHER_BPF_MTAP(ifp, txd->m); 3293 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3294 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3295 } 3296 3297 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3298 #ifdef HN_IFSTART_SUPPORT 3299 if (!hn_use_if_start) 3300 #endif 3301 { 3302 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3303 txr->hn_stat_size); 3304 if (txr->hn_stat_mcasts != 0) { 3305 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3306 txr->hn_stat_mcasts); 3307 } 3308 } 3309 txr->hn_pkts += txr->hn_stat_pkts; 3310 txr->hn_sends++; 3311 } 3312 if (has_bpf) 3313 hn_txdesc_put(txr, txd); 3314 3315 if (__predict_false(error)) { 3316 int freed __diagused; 3317 3318 /* 3319 * This should "really rarely" happen. 3320 * 3321 * XXX Too many RX to be acked or too many sideband 3322 * commands to run? Ask netvsc_channel_rollup() 3323 * to kick start later. 3324 */ 3325 txr->hn_has_txeof = 1; 3326 if (!send_failed) { 3327 txr->hn_send_failed++; 3328 send_failed = 1; 3329 /* 3330 * Try sending again after set hn_has_txeof; 3331 * in case that we missed the last 3332 * netvsc_channel_rollup(). 3333 */ 3334 goto again; 3335 } 3336 if_printf(ifp, "send failed\n"); 3337 3338 /* 3339 * Caller will perform further processing on the 3340 * associated mbuf, so don't free it in hn_txdesc_put(); 3341 * only unload it from the DMA map in hn_txdesc_put(), 3342 * if it was loaded. 3343 */ 3344 txd->m = NULL; 3345 freed = hn_txdesc_put(txr, txd); 3346 KASSERT(freed != 0, 3347 ("fail to free txd upon send error")); 3348 3349 txr->hn_send_failed++; 3350 } 3351 3352 /* Reset temporary stats, after this sending is done. */ 3353 txr->hn_stat_size = 0; 3354 txr->hn_stat_pkts = 0; 3355 txr->hn_stat_mcasts = 0; 3356 3357 return (error); 3358 } 3359 3360 /* 3361 * Append the specified data to the indicated mbuf chain, 3362 * Extend the mbuf chain if the new data does not fit in 3363 * existing space. 3364 * 3365 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3366 * There should be an equivalent in the kernel mbuf code, 3367 * but there does not appear to be one yet. 3368 * 3369 * Differs from m_append() in that additional mbufs are 3370 * allocated with cluster size MJUMPAGESIZE, and filled 3371 * accordingly. 3372 * 3373 * Return the last mbuf in the chain or NULL if failed to 3374 * allocate new mbuf. 3375 */ 3376 static struct mbuf * 3377 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3378 { 3379 struct mbuf *m, *n; 3380 int remainder, space; 3381 3382 for (m = m0; m->m_next != NULL; m = m->m_next) 3383 ; 3384 remainder = len; 3385 space = M_TRAILINGSPACE(m); 3386 if (space > 0) { 3387 /* 3388 * Copy into available space. 3389 */ 3390 if (space > remainder) 3391 space = remainder; 3392 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3393 m->m_len += space; 3394 cp += space; 3395 remainder -= space; 3396 } 3397 while (remainder > 0) { 3398 /* 3399 * Allocate a new mbuf; could check space 3400 * and allocate a cluster instead. 
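 * MJUMPAGESIZE clusters are used so that large remainders need fewer mbufs.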
3401 */ 3402 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3403 if (n == NULL) 3404 return NULL; 3405 n->m_len = min(MJUMPAGESIZE, remainder); 3406 bcopy(cp, mtod(n, caddr_t), n->m_len); 3407 cp += n->m_len; 3408 remainder -= n->m_len; 3409 m->m_next = n; 3410 m = n; 3411 } 3412 3413 return m; 3414 } 3415 3416 #if defined(INET) || defined(INET6) 3417 static __inline int 3418 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3419 { 3420 #if __FreeBSD_version >= 1100095 3421 if (hn_lro_mbufq_depth) { 3422 tcp_lro_queue_mbuf(lc, m); 3423 return 0; 3424 } 3425 #endif 3426 return tcp_lro_rx(lc, m, 0); 3427 } 3428 #endif 3429 3430 static int 3431 hn_rxpkt(struct hn_rx_ring *rxr) 3432 { 3433 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3434 struct mbuf *m_new, *n; 3435 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3436 int hash_type = M_HASHTYPE_NONE; 3437 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3438 int i; 3439 3440 ifp = hn_ifp; 3441 if (rxr->hn_rxvf_ifp != NULL) { 3442 /* 3443 * Non-transparent mode VF; pretend this packet is from 3444 * the VF. 3445 */ 3446 ifp = rxr->hn_rxvf_ifp; 3447 is_vf = 1; 3448 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3449 /* Transparent mode VF. */ 3450 is_vf = 1; 3451 } 3452 3453 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3454 /* 3455 * NOTE: 3456 * See the NOTE of hn_rndis_init_fixat(). This 3457 * function can be reached, immediately after the 3458 * RNDIS is initialized but before the ifnet is 3459 * setup on the hn_attach() path; drop the unexpected 3460 * packets. 3461 */ 3462 return (0); 3463 } 3464 3465 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3466 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3467 return (0); 3468 } 3469 3470 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3471 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3472 if (m_new == NULL) { 3473 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3474 return (0); 3475 } 3476 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3477 rxr->rsc.frag_len[0]); 3478 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3479 } else { 3480 /* 3481 * Get an mbuf with a cluster. For packets 2K or less, 3482 * get a standard 2K cluster. For anything larger, get a 3483 * 4K cluster. Any buffers larger than 4K can cause problems 3484 * if looped around to the Hyper-V TX channel, so avoid them. 
*/ 3486 size = MCLBYTES; 3487 if (rxr->rsc.pktlen > MCLBYTES) { 3488 /* 4096 */ 3489 size = MJUMPAGESIZE; 3490 } 3491 3492 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3493 if (m_new == NULL) { 3494 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3495 return (0); 3496 } 3497 3498 n = m_new; 3499 for (i = 0; i < rxr->rsc.cnt; i++) { 3500 n = hv_m_append(n, rxr->rsc.frag_len[i], 3501 rxr->rsc.frag_data[i]); 3502 if (n == NULL) { 3503 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3504 return (0); 3505 } else { 3506 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3507 } 3508 } 3509 } 3510 if (rxr->rsc.pktlen <= MHLEN) 3511 rxr->hn_small_pkts++; 3512 3513 m_new->m_pkthdr.rcvif = ifp; 3514 3515 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3516 do_csum = 0; 3517 3518 /* receive side checksum offload */ 3519 if (rxr->rsc.csum_info != NULL) { 3520 /* IP csum offload */ 3521 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3522 m_new->m_pkthdr.csum_flags |= 3523 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3524 rxr->hn_csum_ip++; 3525 } 3526 3527 /* TCP/UDP csum offload */ 3528 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3529 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3530 m_new->m_pkthdr.csum_flags |= 3531 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3532 m_new->m_pkthdr.csum_data = 0xffff; 3533 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3534 rxr->hn_csum_tcp++; 3535 else 3536 rxr->hn_csum_udp++; 3537 } 3538 3539 /* 3540 * XXX 3541 * As of this writing (Oct 28th, 2016), the host side will turn 3542 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3543 * the do_lro setting here is actually _not_ accurate. We 3544 * depend on the RSS hash type check to reset do_lro. 3545 */ 3546 if ((*(rxr->rsc.csum_info) & 3547 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3548 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3549 do_lro = 1; 3550 } else { 3551 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3552 if (l3proto == ETHERTYPE_IP) { 3553 if (l4proto == IPPROTO_TCP) { 3554 if (do_csum && 3555 (rxr->hn_trust_hcsum & 3556 HN_TRUST_HCSUM_TCP)) { 3557 rxr->hn_csum_trusted++; 3558 m_new->m_pkthdr.csum_flags |= 3559 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3560 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3561 m_new->m_pkthdr.csum_data = 0xffff; 3562 } 3563 do_lro = 1; 3564 } else if (l4proto == IPPROTO_UDP) { 3565 if (do_csum && 3566 (rxr->hn_trust_hcsum & 3567 HN_TRUST_HCSUM_UDP)) { 3568 rxr->hn_csum_trusted++; 3569 m_new->m_pkthdr.csum_flags |= 3570 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3571 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3572 m_new->m_pkthdr.csum_data = 0xffff; 3573 } 3574 } else if (l4proto != IPPROTO_DONE && do_csum && 3575 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3576 rxr->hn_csum_trusted++; 3577 m_new->m_pkthdr.csum_flags |= 3578 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3579 } 3580 } 3581 } 3582 3583 if (rxr->rsc.vlan_info != NULL) { 3584 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3585 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3586 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3587 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3588 m_new->m_flags |= M_VLANTAG; 3589 } 3590 3591 /* 3592 * If VF is activated (transparent/non-transparent mode does not 3593 * matter here). 3594 * 3595 * - Disable LRO 3596 * 3597 * hn(4) will only receive broadcast packets, multicast packets, 3598 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these 3599 * packet types. 3600 * 3601 * For non-transparent mode, we definitely _cannot_ enable LRO at 3602 * all, since the LRO flush will use hn(4) as the receiving 3603 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3604 */ 3605 if (is_vf) 3606 do_lro = 0; 3607 3608 /* 3609 * If VF is activated (transparent/non-transparent mode does not 3610 * matter here), do _not_ mess with unsupported hash types or 3611 * functions. 3612 */ 3613 if (rxr->rsc.hash_info != NULL) { 3614 rxr->hn_rss_pkts++; 3615 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3616 if (!is_vf) 3617 hash_type = M_HASHTYPE_OPAQUE_HASH; 3618 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3619 NDIS_HASH_FUNCTION_TOEPLITZ) { 3620 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3621 rxr->hn_mbuf_hash); 3622 3623 /* 3624 * NOTE: 3625 * do_lro is reset if the hash types are not TCP 3626 * related. See the comment in the above csum_flags 3627 * setup section. 3628 */ 3629 switch (type) { 3630 case NDIS_HASH_IPV4: 3631 hash_type = M_HASHTYPE_RSS_IPV4; 3632 do_lro = 0; 3633 break; 3634 3635 case NDIS_HASH_TCP_IPV4: 3636 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3637 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3638 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3639 3640 if (is_vf) 3641 def_htype = M_HASHTYPE_NONE; 3642 3643 /* 3644 * UDP 4-tuple hash is delivered as 3645 * TCP 4-tuple hash. 3646 */ 3647 if (l3proto == ETHERTYPE_MAX) { 3648 hn_rxpkt_proto(m_new, 3649 &l3proto, &l4proto); 3650 } 3651 if (l3proto == ETHERTYPE_IP) { 3652 if (l4proto == IPPROTO_UDP && 3653 (rxr->hn_mbuf_hash & 3654 NDIS_HASH_UDP_IPV4_X)) { 3655 hash_type = 3656 M_HASHTYPE_RSS_UDP_IPV4; 3657 do_lro = 0; 3658 } else if (l4proto != 3659 IPPROTO_TCP) { 3660 hash_type = def_htype; 3661 do_lro = 0; 3662 } 3663 } else { 3664 hash_type = def_htype; 3665 do_lro = 0; 3666 } 3667 } 3668 break; 3669 3670 case NDIS_HASH_IPV6: 3671 hash_type = M_HASHTYPE_RSS_IPV6; 3672 do_lro = 0; 3673 break; 3674 3675 case NDIS_HASH_IPV6_EX: 3676 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3677 do_lro = 0; 3678 break; 3679 3680 case NDIS_HASH_TCP_IPV6: 3681 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3682 break; 3683 3684 case NDIS_HASH_TCP_IPV6_EX: 3685 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3686 break; 3687 } 3688 } 3689 } else if (!is_vf) { 3690 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3691 hash_type = M_HASHTYPE_OPAQUE; 3692 } 3693 M_HASHTYPE_SET(m_new, hash_type); 3694 3695 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3696 if (hn_ifp != ifp) { 3697 const struct ether_header *eh; 3698 3699 /* 3700 * Non-transparent mode VF is activated. 3701 */ 3702 3703 /* 3704 * Allow tapping on hn(4). 3705 */ 3706 ETHER_BPF_MTAP(hn_ifp, m_new); 3707 3708 /* 3709 * Update hn(4)'s stats. 3710 */ 3711 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3712 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3713 /* Checked at the beginning of this function. */ 3714 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3715 eh = mtod(m_new, struct ether_header *); 3716 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3717 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3718 } 3719 rxr->hn_pkts++; 3720 3721 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3722 #if defined(INET) || defined(INET6) 3723 struct lro_ctrl *lro = &rxr->hn_lro; 3724 3725 if (lro->lro_cnt) { 3726 rxr->hn_lro_tried++; 3727 if (hn_lro_rx(lro, m_new) == 0) { 3728 /* DONE!
*/ 3729 return 0; 3730 } 3731 } 3732 #endif 3733 } 3734 ifp->if_input(ifp, m_new); 3735 3736 return (0); 3737 } 3738 3739 static int 3740 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3741 { 3742 struct hn_softc *sc = ifp->if_softc; 3743 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3744 struct ifnet *vf_ifp; 3745 int mask, error = 0; 3746 struct ifrsskey *ifrk; 3747 struct ifrsshash *ifrh; 3748 uint32_t mtu; 3749 3750 switch (cmd) { 3751 case SIOCSIFMTU: 3752 if (ifr->ifr_mtu > HN_MTU_MAX) { 3753 error = EINVAL; 3754 break; 3755 } 3756 3757 HN_LOCK(sc); 3758 3759 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3760 HN_UNLOCK(sc); 3761 break; 3762 } 3763 3764 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3765 /* Can't change MTU */ 3766 HN_UNLOCK(sc); 3767 error = EOPNOTSUPP; 3768 break; 3769 } 3770 3771 if (ifp->if_mtu == ifr->ifr_mtu) { 3772 HN_UNLOCK(sc); 3773 break; 3774 } 3775 3776 if (hn_xpnt_vf_isready(sc)) { 3777 vf_ifp = sc->hn_vf_ifp; 3778 ifr_vf = *ifr; 3779 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3780 sizeof(ifr_vf.ifr_name)); 3781 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3782 (caddr_t)&ifr_vf); 3783 if (error) { 3784 HN_UNLOCK(sc); 3785 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3786 vf_ifp->if_xname, ifr->ifr_mtu, error); 3787 break; 3788 } 3789 } 3790 3791 /* 3792 * Suspend this interface before the synthetic parts 3793 * are ripped. 3794 */ 3795 hn_suspend(sc); 3796 3797 /* 3798 * Detach the synthetic parts, i.e. NVS and RNDIS. 3799 */ 3800 hn_synth_detach(sc); 3801 3802 /* 3803 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3804 * with the new MTU setting. 3805 */ 3806 error = hn_synth_attach(sc, ifr->ifr_mtu); 3807 if (error) { 3808 HN_UNLOCK(sc); 3809 break; 3810 } 3811 3812 error = hn_rndis_get_mtu(sc, &mtu); 3813 if (error) 3814 mtu = ifr->ifr_mtu; 3815 else if (bootverbose) 3816 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3817 3818 /* 3819 * Commit the requested MTU, after the synthetic parts 3820 * have been successfully attached. 3821 */ 3822 if (mtu >= ifr->ifr_mtu) { 3823 mtu = ifr->ifr_mtu; 3824 } else { 3825 if_printf(ifp, "fixup mtu %d -> %u\n", 3826 ifr->ifr_mtu, mtu); 3827 } 3828 ifp->if_mtu = mtu; 3829 3830 /* 3831 * Synthetic parts' reattach may change the chimney 3832 * sending size; update it. 3833 */ 3834 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3835 hn_set_chim_size(sc, sc->hn_chim_szmax); 3836 3837 /* 3838 * Make sure that various parameters based on MTU are 3839 * still valid after the MTU change. 3840 */ 3841 hn_mtu_change_fixup(sc); 3842 3843 /* 3844 * All done! Resume the interface now. 3845 */ 3846 hn_resume(sc); 3847 3848 if ((sc->hn_flags & HN_FLAG_RXVF) || 3849 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3850 /* 3851 * Since we have reattached the NVS part, 3852 * change the datapath to VF again, in case 3853 * it was lost after the NVS was detached. 3854 */ 3855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3856 } 3857 3858 HN_UNLOCK(sc); 3859 break; 3860 3861 case SIOCSIFFLAGS: 3862 HN_LOCK(sc); 3863 3864 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3865 HN_UNLOCK(sc); 3866 break; 3867 } 3868 3869 if (hn_xpnt_vf_isready(sc)) 3870 hn_xpnt_vf_saveifflags(sc); 3871 3872 if (ifp->if_flags & IFF_UP) { 3873 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3874 /* 3875 * Caller might hold a mutex, e.g. 3876 * bpf; use busy-wait for the RNDIS 3877 * reply.
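 * (HN_NO_SLEEPING() below switches the RNDIS request path to busy-wait.)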
3878 */ 3879 HN_NO_SLEEPING(sc); 3880 hn_rxfilter_config(sc); 3881 HN_SLEEPING_OK(sc); 3882 3883 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3884 error = hn_xpnt_vf_iocsetflags(sc); 3885 } else { 3886 hn_init_locked(sc); 3887 } 3888 } else { 3889 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3890 hn_stop(sc, false); 3891 } 3892 sc->hn_if_flags = ifp->if_flags; 3893 3894 HN_UNLOCK(sc); 3895 break; 3896 3897 case SIOCSIFCAP: 3898 HN_LOCK(sc); 3899 3900 if (hn_xpnt_vf_isready(sc)) { 3901 ifr_vf = *ifr; 3902 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3903 sizeof(ifr_vf.ifr_name)); 3904 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3905 HN_UNLOCK(sc); 3906 break; 3907 } 3908 3909 /* 3910 * Fix up requested capabilities w/ supported capabilities, 3911 * since the supported capabilities could have been changed. 3912 */ 3913 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3914 ifp->if_capenable; 3915 3916 if (mask & IFCAP_TXCSUM) { 3917 ifp->if_capenable ^= IFCAP_TXCSUM; 3918 if (ifp->if_capenable & IFCAP_TXCSUM) 3919 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3920 else 3921 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3922 } 3923 if (mask & IFCAP_TXCSUM_IPV6) { 3924 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3925 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3926 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3927 else 3928 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3929 } 3930 3931 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3932 if (mask & IFCAP_RXCSUM) 3933 ifp->if_capenable ^= IFCAP_RXCSUM; 3934 #ifdef foo 3935 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3936 if (mask & IFCAP_RXCSUM_IPV6) 3937 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3938 #endif 3939 3940 if (mask & IFCAP_LRO) 3941 ifp->if_capenable ^= IFCAP_LRO; 3942 3943 if (mask & IFCAP_TSO4) { 3944 ifp->if_capenable ^= IFCAP_TSO4; 3945 if (ifp->if_capenable & IFCAP_TSO4) 3946 ifp->if_hwassist |= CSUM_IP_TSO; 3947 else 3948 ifp->if_hwassist &= ~CSUM_IP_TSO; 3949 } 3950 if (mask & IFCAP_TSO6) { 3951 ifp->if_capenable ^= IFCAP_TSO6; 3952 if (ifp->if_capenable & IFCAP_TSO6) 3953 ifp->if_hwassist |= CSUM_IP6_TSO; 3954 else 3955 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3956 } 3957 3958 HN_UNLOCK(sc); 3959 break; 3960 3961 case SIOCADDMULTI: 3962 case SIOCDELMULTI: 3963 HN_LOCK(sc); 3964 3965 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3966 HN_UNLOCK(sc); 3967 break; 3968 } 3969 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3970 /* 3971 * Multicast uses mutex; use busy-wait for 3972 * the RNDIS reply. 3973 */ 3974 HN_NO_SLEEPING(sc); 3975 hn_rxfilter_config(sc); 3976 HN_SLEEPING_OK(sc); 3977 } 3978 3979 /* XXX vlan(4) style mcast addr maintenance */ 3980 if (hn_xpnt_vf_isready(sc)) { 3981 int old_if_flags; 3982 3983 old_if_flags = sc->hn_vf_ifp->if_flags; 3984 hn_xpnt_vf_saveifflags(sc); 3985 3986 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3987 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3988 IFF_ALLMULTI)) 3989 error = hn_xpnt_vf_iocsetflags(sc); 3990 } 3991 3992 HN_UNLOCK(sc); 3993 break; 3994 3995 case SIOCSIFMEDIA: 3996 case SIOCGIFMEDIA: 3997 HN_LOCK(sc); 3998 if (hn_xpnt_vf_isready(sc)) { 3999 /* 4000 * SIOCGIFMEDIA expects ifmediareq, so don't 4001 * create and pass ifr_vf to the VF here; just 4002 * replace the ifr_name. 4003 */ 4004 vf_ifp = sc->hn_vf_ifp; 4005 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4006 sizeof(ifr->ifr_name)); 4007 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4008 /* Restore the ifr_name. 
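It was overwritten with the VF's name when the request was forwarded above.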
*/ 4009 strlcpy(ifr->ifr_name, ifp->if_xname, 4010 sizeof(ifr->ifr_name)); 4011 HN_UNLOCK(sc); 4012 break; 4013 } 4014 HN_UNLOCK(sc); 4015 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4016 break; 4017 4018 case SIOCGIFRSSHASH: 4019 ifrh = (struct ifrsshash *)data; 4020 HN_LOCK(sc); 4021 if (sc->hn_rx_ring_inuse == 1) { 4022 HN_UNLOCK(sc); 4023 ifrh->ifrh_func = RSS_FUNC_NONE; 4024 ifrh->ifrh_types = 0; 4025 break; 4026 } 4027 4028 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4029 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4030 else 4031 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4032 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4033 HN_UNLOCK(sc); 4034 break; 4035 4036 case SIOCGIFRSSKEY: 4037 ifrk = (struct ifrsskey *)data; 4038 HN_LOCK(sc); 4039 if (sc->hn_rx_ring_inuse == 1) { 4040 HN_UNLOCK(sc); 4041 ifrk->ifrk_func = RSS_FUNC_NONE; 4042 ifrk->ifrk_keylen = 0; 4043 break; 4044 } 4045 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4046 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4047 else 4048 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4049 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4050 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4051 NDIS_HASH_KEYSIZE_TOEPLITZ); 4052 HN_UNLOCK(sc); 4053 break; 4054 4055 default: 4056 error = ether_ioctl(ifp, cmd, data); 4057 break; 4058 } 4059 return (error); 4060 } 4061 4062 static void 4063 hn_stop(struct hn_softc *sc, bool detaching) 4064 { 4065 struct ifnet *ifp = sc->hn_ifp; 4066 int i; 4067 4068 HN_LOCK_ASSERT(sc); 4069 4070 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4071 ("synthetic parts were not attached")); 4072 4073 /* Clear RUNNING bit ASAP. */ 4074 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4075 4076 /* Disable polling. */ 4077 hn_polling(sc, 0); 4078 4079 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4080 KASSERT(sc->hn_vf_ifp != NULL, 4081 ("%s: VF is not attached", ifp->if_xname)); 4082 4083 /* Mark transparent mode VF as disabled. */ 4084 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4085 4086 /* 4087 * NOTE: 4088 * Datapath setting must happen _before_ bringing 4089 * the VF down. 4090 */ 4091 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4092 4093 /* 4094 * Bring the VF down. 4095 */ 4096 hn_xpnt_vf_saveifflags(sc); 4097 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4098 hn_xpnt_vf_iocsetflags(sc); 4099 } 4100 4101 /* Suspend data transfers. */ 4102 hn_suspend_data(sc); 4103 4104 /* Clear OACTIVE bit. */ 4105 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4106 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4107 sc->hn_tx_ring[i].hn_oactive = 0; 4108 4109 /* 4110 * If the non-transparent mode VF is active, make sure 4111 * that the RX filter still allows packet reception. 4112 */ 4113 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4114 hn_rxfilter_config(sc); 4115 } 4116 4117 static void 4118 hn_init_locked(struct hn_softc *sc) 4119 { 4120 struct ifnet *ifp = sc->hn_ifp; 4121 int i; 4122 4123 HN_LOCK_ASSERT(sc); 4124 4125 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4126 return; 4127 4128 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4129 return; 4130 4131 /* Configure RX filter */ 4132 hn_rxfilter_config(sc); 4133 4134 /* Clear OACTIVE bit. */ 4135 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4136 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4137 sc->hn_tx_ring[i].hn_oactive = 0; 4138 4139 /* Clear TX 'suspended' bit. */ 4140 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4141 4142 if (hn_xpnt_vf_isready(sc)) { 4143 /* Initialize transparent VF. 
*/ 4144 hn_xpnt_vf_init(sc); 4145 } 4146 4147 /* Everything is ready; unleash! */ 4148 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4149 4150 /* Re-enable polling if requested. */ 4151 if (sc->hn_pollhz > 0) 4152 hn_polling(sc, sc->hn_pollhz); 4153 } 4154 4155 static void 4156 hn_init(void *xsc) 4157 { 4158 struct hn_softc *sc = xsc; 4159 4160 HN_LOCK(sc); 4161 hn_init_locked(sc); 4162 HN_UNLOCK(sc); 4163 } 4164 4165 #if __FreeBSD_version >= 1100099 4166 4167 static int 4168 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4169 { 4170 struct hn_softc *sc = arg1; 4171 unsigned int lenlim; 4172 int error; 4173 4174 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4175 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4176 if (error || req->newptr == NULL) 4177 return error; 4178 4179 HN_LOCK(sc); 4180 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4181 lenlim > TCP_LRO_LENGTH_MAX) { 4182 HN_UNLOCK(sc); 4183 return EINVAL; 4184 } 4185 hn_set_lro_lenlim(sc, lenlim); 4186 HN_UNLOCK(sc); 4187 4188 return 0; 4189 } 4190 4191 static int 4192 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4193 { 4194 struct hn_softc *sc = arg1; 4195 int ackcnt, error, i; 4196 4197 /* 4198 * lro_ackcnt_lim is append count limit, 4199 * +1 to turn it into aggregation limit. 4200 */ 4201 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4202 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4203 if (error || req->newptr == NULL) 4204 return error; 4205 4206 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4207 return EINVAL; 4208 4209 /* 4210 * Convert aggregation limit back to append 4211 * count limit. 4212 */ 4213 --ackcnt; 4214 HN_LOCK(sc); 4215 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4216 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4217 HN_UNLOCK(sc); 4218 return 0; 4219 } 4220 4221 #endif 4222 4223 static int 4224 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4225 { 4226 struct hn_softc *sc = arg1; 4227 int hcsum = arg2; 4228 int on, error, i; 4229 4230 on = 0; 4231 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4232 on = 1; 4233 4234 error = sysctl_handle_int(oidp, &on, 0, req); 4235 if (error || req->newptr == NULL) 4236 return error; 4237 4238 HN_LOCK(sc); 4239 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4240 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4241 4242 if (on) 4243 rxr->hn_trust_hcsum |= hcsum; 4244 else 4245 rxr->hn_trust_hcsum &= ~hcsum; 4246 } 4247 HN_UNLOCK(sc); 4248 return 0; 4249 } 4250 4251 static int 4252 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4253 { 4254 struct hn_softc *sc = arg1; 4255 int chim_size, error; 4256 4257 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4258 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4259 if (error || req->newptr == NULL) 4260 return error; 4261 4262 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4263 return EINVAL; 4264 4265 HN_LOCK(sc); 4266 hn_set_chim_size(sc, chim_size); 4267 HN_UNLOCK(sc); 4268 return 0; 4269 } 4270 4271 #if __FreeBSD_version < 1100095 4272 static int 4273 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4274 { 4275 struct hn_softc *sc = arg1; 4276 int ofs = arg2, i, error; 4277 struct hn_rx_ring *rxr; 4278 uint64_t stat; 4279 4280 stat = 0; 4281 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4282 rxr = &sc->hn_rx_ring[i]; 4283 stat += *((int *)((uint8_t *)rxr + ofs)); 4284 } 4285 4286 error = sysctl_handle_64(oidp, &stat, 0, req); 4287 if (error || req->newptr == NULL) 4288 return error; 4289 4290 /* Zero out this stat. 
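Any write through this sysctl resets the per-ring counters.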
*/ 4291 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4292 rxr = &sc->hn_rx_ring[i]; 4293 *((int *)((uint8_t *)rxr + ofs)) = 0; 4294 } 4295 return 0; 4296 } 4297 #else 4298 static int 4299 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4300 { 4301 struct hn_softc *sc = arg1; 4302 int ofs = arg2, i, error; 4303 struct hn_rx_ring *rxr; 4304 uint64_t stat; 4305 4306 stat = 0; 4307 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4308 rxr = &sc->hn_rx_ring[i]; 4309 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4310 } 4311 4312 error = sysctl_handle_64(oidp, &stat, 0, req); 4313 if (error || req->newptr == NULL) 4314 return error; 4315 4316 /* Zero out this stat. */ 4317 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4318 rxr = &sc->hn_rx_ring[i]; 4319 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4320 } 4321 return 0; 4322 } 4323 4324 #endif 4325 4326 static int 4327 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4328 { 4329 struct hn_softc *sc = arg1; 4330 int ofs = arg2, i, error; 4331 struct hn_rx_ring *rxr; 4332 u_long stat; 4333 4334 stat = 0; 4335 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4336 rxr = &sc->hn_rx_ring[i]; 4337 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4338 } 4339 4340 error = sysctl_handle_long(oidp, &stat, 0, req); 4341 if (error || req->newptr == NULL) 4342 return error; 4343 4344 /* Zero out this stat. */ 4345 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4346 rxr = &sc->hn_rx_ring[i]; 4347 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4348 } 4349 return 0; 4350 } 4351 4352 static int 4353 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4354 { 4355 struct hn_softc *sc = arg1; 4356 int ofs = arg2, i, error; 4357 struct hn_tx_ring *txr; 4358 u_long stat; 4359 4360 stat = 0; 4361 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4362 txr = &sc->hn_tx_ring[i]; 4363 stat += *((u_long *)((uint8_t *)txr + ofs)); 4364 } 4365 4366 error = sysctl_handle_long(oidp, &stat, 0, req); 4367 if (error || req->newptr == NULL) 4368 return error; 4369 4370 /* Zero out this stat. 
*/ 4371 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4372 txr = &sc->hn_tx_ring[i]; 4373 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4374 } 4375 return 0; 4376 } 4377 4378 static int 4379 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4380 { 4381 struct hn_softc *sc = arg1; 4382 int ofs = arg2, i, error, conf; 4383 struct hn_tx_ring *txr; 4384 4385 txr = &sc->hn_tx_ring[0]; 4386 conf = *((int *)((uint8_t *)txr + ofs)); 4387 4388 error = sysctl_handle_int(oidp, &conf, 0, req); 4389 if (error || req->newptr == NULL) 4390 return error; 4391 4392 HN_LOCK(sc); 4393 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4394 txr = &sc->hn_tx_ring[i]; 4395 *((int *)((uint8_t *)txr + ofs)) = conf; 4396 } 4397 HN_UNLOCK(sc); 4398 4399 return 0; 4400 } 4401 4402 static int 4403 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4404 { 4405 struct hn_softc *sc = arg1; 4406 int error, size; 4407 4408 size = sc->hn_agg_size; 4409 error = sysctl_handle_int(oidp, &size, 0, req); 4410 if (error || req->newptr == NULL) 4411 return (error); 4412 4413 HN_LOCK(sc); 4414 sc->hn_agg_size = size; 4415 hn_set_txagg(sc); 4416 HN_UNLOCK(sc); 4417 4418 return (0); 4419 } 4420 4421 static int 4422 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4423 { 4424 struct hn_softc *sc = arg1; 4425 int error, pkts; 4426 4427 pkts = sc->hn_agg_pkts; 4428 error = sysctl_handle_int(oidp, &pkts, 0, req); 4429 if (error || req->newptr == NULL) 4430 return (error); 4431 4432 HN_LOCK(sc); 4433 sc->hn_agg_pkts = pkts; 4434 hn_set_txagg(sc); 4435 HN_UNLOCK(sc); 4436 4437 return (0); 4438 } 4439 4440 static int 4441 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4442 { 4443 struct hn_softc *sc = arg1; 4444 int pkts; 4445 4446 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4447 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4448 } 4449 4450 static int 4451 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4452 { 4453 struct hn_softc *sc = arg1; 4454 int align; 4455 4456 align = sc->hn_tx_ring[0].hn_agg_align; 4457 return (sysctl_handle_int(oidp, &align, 0, req)); 4458 } 4459 4460 static void 4461 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4462 { 4463 if (pollhz == 0) 4464 vmbus_chan_poll_disable(chan); 4465 else 4466 vmbus_chan_poll_enable(chan, pollhz); 4467 } 4468 4469 static void 4470 hn_polling(struct hn_softc *sc, u_int pollhz) 4471 { 4472 int nsubch = sc->hn_rx_ring_inuse - 1; 4473 4474 HN_LOCK_ASSERT(sc); 4475 4476 if (nsubch > 0) { 4477 struct vmbus_channel **subch; 4478 int i; 4479 4480 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4481 for (i = 0; i < nsubch; ++i) 4482 hn_chan_polling(subch[i], pollhz); 4483 vmbus_subchan_rel(subch, nsubch); 4484 } 4485 hn_chan_polling(sc->hn_prichan, pollhz); 4486 } 4487 4488 static int 4489 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4490 { 4491 struct hn_softc *sc = arg1; 4492 int pollhz, error; 4493 4494 pollhz = sc->hn_pollhz; 4495 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4496 if (error || req->newptr == NULL) 4497 return (error); 4498 4499 if (pollhz != 0 && 4500 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4501 return (EINVAL); 4502 4503 HN_LOCK(sc); 4504 if (sc->hn_pollhz != pollhz) { 4505 sc->hn_pollhz = pollhz; 4506 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4507 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4508 hn_polling(sc, sc->hn_pollhz); 4509 } 4510 HN_UNLOCK(sc); 4511 4512 return (0); 4513 } 4514 4515 static int 4516 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4517 { 4518 struct hn_softc *sc = arg1; 4519 char verstr[16]; 4520 4521 snprintf(verstr, sizeof(verstr), "%u.%u", 4522 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4523 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4524 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4525 } 4526 4527 static int 4528 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4529 { 4530 struct hn_softc *sc = arg1; 4531 char caps_str[128]; 4532 uint32_t caps; 4533 4534 HN_LOCK(sc); 4535 caps = sc->hn_caps; 4536 HN_UNLOCK(sc); 4537 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4538 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4539 } 4540 4541 static int 4542 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4543 { 4544 struct hn_softc *sc = arg1; 4545 char assist_str[128]; 4546 uint32_t hwassist; 4547 4548 HN_LOCK(sc); 4549 hwassist = sc->hn_ifp->if_hwassist; 4550 HN_UNLOCK(sc); 4551 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4552 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4553 } 4554 4555 static int 4556 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4557 { 4558 struct hn_softc *sc = arg1; 4559 char filter_str[128]; 4560 uint32_t filter; 4561 4562 HN_LOCK(sc); 4563 filter = sc->hn_rx_filter; 4564 HN_UNLOCK(sc); 4565 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4566 NDIS_PACKET_TYPES); 4567 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4568 } 4569 4570 #ifndef RSS 4571 4572 static int 4573 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4574 { 4575 struct hn_softc *sc = arg1; 4576 int error; 4577 4578 HN_LOCK(sc); 4579 4580 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4581 if (error || req->newptr == NULL) 4582 goto back; 4583 4584 if ((sc->hn_flags & HN_FLAG_RXVF) || 4585 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4586 /* 4587 * RSS key is synchronized w/ VF's, don't allow users 4588 * to change it. 4589 */ 4590 error = EBUSY; 4591 goto back; 4592 } 4593 4594 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4595 if (error) 4596 goto back; 4597 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4598 4599 if (sc->hn_rx_ring_inuse > 1) { 4600 error = hn_rss_reconfig(sc); 4601 } else { 4602 /* Not RSS capable, at least for now; just save the RSS key. */ 4603 error = 0; 4604 } 4605 back: 4606 HN_UNLOCK(sc); 4607 return (error); 4608 } 4609 4610 static int 4611 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4612 { 4613 struct hn_softc *sc = arg1; 4614 int error; 4615 4616 HN_LOCK(sc); 4617 4618 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4619 if (error || req->newptr == NULL) 4620 goto back; 4621 4622 /* 4623 * Don't allow RSS indirect table change, if this interface is not 4624 * RSS capable currently. 
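* With only one RX ring in use there is nothing to redirect.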
4625 */ 4626 if (sc->hn_rx_ring_inuse == 1) { 4627 error = EOPNOTSUPP; 4628 goto back; 4629 } 4630 4631 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4632 if (error) 4633 goto back; 4634 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4635 4636 hn_rss_ind_fixup(sc); 4637 error = hn_rss_reconfig(sc); 4638 back: 4639 HN_UNLOCK(sc); 4640 return (error); 4641 } 4642 4643 #endif /* !RSS */ 4644 4645 static int 4646 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4647 { 4648 struct hn_softc *sc = arg1; 4649 char hash_str[128]; 4650 uint32_t hash; 4651 4652 HN_LOCK(sc); 4653 hash = sc->hn_rss_hash; 4654 HN_UNLOCK(sc); 4655 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4656 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4657 } 4658 4659 static int 4660 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4661 { 4662 struct hn_softc *sc = arg1; 4663 char hash_str[128]; 4664 uint32_t hash; 4665 4666 HN_LOCK(sc); 4667 hash = sc->hn_rss_hcap; 4668 HN_UNLOCK(sc); 4669 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4670 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4671 } 4672 4673 static int 4674 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4675 { 4676 struct hn_softc *sc = arg1; 4677 char hash_str[128]; 4678 uint32_t hash; 4679 4680 HN_LOCK(sc); 4681 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4682 HN_UNLOCK(sc); 4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4685 } 4686 4687 static int 4688 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4689 { 4690 struct hn_softc *sc = arg1; 4691 char vf_name[IFNAMSIZ + 1]; 4692 struct ifnet *vf_ifp; 4693 4694 HN_LOCK(sc); 4695 vf_name[0] = '\0'; 4696 vf_ifp = sc->hn_vf_ifp; 4697 if (vf_ifp != NULL) 4698 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4699 HN_UNLOCK(sc); 4700 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4701 } 4702 4703 static int 4704 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4705 { 4706 struct hn_softc *sc = arg1; 4707 char vf_name[IFNAMSIZ + 1]; 4708 struct ifnet *vf_ifp; 4709 4710 HN_LOCK(sc); 4711 vf_name[0] = '\0'; 4712 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4713 if (vf_ifp != NULL) 4714 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4715 HN_UNLOCK(sc); 4716 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4717 } 4718 4719 static int 4720 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4721 { 4722 struct rm_priotracker pt; 4723 struct sbuf *sb; 4724 int error, i; 4725 bool first; 4726 4727 error = sysctl_wire_old_buffer(req, 0); 4728 if (error != 0) 4729 return (error); 4730 4731 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4732 if (sb == NULL) 4733 return (ENOMEM); 4734 4735 rm_rlock(&hn_vfmap_lock, &pt); 4736 4737 first = true; 4738 for (i = 0; i < hn_vfmap_size; ++i) { 4739 struct epoch_tracker et; 4740 struct ifnet *ifp; 4741 4742 if (hn_vfmap[i] == NULL) 4743 continue; 4744 4745 NET_EPOCH_ENTER(et); 4746 ifp = ifnet_byindex(i); 4747 if (ifp != NULL) { 4748 if (first) 4749 sbuf_printf(sb, "%s", ifp->if_xname); 4750 else 4751 sbuf_printf(sb, " %s", ifp->if_xname); 4752 first = false; 4753 } 4754 NET_EPOCH_EXIT(et); 4755 } 4756 4757 rm_runlock(&hn_vfmap_lock, &pt); 4758 4759 error = sbuf_finish(sb); 4760 sbuf_delete(sb); 4761 return (error); 4762 } 4763 4764 static int 4765 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4766 { 4767 struct rm_priotracker pt; 4768 struct sbuf *sb; 4769 int error, i; 4770 bool first; 4771 4772 error = 
sysctl_wire_old_buffer(req, 0); 4773 if (error != 0) 4774 return (error); 4775 4776 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4777 if (sb == NULL) 4778 return (ENOMEM); 4779 4780 rm_rlock(&hn_vfmap_lock, &pt); 4781 4782 first = true; 4783 for (i = 0; i < hn_vfmap_size; ++i) { 4784 struct epoch_tracker et; 4785 struct ifnet *ifp, *hn_ifp; 4786 4787 hn_ifp = hn_vfmap[i]; 4788 if (hn_ifp == NULL) 4789 continue; 4790 4791 NET_EPOCH_ENTER(et); 4792 ifp = ifnet_byindex(i); 4793 if (ifp != NULL) { 4794 if (first) { 4795 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4796 hn_ifp->if_xname); 4797 } else { 4798 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4799 hn_ifp->if_xname); 4800 } 4801 first = false; 4802 } 4803 NET_EPOCH_EXIT(et); 4804 } 4805 4806 rm_runlock(&hn_vfmap_lock, &pt); 4807 4808 error = sbuf_finish(sb); 4809 sbuf_delete(sb); 4810 return (error); 4811 } 4812 4813 static int 4814 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4815 { 4816 struct hn_softc *sc = arg1; 4817 int error, onoff = 0; 4818 4819 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4820 onoff = 1; 4821 error = sysctl_handle_int(oidp, &onoff, 0, req); 4822 if (error || req->newptr == NULL) 4823 return (error); 4824 4825 HN_LOCK(sc); 4826 /* NOTE: hn_vf_lock for hn_transmit() */ 4827 rm_wlock(&sc->hn_vf_lock); 4828 if (onoff) 4829 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4830 else 4831 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4832 rm_wunlock(&sc->hn_vf_lock); 4833 HN_UNLOCK(sc); 4834 4835 return (0); 4836 } 4837 4838 static int 4839 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4840 { 4841 struct hn_softc *sc = arg1; 4842 int enabled = 0; 4843 4844 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4845 enabled = 1; 4846 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4847 } 4848 4849 static int 4850 hn_check_iplen(const struct mbuf *m, int hoff) 4851 { 4852 const struct ip *ip; 4853 int len, iphlen, iplen; 4854 const struct tcphdr *th; 4855 int thoff; /* TCP data offset */ 4856 4857 len = hoff + sizeof(struct ip); 4858 4859 /* The packet must be at least the size of an IP header. */ 4860 if (m->m_pkthdr.len < len) 4861 return IPPROTO_DONE; 4862 4863 /* The fixed IP header must reside completely in the first mbuf. */ 4864 if (m->m_len < len) 4865 return IPPROTO_DONE; 4866 4867 ip = mtodo(m, hoff); 4868 4869 /* Bound check the packet's stated IP header length. */ 4870 iphlen = ip->ip_hl << 2; 4871 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4872 return IPPROTO_DONE; 4873 4874 /* The full IP header must reside completely in the one mbuf. */ 4875 if (m->m_len < hoff + iphlen) 4876 return IPPROTO_DONE; 4877 4878 iplen = ntohs(ip->ip_len); 4879 4880 /* 4881 * Check that the amount of data in the buffers is as 4882 * at least much as the IP header would have us expect. 4883 */ 4884 if (m->m_pkthdr.len < hoff + iplen) 4885 return IPPROTO_DONE; 4886 4887 /* 4888 * Ignore IP fragments. 4889 */ 4890 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4891 return IPPROTO_DONE; 4892 4893 /* 4894 * The TCP/IP or UDP/IP header must be entirely contained within 4895 * the first fragment of a packet. 
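* Otherwise the packet is left unclassified and IPPROTO_DONE is returned.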
4896 */ 4897 switch (ip->ip_p) { 4898 case IPPROTO_TCP: 4899 if (iplen < iphlen + sizeof(struct tcphdr)) 4900 return IPPROTO_DONE; 4901 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4902 return IPPROTO_DONE; 4903 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4904 thoff = th->th_off << 2; 4905 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4906 return IPPROTO_DONE; 4907 if (m->m_len < hoff + iphlen + thoff) 4908 return IPPROTO_DONE; 4909 break; 4910 case IPPROTO_UDP: 4911 if (iplen < iphlen + sizeof(struct udphdr)) 4912 return IPPROTO_DONE; 4913 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4914 return IPPROTO_DONE; 4915 break; 4916 default: 4917 if (iplen < iphlen) 4918 return IPPROTO_DONE; 4919 break; 4920 } 4921 return ip->ip_p; 4922 } 4923 4924 static void 4925 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4926 { 4927 const struct ether_header *eh; 4928 uint16_t etype; 4929 int hoff; 4930 4931 hoff = sizeof(*eh); 4932 /* Checked by the caller. */ 4933 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4934 4935 eh = mtod(m_new, const struct ether_header *); 4936 etype = ntohs(eh->ether_type); 4937 if (etype == ETHERTYPE_VLAN) { 4938 const struct ether_vlan_header *evl; 4939 4940 hoff = sizeof(*evl); 4941 if (m_new->m_len < hoff) 4942 return; 4943 evl = mtod(m_new, const struct ether_vlan_header *); 4944 etype = ntohs(evl->evl_proto); 4945 } 4946 *l3proto = etype; 4947 4948 if (etype == ETHERTYPE_IP) 4949 *l4proto = hn_check_iplen(m_new, hoff); 4950 else 4951 *l4proto = IPPROTO_DONE; 4952 } 4953 4954 static int 4955 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4956 { 4957 struct sysctl_oid_list *child; 4958 struct sysctl_ctx_list *ctx; 4959 device_t dev = sc->hn_dev; 4960 #if defined(INET) || defined(INET6) 4961 #if __FreeBSD_version >= 1100095 4962 int lroent_cnt; 4963 #endif 4964 #endif 4965 int i; 4966 4967 /* 4968 * Create RXBUF for reception. 4969 * 4970 * NOTE: 4971 * - It is shared by all channels. 4972 * - A large enough buffer is allocated; certain versions of NVS 4973 * may further limit the usable space.
4974 */ 4975 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4976 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4977 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4978 if (sc->hn_rxbuf == NULL) { 4979 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4980 return (ENOMEM); 4981 } 4982 4983 sc->hn_rx_ring_cnt = ring_cnt; 4984 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4985 4986 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4987 M_DEVBUF, M_WAITOK | M_ZERO); 4988 4989 #if defined(INET) || defined(INET6) 4990 #if __FreeBSD_version >= 1100095 4991 lroent_cnt = hn_lro_entry_count; 4992 if (lroent_cnt < TCP_LRO_ENTRIES) 4993 lroent_cnt = TCP_LRO_ENTRIES; 4994 if (bootverbose) 4995 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4996 #endif 4997 #endif /* INET || INET6 */ 4998 4999 ctx = device_get_sysctl_ctx(dev); 5000 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5001 5002 /* Create dev.hn.UNIT.rx sysctl tree */ 5003 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5004 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5005 5006 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5007 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5008 5009 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5010 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5011 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5012 if (rxr->hn_br == NULL) { 5013 device_printf(dev, "allocate bufring failed\n"); 5014 return (ENOMEM); 5015 } 5016 5017 if (hn_trust_hosttcp) 5018 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5019 if (hn_trust_hostudp) 5020 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5021 if (hn_trust_hostip) 5022 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5023 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5024 rxr->hn_ifp = sc->hn_ifp; 5025 if (i < sc->hn_tx_ring_cnt) 5026 rxr->hn_txr = &sc->hn_tx_ring[i]; 5027 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5028 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5029 rxr->hn_rx_idx = i; 5030 rxr->hn_rxbuf = sc->hn_rxbuf; 5031 5032 /* 5033 * Initialize LRO. 
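* The length and ACK-count limits set below can be tuned at runtime through the lro_length_lim and lro_ackcnt_lim sysctls.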
5034 */ 5035 #if defined(INET) || defined(INET6) 5036 #if __FreeBSD_version >= 1100095 5037 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5038 hn_lro_mbufq_depth); 5039 #else 5040 tcp_lro_init(&rxr->hn_lro); 5041 rxr->hn_lro.ifp = sc->hn_ifp; 5042 #endif 5043 #if __FreeBSD_version >= 1100099 5044 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5045 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5046 #endif 5047 #endif /* INET || INET6 */ 5048 5049 if (sc->hn_rx_sysctl_tree != NULL) { 5050 char name[16]; 5051 5052 /* 5053 * Create per RX ring sysctl tree: 5054 * dev.hn.UNIT.rx.RINGID 5055 */ 5056 snprintf(name, sizeof(name), "%d", i); 5057 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5058 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5059 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5060 5061 if (rxr->hn_rx_sysctl_tree != NULL) { 5062 SYSCTL_ADD_ULONG(ctx, 5063 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5064 OID_AUTO, "packets", 5065 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5066 "# of packets received"); 5067 SYSCTL_ADD_ULONG(ctx, 5068 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5069 OID_AUTO, "rss_pkts", 5070 CTLFLAG_RW | CTLFLAG_STATS, 5071 &rxr->hn_rss_pkts, 5072 "# of packets w/ RSS info received"); 5073 SYSCTL_ADD_ULONG(ctx, 5074 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5075 OID_AUTO, "rsc_pkts", 5076 CTLFLAG_RW | CTLFLAG_STATS, 5077 &rxr->hn_rsc_pkts, 5078 "# of RSC packets received"); 5079 SYSCTL_ADD_ULONG(ctx, 5080 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5081 OID_AUTO, "rsc_drop", 5082 CTLFLAG_RW | CTLFLAG_STATS, 5083 &rxr->hn_rsc_drop, 5084 "# of RSC fragments dropped"); 5085 SYSCTL_ADD_INT(ctx, 5086 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5087 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5088 &rxr->hn_pktbuf_len, 0, 5089 "Temporary channel packet buffer length"); 5090 } 5091 } 5092 } 5093 5094 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5095 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5096 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5097 #if __FreeBSD_version < 1100095 5098 hn_rx_stat_int_sysctl, 5099 #else 5100 hn_rx_stat_u64_sysctl, 5101 #endif 5102 "LU", "LRO queued"); 5103 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5104 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5105 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5106 #if __FreeBSD_version < 1100095 5107 hn_rx_stat_int_sysctl, 5108 #else 5109 hn_rx_stat_u64_sysctl, 5110 #endif 5111 "LU", "LRO flushed"); 5112 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5113 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5114 __offsetof(struct hn_rx_ring, hn_lro_tried), 5115 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5116 #if __FreeBSD_version >= 1100099 5117 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5118 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5119 hn_lro_lenlim_sysctl, "IU", 5120 "Max # of data bytes to be aggregated by LRO"); 5121 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5122 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5123 hn_lro_ackcnt_sysctl, "I", 5124 "Max # of ACKs to be aggregated by LRO"); 5125 #endif 5126 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5127 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5128 hn_trust_hcsum_sysctl, "I", 5129 "Trust tcp segment verification on host side, " 5130 "when csum info is missing"); 5131 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5133 
hn_trust_hcsum_sysctl, "I", 5134 "Trust udp datagram verification on host side, " 5135 "when csum info is missing"); 5136 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5137 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5138 hn_trust_hcsum_sysctl, "I", 5139 "Trust ip packet verification on host side, " 5140 "when csum info is missing"); 5141 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5142 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5143 __offsetof(struct hn_rx_ring, hn_csum_ip), 5144 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5145 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5146 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5147 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5148 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5149 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5150 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5151 __offsetof(struct hn_rx_ring, hn_csum_udp), 5152 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5153 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5154 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5155 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5156 hn_rx_stat_ulong_sysctl, "LU", 5157 "# of packets that we trust host's csum verification"); 5158 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5159 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5160 __offsetof(struct hn_rx_ring, hn_small_pkts), 5161 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5162 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5163 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5164 __offsetof(struct hn_rx_ring, hn_ack_failed), 5165 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5166 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5167 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5168 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5169 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5170 5171 return (0); 5172 } 5173 5174 static void 5175 hn_destroy_rx_data(struct hn_softc *sc) 5176 { 5177 int i; 5178 5179 if (sc->hn_rxbuf != NULL) { 5180 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5181 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5182 else 5183 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5184 sc->hn_rxbuf = NULL; 5185 } 5186 5187 if (sc->hn_rx_ring_cnt == 0) 5188 return; 5189 5190 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5191 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5192 5193 if (rxr->hn_br == NULL) 5194 continue; 5195 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5196 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5197 } else { 5198 device_printf(sc->hn_dev, 5199 "%dth channel bufring is referenced", i); 5200 } 5201 rxr->hn_br = NULL; 5202 5203 #if defined(INET) || defined(INET6) 5204 tcp_lro_free(&rxr->hn_lro); 5205 #endif 5206 free(rxr->hn_pktbuf, M_DEVBUF); 5207 } 5208 free(sc->hn_rx_ring, M_DEVBUF); 5209 sc->hn_rx_ring = NULL; 5210 5211 sc->hn_rx_ring_cnt = 0; 5212 sc->hn_rx_ring_inuse = 0; 5213 } 5214 5215 static int 5216 hn_tx_ring_create(struct hn_softc *sc, int id) 5217 { 5218 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5219 device_t dev = sc->hn_dev; 5220 bus_dma_tag_t parent_dtag; 5221 int error, i; 5222 5223 txr->hn_sc = sc; 5224 txr->hn_tx_idx = id; 5225 5226 #ifndef HN_USE_TXDESC_BUFRING 5227 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5228 #endif 5229 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 
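/* Pre-allocate all TX descriptors for this ring up front; they are recycled through either an SLIST or a buf_ring below, depending on HN_USE_TXDESC_BUFRING. */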
5230 5231 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5232 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5233 M_DEVBUF, M_WAITOK | M_ZERO); 5234 #ifndef HN_USE_TXDESC_BUFRING 5235 SLIST_INIT(&txr->hn_txlist); 5236 #else 5237 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5238 M_WAITOK, &txr->hn_tx_lock); 5239 #endif 5240 5241 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5242 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5243 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5244 } else { 5245 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5246 } 5247 5248 #ifdef HN_IFSTART_SUPPORT 5249 if (hn_use_if_start) { 5250 txr->hn_txeof = hn_start_txeof; 5251 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5252 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5253 } else 5254 #endif 5255 { 5256 int br_depth; 5257 5258 txr->hn_txeof = hn_xmit_txeof; 5259 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5260 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5261 5262 br_depth = hn_get_txswq_depth(txr); 5263 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5264 M_WAITOK, &txr->hn_tx_lock); 5265 } 5266 5267 txr->hn_direct_tx_size = hn_direct_tx_size; 5268 5269 /* 5270 * Always schedule transmission instead of trying to do direct 5271 * transmission. This one gives the best performance so far. 5272 */ 5273 txr->hn_sched_tx = 1; 5274 5275 parent_dtag = bus_get_dma_tag(dev); 5276 5277 /* DMA tag for RNDIS packet messages. */ 5278 error = bus_dma_tag_create(parent_dtag, /* parent */ 5279 HN_RNDIS_PKT_ALIGN, /* alignment */ 5280 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5281 BUS_SPACE_MAXADDR, /* lowaddr */ 5282 BUS_SPACE_MAXADDR, /* highaddr */ 5283 NULL, NULL, /* filter, filterarg */ 5284 HN_RNDIS_PKT_LEN, /* maxsize */ 5285 1, /* nsegments */ 5286 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5287 0, /* flags */ 5288 NULL, /* lockfunc */ 5289 NULL, /* lockfuncarg */ 5290 &txr->hn_tx_rndis_dtag); 5291 if (error) { 5292 device_printf(dev, "failed to create rndis dmatag\n"); 5293 return error; 5294 } 5295 5296 /* DMA tag for data. */ 5297 error = bus_dma_tag_create(parent_dtag, /* parent */ 5298 1, /* alignment */ 5299 HN_TX_DATA_BOUNDARY, /* boundary */ 5300 BUS_SPACE_MAXADDR, /* lowaddr */ 5301 BUS_SPACE_MAXADDR, /* highaddr */ 5302 NULL, NULL, /* filter, filterarg */ 5303 HN_TX_DATA_MAXSIZE, /* maxsize */ 5304 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5305 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5306 0, /* flags */ 5307 NULL, /* lockfunc */ 5308 NULL, /* lockfuncarg */ 5309 &txr->hn_tx_data_dtag); 5310 if (error) { 5311 device_printf(dev, "failed to create data dmatag\n"); 5312 return error; 5313 } 5314 5315 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5316 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5317 5318 txd->txr = txr; 5319 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5320 STAILQ_INIT(&txd->agg_list); 5321 5322 /* 5323 * Allocate and load RNDIS packet message. 
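* Each txd gets its own DMA-mapped buffer here, so nothing needs to be allocated on the transmit path.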
5324 */ 5325 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5326 (void **)&txd->rndis_pkt, 5327 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5328 &txd->rndis_pkt_dmap); 5329 if (error) { 5330 device_printf(dev, 5331 "failed to allocate rndis_packet_msg, %d\n", i); 5332 return error; 5333 } 5334 5335 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5336 txd->rndis_pkt_dmap, 5337 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5338 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5339 BUS_DMA_NOWAIT); 5340 if (error) { 5341 device_printf(dev, 5342 "failed to load rndis_packet_msg, %d\n", i); 5343 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5344 txd->rndis_pkt, txd->rndis_pkt_dmap); 5345 return error; 5346 } 5347 5348 /* DMA map for TX data. */ 5349 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5350 &txd->data_dmap); 5351 if (error) { 5352 device_printf(dev, 5353 "failed to allocate tx data dmamap\n"); 5354 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5355 txd->rndis_pkt_dmap); 5356 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5357 txd->rndis_pkt, txd->rndis_pkt_dmap); 5358 return error; 5359 } 5360 5361 /* All set, put it to list */ 5362 txd->flags |= HN_TXD_FLAG_ONLIST; 5363 #ifndef HN_USE_TXDESC_BUFRING 5364 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5365 #else 5366 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5367 #endif 5368 } 5369 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5370 5371 if (sc->hn_tx_sysctl_tree != NULL) { 5372 struct sysctl_oid_list *child; 5373 struct sysctl_ctx_list *ctx; 5374 char name[16]; 5375 5376 /* 5377 * Create per TX ring sysctl tree: 5378 * dev.hn.UNIT.tx.RINGID 5379 */ 5380 ctx = device_get_sysctl_ctx(dev); 5381 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5382 5383 snprintf(name, sizeof(name), "%d", id); 5384 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5385 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5386 5387 if (txr->hn_tx_sysctl_tree != NULL) { 5388 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5389 5390 #ifdef HN_DEBUG 5391 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5392 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5393 "# of available TX descs"); 5394 #endif 5395 #ifdef HN_IFSTART_SUPPORT 5396 if (!hn_use_if_start) 5397 #endif 5398 { 5399 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5400 CTLFLAG_RD, &txr->hn_oactive, 0, 5401 "over active"); 5402 } 5403 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5404 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5405 "# of packets transmitted"); 5406 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5407 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5408 "# of sends"); 5409 } 5410 } 5411 5412 return 0; 5413 } 5414 5415 static void 5416 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5417 { 5418 struct hn_tx_ring *txr = txd->txr; 5419 5420 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5421 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5422 5423 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5424 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5425 txd->rndis_pkt_dmap); 5426 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5427 } 5428 5429 static void 5430 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5431 { 5432 5433 KASSERT(txd->refs == 0 || txd->refs == 1, 5434 ("invalid txd refs %d", txd->refs)); 5435 5436 /* Aggregated txds will be freed by their aggregating txd. 
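Only a txd that still holds a reference and is not on an aggregation list is put back here.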
*/ 5437 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5438 int freed __diagused; 5439 5440 freed = hn_txdesc_put(txr, txd); 5441 KASSERT(freed, ("can't free txdesc")); 5442 } 5443 } 5444 5445 static void 5446 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5447 { 5448 int i; 5449 5450 if (txr->hn_txdesc == NULL) 5451 return; 5452 5453 /* 5454 * NOTE: 5455 * Because the freeing of aggregated txds will be deferred 5456 * to the aggregating txd, two passes are used here: 5457 * - The first pass GCes any pending txds. This GC is necessary, 5458 * since if the channels are revoked, hypervisor will not 5459 * deliver send-done for all pending txds. 5460 * - The second pass frees the busdma stuffs, i.e. after all txds 5461 * were freed. 5462 */ 5463 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5464 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5465 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5466 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5467 5468 if (txr->hn_tx_data_dtag != NULL) 5469 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5470 if (txr->hn_tx_rndis_dtag != NULL) 5471 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5472 5473 #ifdef HN_USE_TXDESC_BUFRING 5474 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5475 #endif 5476 5477 free(txr->hn_txdesc, M_DEVBUF); 5478 txr->hn_txdesc = NULL; 5479 5480 if (txr->hn_mbuf_br != NULL) 5481 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5482 5483 #ifndef HN_USE_TXDESC_BUFRING 5484 mtx_destroy(&txr->hn_txlist_spin); 5485 #endif 5486 mtx_destroy(&txr->hn_tx_lock); 5487 } 5488 5489 static int 5490 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5491 { 5492 struct sysctl_oid_list *child; 5493 struct sysctl_ctx_list *ctx; 5494 int i; 5495 5496 /* 5497 * Create TXBUF for chimney sending. 5498 * 5499 * NOTE: It is shared by all channels. 
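* Each chimney send copies packet data into a section of this buffer.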
5500 */ 5501 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5502 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5503 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5504 if (sc->hn_chim == NULL) { 5505 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5506 return (ENOMEM); 5507 } 5508 5509 sc->hn_tx_ring_cnt = ring_cnt; 5510 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5511 5512 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5513 M_DEVBUF, M_WAITOK | M_ZERO); 5514 5515 ctx = device_get_sysctl_ctx(sc->hn_dev); 5516 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5517 5518 /* Create dev.hn.UNIT.tx sysctl tree */ 5519 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5520 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5521 5522 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5523 int error; 5524 5525 error = hn_tx_ring_create(sc, i); 5526 if (error) 5527 return error; 5528 } 5529 5530 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5531 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5532 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5533 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5534 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5535 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5536 __offsetof(struct hn_tx_ring, hn_send_failed), 5537 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5538 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5539 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5540 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5541 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5542 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5543 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5544 __offsetof(struct hn_tx_ring, hn_flush_failed), 5545 hn_tx_stat_ulong_sysctl, "LU", 5546 "# of packet transmission aggregation flush failure"); 5547 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5548 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5549 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5550 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5551 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5552 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5553 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5554 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5555 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5556 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5557 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5558 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5559 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5560 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5561 "# of total TX descs"); 5562 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5563 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5564 "Chimney send packet size upper boundary"); 5565 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5566 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5567 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5568 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5569 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5570 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5571 hn_tx_conf_int_sysctl, "I", 5572 "Size of the packet for direct transmission"); 5573 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5574 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5575 __offsetof(struct hn_tx_ring, 
hn_sched_tx), 5576 hn_tx_conf_int_sysctl, "I", 5577 "Always schedule transmission " 5578 "instead of doing direct transmission"); 5579 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5580 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5581 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5582 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5583 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5584 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5585 "Applied packet transmission aggregation size"); 5586 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5587 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5588 hn_txagg_pktmax_sysctl, "I", 5589 "Applied packet transmission aggregation packets"); 5590 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5591 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5592 hn_txagg_align_sysctl, "I", 5593 "Applied packet transmission aggregation alignment"); 5594 5595 return 0; 5596 } 5597 5598 static void 5599 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5600 { 5601 int i; 5602 5603 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5604 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5605 } 5606 5607 static void 5608 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5609 { 5610 struct ifnet *ifp = sc->hn_ifp; 5611 u_int hw_tsomax; 5612 int tso_minlen; 5613 5614 HN_LOCK_ASSERT(sc); 5615 5616 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5617 return; 5618 5619 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5620 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5621 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5622 5623 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5624 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5625 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5626 5627 if (tso_maxlen < tso_minlen) 5628 tso_maxlen = tso_minlen; 5629 else if (tso_maxlen > IP_MAXPACKET) 5630 tso_maxlen = IP_MAXPACKET; 5631 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5632 tso_maxlen = sc->hn_ndis_tso_szmax; 5633 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5634 5635 if (hn_xpnt_vf_isready(sc)) { 5636 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5637 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5638 } 5639 ifp->if_hw_tsomax = hw_tsomax; 5640 if (bootverbose) 5641 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5642 } 5643 5644 static void 5645 hn_fixup_tx_data(struct hn_softc *sc) 5646 { 5647 uint64_t csum_assist; 5648 int i; 5649 5650 hn_set_chim_size(sc, sc->hn_chim_szmax); 5651 if (hn_tx_chimney_size > 0 && 5652 hn_tx_chimney_size < sc->hn_chim_szmax) 5653 hn_set_chim_size(sc, hn_tx_chimney_size); 5654 5655 csum_assist = 0; 5656 if (sc->hn_caps & HN_CAP_IPCS) 5657 csum_assist |= CSUM_IP; 5658 if (sc->hn_caps & HN_CAP_TCP4CS) 5659 csum_assist |= CSUM_IP_TCP; 5660 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5661 csum_assist |= CSUM_IP_UDP; 5662 if (sc->hn_caps & HN_CAP_TCP6CS) 5663 csum_assist |= CSUM_IP6_TCP; 5664 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5665 csum_assist |= CSUM_IP6_UDP; 5666 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5667 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5668 5669 if (sc->hn_caps & HN_CAP_HASHVAL) { 5670 /* 5671 * Support HASHVAL pktinfo on TX path. 
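* i.e. the mbuf flowid is passed to the host with each transmitted packet.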
5672 */ 5673 if (bootverbose) 5674 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5675 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5676 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5677 } 5678 } 5679 5680 static void 5681 hn_fixup_rx_data(struct hn_softc *sc) 5682 { 5683 5684 if (sc->hn_caps & HN_CAP_UDPHASH) { 5685 int i; 5686 5687 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5688 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5689 } 5690 } 5691 5692 static void 5693 hn_destroy_tx_data(struct hn_softc *sc) 5694 { 5695 int i; 5696 5697 if (sc->hn_chim != NULL) { 5698 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5699 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5700 } else { 5701 device_printf(sc->hn_dev, 5702 "chimney sending buffer is referenced"); 5703 } 5704 sc->hn_chim = NULL; 5705 } 5706 5707 if (sc->hn_tx_ring_cnt == 0) 5708 return; 5709 5710 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5711 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5712 5713 free(sc->hn_tx_ring, M_DEVBUF); 5714 sc->hn_tx_ring = NULL; 5715 5716 sc->hn_tx_ring_cnt = 0; 5717 sc->hn_tx_ring_inuse = 0; 5718 } 5719 5720 #ifdef HN_IFSTART_SUPPORT 5721 5722 static void 5723 hn_start_taskfunc(void *xtxr, int pending __unused) 5724 { 5725 struct hn_tx_ring *txr = xtxr; 5726 5727 mtx_lock(&txr->hn_tx_lock); 5728 hn_start_locked(txr, 0); 5729 mtx_unlock(&txr->hn_tx_lock); 5730 } 5731 5732 static int 5733 hn_start_locked(struct hn_tx_ring *txr, int len) 5734 { 5735 struct hn_softc *sc = txr->hn_sc; 5736 struct ifnet *ifp = sc->hn_ifp; 5737 int sched = 0; 5738 5739 KASSERT(hn_use_if_start, 5740 ("hn_start_locked is called, when if_start is disabled")); 5741 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5742 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5743 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5744 5745 if (__predict_false(txr->hn_suspended)) 5746 return (0); 5747 5748 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5749 IFF_DRV_RUNNING) 5750 return (0); 5751 5752 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5753 struct hn_txdesc *txd; 5754 struct mbuf *m_head; 5755 int error; 5756 5757 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5758 if (m_head == NULL) 5759 break; 5760 5761 if (len > 0 && m_head->m_pkthdr.len > len) { 5762 /* 5763 * This sending could be time consuming; let callers 5764 * dispatch this packet sending (and sending of any 5765 * following up packets) to tx taskqueue. 
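* The mbuf is put back at the head of if_snd and the caller is told to reschedule.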
5766 */ 5767 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5768 sched = 1; 5769 break; 5770 } 5771 5772 #if defined(INET6) || defined(INET) 5773 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5774 m_head = hn_tso_fixup(m_head); 5775 if (__predict_false(m_head == NULL)) { 5776 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5777 continue; 5778 } 5779 } else if (m_head->m_pkthdr.csum_flags & 5780 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5781 m_head = hn_set_hlen(m_head); 5782 if (__predict_false(m_head == NULL)) { 5783 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5784 continue; 5785 } 5786 } 5787 #endif 5788 5789 txd = hn_txdesc_get(txr); 5790 if (txd == NULL) { 5791 txr->hn_no_txdescs++; 5792 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5793 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5794 break; 5795 } 5796 5797 error = hn_encap(ifp, txr, txd, &m_head); 5798 if (error) { 5799 /* Both txd and m_head are freed */ 5800 KASSERT(txr->hn_agg_txd == NULL, 5801 ("encap failed w/ pending aggregating txdesc")); 5802 continue; 5803 } 5804 5805 if (txr->hn_agg_pktleft == 0) { 5806 if (txr->hn_agg_txd != NULL) { 5807 KASSERT(m_head == NULL, 5808 ("pending mbuf for aggregating txdesc")); 5809 error = hn_flush_txagg(ifp, txr); 5810 if (__predict_false(error)) { 5811 atomic_set_int(&ifp->if_drv_flags, 5812 IFF_DRV_OACTIVE); 5813 break; 5814 } 5815 } else { 5816 KASSERT(m_head != NULL, ("mbuf was freed")); 5817 error = hn_txpkt(ifp, txr, txd); 5818 if (__predict_false(error)) { 5819 /* txd is freed, but m_head is not */ 5820 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5821 atomic_set_int(&ifp->if_drv_flags, 5822 IFF_DRV_OACTIVE); 5823 break; 5824 } 5825 } 5826 } 5827 #ifdef INVARIANTS 5828 else { 5829 KASSERT(txr->hn_agg_txd != NULL, 5830 ("no aggregating txdesc")); 5831 KASSERT(m_head == NULL, 5832 ("pending mbuf for aggregating txdesc")); 5833 } 5834 #endif 5835 } 5836 5837 /* Flush pending aggerated transmission. */ 5838 if (txr->hn_agg_txd != NULL) 5839 hn_flush_txagg(ifp, txr); 5840 return (sched); 5841 } 5842 5843 static void 5844 hn_start(struct ifnet *ifp) 5845 { 5846 struct hn_softc *sc = ifp->if_softc; 5847 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5848 5849 if (txr->hn_sched_tx) 5850 goto do_sched; 5851 5852 if (mtx_trylock(&txr->hn_tx_lock)) { 5853 int sched; 5854 5855 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5856 mtx_unlock(&txr->hn_tx_lock); 5857 if (!sched) 5858 return; 5859 } 5860 do_sched: 5861 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5862 } 5863 5864 static void 5865 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5866 { 5867 struct hn_tx_ring *txr = xtxr; 5868 5869 mtx_lock(&txr->hn_tx_lock); 5870 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5871 hn_start_locked(txr, 0); 5872 mtx_unlock(&txr->hn_tx_lock); 5873 } 5874 5875 static void 5876 hn_start_txeof(struct hn_tx_ring *txr) 5877 { 5878 struct hn_softc *sc = txr->hn_sc; 5879 struct ifnet *ifp = sc->hn_ifp; 5880 5881 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5882 5883 if (txr->hn_sched_tx) 5884 goto do_sched; 5885 5886 if (mtx_trylock(&txr->hn_tx_lock)) { 5887 int sched; 5888 5889 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5890 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5891 mtx_unlock(&txr->hn_tx_lock); 5892 if (sched) { 5893 taskqueue_enqueue(txr->hn_tx_taskq, 5894 &txr->hn_tx_task); 5895 } 5896 } else { 5897 do_sched: 5898 /* 5899 * Release the OACTIVE earlier, with the hope, that 5900 * others could catch up. 
The task will clear the 5901 * flag again with the hn_tx_lock to avoid possible 5902 * races. 5903 */ 5904 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5905 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5906 } 5907 } 5908 5909 #endif /* HN_IFSTART_SUPPORT */ 5910 5911 static int 5912 hn_xmit(struct hn_tx_ring *txr, int len) 5913 { 5914 struct hn_softc *sc = txr->hn_sc; 5915 struct ifnet *ifp = sc->hn_ifp; 5916 struct mbuf *m_head; 5917 int sched = 0; 5918 5919 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5920 #ifdef HN_IFSTART_SUPPORT 5921 KASSERT(hn_use_if_start == 0, 5922 ("hn_xmit is called, when if_start is enabled")); 5923 #endif 5924 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5925 5926 if (__predict_false(txr->hn_suspended)) 5927 return (0); 5928 5929 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5930 return (0); 5931 5932 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5933 struct hn_txdesc *txd; 5934 int error; 5935 5936 if (len > 0 && m_head->m_pkthdr.len > len) { 5937 /* 5938 * This sending could be time consuming; let callers 5939 * dispatch this packet sending (and sending of any 5940 * following up packets) to tx taskqueue. 5941 */ 5942 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5943 sched = 1; 5944 break; 5945 } 5946 5947 txd = hn_txdesc_get(txr); 5948 if (txd == NULL) { 5949 txr->hn_no_txdescs++; 5950 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5951 txr->hn_oactive = 1; 5952 break; 5953 } 5954 5955 error = hn_encap(ifp, txr, txd, &m_head); 5956 if (error) { 5957 /* Both txd and m_head are freed; discard */ 5958 KASSERT(txr->hn_agg_txd == NULL, 5959 ("encap failed w/ pending aggregating txdesc")); 5960 drbr_advance(ifp, txr->hn_mbuf_br); 5961 continue; 5962 } 5963 5964 if (txr->hn_agg_pktleft == 0) { 5965 if (txr->hn_agg_txd != NULL) { 5966 KASSERT(m_head == NULL, 5967 ("pending mbuf for aggregating txdesc")); 5968 error = hn_flush_txagg(ifp, txr); 5969 if (__predict_false(error)) { 5970 txr->hn_oactive = 1; 5971 break; 5972 } 5973 } else { 5974 KASSERT(m_head != NULL, ("mbuf was freed")); 5975 error = hn_txpkt(ifp, txr, txd); 5976 if (__predict_false(error)) { 5977 /* txd is freed, but m_head is not */ 5978 drbr_putback(ifp, txr->hn_mbuf_br, 5979 m_head); 5980 txr->hn_oactive = 1; 5981 break; 5982 } 5983 } 5984 } 5985 #ifdef INVARIANTS 5986 else { 5987 KASSERT(txr->hn_agg_txd != NULL, 5988 ("no aggregating txdesc")); 5989 KASSERT(m_head == NULL, 5990 ("pending mbuf for aggregating txdesc")); 5991 } 5992 #endif 5993 5994 /* Sent */ 5995 drbr_advance(ifp, txr->hn_mbuf_br); 5996 } 5997 5998 /* Flush pending aggerated transmission. */ 5999 if (txr->hn_agg_txd != NULL) 6000 hn_flush_txagg(ifp, txr); 6001 return (sched); 6002 } 6003 6004 static int 6005 hn_transmit(struct ifnet *ifp, struct mbuf *m) 6006 { 6007 struct hn_softc *sc = ifp->if_softc; 6008 struct hn_tx_ring *txr; 6009 int error, idx = 0; 6010 6011 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6012 struct rm_priotracker pt; 6013 6014 rm_rlock(&sc->hn_vf_lock, &pt); 6015 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6016 struct mbuf *m_bpf = NULL; 6017 int obytes, omcast; 6018 6019 obytes = m->m_pkthdr.len; 6020 omcast = (m->m_flags & M_MCAST) != 0; 6021 6022 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6023 if (bpf_peers_present(ifp->if_bpf)) { 6024 m_bpf = m_copypacket(m, M_NOWAIT); 6025 if (m_bpf == NULL) { 6026 /* 6027 * Failed to grab a shallow 6028 * copy; tap now. 
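	 * Without the copy the tap can not be deferred
	 * until the VF transmit succeeds, so tap
	 * unconditionally here.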
6029 */ 6030 ETHER_BPF_MTAP(ifp, m); 6031 } 6032 } 6033 } else { 6034 ETHER_BPF_MTAP(ifp, m); 6035 } 6036 6037 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6038 rm_runlock(&sc->hn_vf_lock, &pt); 6039 6040 if (m_bpf != NULL) { 6041 if (!error) 6042 ETHER_BPF_MTAP(ifp, m_bpf); 6043 m_freem(m_bpf); 6044 } 6045 6046 if (error == ENOBUFS) { 6047 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6048 } else if (error) { 6049 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6050 } else { 6051 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6052 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6053 if (omcast) { 6054 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6055 omcast); 6056 } 6057 } 6058 return (error); 6059 } 6060 rm_runlock(&sc->hn_vf_lock, &pt); 6061 } 6062 6063 #if defined(INET6) || defined(INET) 6064 /* 6065 * Perform TSO packet header fixup or get l2/l3 header length now, 6066 * since packet headers should be cache-hot. 6067 */ 6068 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6069 m = hn_tso_fixup(m); 6070 if (__predict_false(m == NULL)) { 6071 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6072 return EIO; 6073 } 6074 } else if (m->m_pkthdr.csum_flags & 6075 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6076 m = hn_set_hlen(m); 6077 if (__predict_false(m == NULL)) { 6078 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6079 return EIO; 6080 } 6081 } 6082 #endif 6083 6084 /* 6085 * Select the TX ring based on flowid 6086 */ 6087 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6088 #ifdef RSS 6089 uint32_t bid; 6090 6091 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6092 &bid) == 0) 6093 idx = bid % sc->hn_tx_ring_inuse; 6094 else 6095 #endif 6096 { 6097 #if defined(INET6) || defined(INET) 6098 int tcpsyn = 0; 6099 6100 if (m->m_pkthdr.len < 128 && 6101 (m->m_pkthdr.csum_flags & 6102 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6103 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6104 m = hn_check_tcpsyn(m, &tcpsyn); 6105 if (__predict_false(m == NULL)) { 6106 if_inc_counter(ifp, 6107 IFCOUNTER_OERRORS, 1); 6108 return (EIO); 6109 } 6110 } 6111 #else 6112 const int tcpsyn = 0; 6113 #endif 6114 if (tcpsyn) 6115 idx = 0; 6116 else 6117 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6118 } 6119 } 6120 txr = &sc->hn_tx_ring[idx]; 6121 6122 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6123 if (error) { 6124 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6125 return error; 6126 } 6127 6128 if (txr->hn_oactive) 6129 return 0; 6130 6131 if (txr->hn_sched_tx) 6132 goto do_sched; 6133 6134 if (mtx_trylock(&txr->hn_tx_lock)) { 6135 int sched; 6136 6137 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6138 mtx_unlock(&txr->hn_tx_lock); 6139 if (!sched) 6140 return 0; 6141 } 6142 do_sched: 6143 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6144 return 0; 6145 } 6146 6147 static void 6148 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6149 { 6150 struct mbuf *m; 6151 6152 mtx_lock(&txr->hn_tx_lock); 6153 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6154 m_freem(m); 6155 mtx_unlock(&txr->hn_tx_lock); 6156 } 6157 6158 static void 6159 hn_xmit_qflush(struct ifnet *ifp) 6160 { 6161 struct hn_softc *sc = ifp->if_softc; 6162 struct rm_priotracker pt; 6163 int i; 6164 6165 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6166 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6167 if_qflush(ifp); 6168 6169 rm_rlock(&sc->hn_vf_lock, &pt); 6170 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6171 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6172 rm_runlock(&sc->hn_vf_lock, &pt); 6173 } 6174 6175 static void 6176 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6177 { 6178 6179 if (txr->hn_sched_tx) 6180 goto do_sched; 6181 6182 if (mtx_trylock(&txr->hn_tx_lock)) { 6183 int sched; 6184 6185 txr->hn_oactive = 0; 6186 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6187 mtx_unlock(&txr->hn_tx_lock); 6188 if (sched) { 6189 taskqueue_enqueue(txr->hn_tx_taskq, 6190 &txr->hn_tx_task); 6191 } 6192 } else { 6193 do_sched: 6194 /* 6195 * Release the oactive earlier, with the hope, that 6196 * others could catch up. The task will clear the 6197 * oactive again with the hn_tx_lock to avoid possible 6198 * races. 6199 */ 6200 txr->hn_oactive = 0; 6201 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6202 } 6203 } 6204 6205 static void 6206 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6207 { 6208 struct hn_tx_ring *txr = xtxr; 6209 6210 mtx_lock(&txr->hn_tx_lock); 6211 hn_xmit(txr, 0); 6212 mtx_unlock(&txr->hn_tx_lock); 6213 } 6214 6215 static void 6216 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6217 { 6218 struct hn_tx_ring *txr = xtxr; 6219 6220 mtx_lock(&txr->hn_tx_lock); 6221 txr->hn_oactive = 0; 6222 hn_xmit(txr, 0); 6223 mtx_unlock(&txr->hn_tx_lock); 6224 } 6225 6226 static int 6227 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6228 { 6229 struct vmbus_chan_br cbr; 6230 struct hn_rx_ring *rxr; 6231 struct hn_tx_ring *txr = NULL; 6232 int idx, error; 6233 6234 idx = vmbus_chan_subidx(chan); 6235 6236 /* 6237 * Link this channel to RX/TX ring. 6238 */ 6239 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6240 ("invalid channel index %d, should > 0 && < %d", 6241 idx, sc->hn_rx_ring_inuse)); 6242 rxr = &sc->hn_rx_ring[idx]; 6243 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6244 ("RX ring %d already attached", idx)); 6245 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6246 rxr->hn_chan = chan; 6247 6248 if (bootverbose) { 6249 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6250 idx, vmbus_chan_id(chan)); 6251 } 6252 6253 if (idx < sc->hn_tx_ring_inuse) { 6254 txr = &sc->hn_tx_ring[idx]; 6255 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6256 ("TX ring %d already attached", idx)); 6257 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6258 6259 txr->hn_chan = chan; 6260 if (bootverbose) { 6261 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6262 idx, vmbus_chan_id(chan)); 6263 } 6264 } 6265 6266 /* Bind this channel to a proper CPU. */ 6267 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6268 6269 /* 6270 * Open this channel 6271 */ 6272 cbr.cbr = rxr->hn_br; 6273 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6274 cbr.cbr_txsz = HN_TXBR_SIZE; 6275 cbr.cbr_rxsz = HN_RXBR_SIZE; 6276 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6277 if (error) { 6278 if (error == EISCONN) { 6279 if_printf(sc->hn_ifp, "bufring is connected after " 6280 "chan%u open failure\n", vmbus_chan_id(chan)); 6281 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6282 } else { 6283 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6284 vmbus_chan_id(chan), error); 6285 } 6286 } 6287 return (error); 6288 } 6289 6290 static void 6291 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6292 { 6293 struct hn_rx_ring *rxr; 6294 int idx, error; 6295 6296 idx = vmbus_chan_subidx(chan); 6297 6298 /* 6299 * Link this channel to RX/TX ring. 
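	 * (This is the detach path; the ATTACHED flags set by
	 *  hn_chan_attach() are cleared below.)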
6300 */ 6301 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6302 ("invalid channel index %d, should > 0 && < %d", 6303 idx, sc->hn_rx_ring_inuse)); 6304 rxr = &sc->hn_rx_ring[idx]; 6305 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6306 ("RX ring %d is not attached", idx)); 6307 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6308 6309 if (idx < sc->hn_tx_ring_inuse) { 6310 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6311 6312 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6313 ("TX ring %d is not attached attached", idx)); 6314 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6315 } 6316 6317 /* 6318 * Close this channel. 6319 * 6320 * NOTE: 6321 * Channel closing does _not_ destroy the target channel. 6322 */ 6323 error = vmbus_chan_close_direct(chan); 6324 if (error == EISCONN) { 6325 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6326 "after being closed\n", vmbus_chan_id(chan)); 6327 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6328 } else if (error) { 6329 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6330 vmbus_chan_id(chan), error); 6331 } 6332 } 6333 6334 static int 6335 hn_attach_subchans(struct hn_softc *sc) 6336 { 6337 struct vmbus_channel **subchans; 6338 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6339 int i, error = 0; 6340 6341 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6342 6343 /* Attach the sub-channels. */ 6344 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6345 for (i = 0; i < subchan_cnt; ++i) { 6346 int error1; 6347 6348 error1 = hn_chan_attach(sc, subchans[i]); 6349 if (error1) { 6350 error = error1; 6351 /* Move on; all channels will be detached later. */ 6352 } 6353 } 6354 vmbus_subchan_rel(subchans, subchan_cnt); 6355 6356 if (error) { 6357 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6358 } else { 6359 if (bootverbose) { 6360 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6361 subchan_cnt); 6362 } 6363 } 6364 return (error); 6365 } 6366 6367 static void 6368 hn_detach_allchans(struct hn_softc *sc) 6369 { 6370 struct vmbus_channel **subchans; 6371 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6372 int i; 6373 6374 if (subchan_cnt == 0) 6375 goto back; 6376 6377 /* Detach the sub-channels. */ 6378 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6379 for (i = 0; i < subchan_cnt; ++i) 6380 hn_chan_detach(sc, subchans[i]); 6381 vmbus_subchan_rel(subchans, subchan_cnt); 6382 6383 back: 6384 /* 6385 * Detach the primary channel, _after_ all sub-channels 6386 * are detached. 6387 */ 6388 hn_chan_detach(sc, sc->hn_prichan); 6389 6390 /* Wait for sub-channels to be destroyed, if any. */ 6391 vmbus_subchan_drain(sc->hn_prichan); 6392 6393 #ifdef INVARIANTS 6394 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6395 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6396 HN_RX_FLAG_ATTACHED) == 0, 6397 ("%dth RX ring is still attached", i)); 6398 } 6399 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6400 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6401 HN_TX_FLAG_ATTACHED) == 0, 6402 ("%dth TX ring is still attached", i)); 6403 } 6404 #endif 6405 } 6406 6407 static int 6408 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6409 { 6410 struct vmbus_channel **subchans; 6411 int nchan, rxr_cnt, error; 6412 6413 nchan = *nsubch + 1; 6414 if (nchan == 1) { 6415 /* 6416 * Multiple RX/TX rings are not requested. 6417 */ 6418 *nsubch = 0; 6419 return (0); 6420 } 6421 6422 /* 6423 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6424 * table entries. 
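	 * If the query fails, vRSS is treated as unavailable and only
	 * the primary channel is used.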
6425 */ 6426 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6427 if (error) { 6428 /* No RSS; this is benign. */ 6429 *nsubch = 0; 6430 return (0); 6431 } 6432 if (bootverbose) { 6433 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6434 rxr_cnt, nchan); 6435 } 6436 6437 if (nchan > rxr_cnt) 6438 nchan = rxr_cnt; 6439 if (nchan == 1) { 6440 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6441 *nsubch = 0; 6442 return (0); 6443 } 6444 6445 /* 6446 * Allocate sub-channels from NVS. 6447 */ 6448 *nsubch = nchan - 1; 6449 error = hn_nvs_alloc_subchans(sc, nsubch); 6450 if (error || *nsubch == 0) { 6451 /* Failed to allocate sub-channels. */ 6452 *nsubch = 0; 6453 return (0); 6454 } 6455 6456 /* 6457 * Wait for all sub-channels to become ready before moving on. 6458 */ 6459 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6460 vmbus_subchan_rel(subchans, *nsubch); 6461 return (0); 6462 } 6463 6464 static bool 6465 hn_synth_attachable(const struct hn_softc *sc) 6466 { 6467 int i; 6468 6469 if (sc->hn_flags & HN_FLAG_ERRORS) 6470 return (false); 6471 6472 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6473 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6474 6475 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6476 return (false); 6477 } 6478 return (true); 6479 } 6480 6481 /* 6482 * Make sure that the RX filter is zero after the successful 6483 * RNDIS initialization. 6484 * 6485 * NOTE: 6486 * Under certain conditions on certain versions of Hyper-V, 6487 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6488 * after the successful RNDIS initialization, which breaks 6489 * the assumption of any following code (well, it breaks the 6490 * RNDIS API contract actually). Clear the RNDIS rxfilter 6491 * explicitly, drain packets sneaking through, and drain the 6492 * interrupt taskqueues scheduled due to the stealth packets. 6493 */ 6494 static void 6495 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6496 { 6497 6498 hn_disable_rx(sc); 6499 hn_drain_rxtx(sc, nchan); 6500 } 6501 6502 static int 6503 hn_synth_attach(struct hn_softc *sc, int mtu) 6504 { 6505 #define ATTACHED_NVS 0x0002 6506 #define ATTACHED_RNDIS 0x0004 6507 6508 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6509 int error, nsubch, nchan = 1, i, rndis_inited; 6510 uint32_t old_caps, attached = 0; 6511 6512 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6513 ("synthetic parts were attached")); 6514 6515 if (!hn_synth_attachable(sc)) 6516 return (ENXIO); 6517 6518 /* Save capabilities for later verification. */ 6519 old_caps = sc->hn_caps; 6520 sc->hn_caps = 0; 6521 6522 /* Clear RSS stuffs. */ 6523 sc->hn_rss_ind_size = 0; 6524 sc->hn_rss_hash = 0; 6525 sc->hn_rss_hcap = 0; 6526 6527 /* 6528 * Attach the primary channel _before_ attaching NVS and RNDIS. 6529 */ 6530 error = hn_chan_attach(sc, sc->hn_prichan); 6531 if (error) 6532 goto failed; 6533 6534 /* 6535 * Attach NVS. 6536 */ 6537 error = hn_nvs_attach(sc, mtu); 6538 if (error) 6539 goto failed; 6540 attached |= ATTACHED_NVS; 6541 6542 /* 6543 * Attach RNDIS _after_ NVS is attached. 6544 */ 6545 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6546 if (rndis_inited) 6547 attached |= ATTACHED_RNDIS; 6548 if (error) 6549 goto failed; 6550 6551 /* 6552 * Make sure capabilities are not changed. 
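	 * This only matters once the device has completed its initial
	 * attach, i.e. when the synthetic parts are being re-attached:
	 * the capabilities already advertised to the network stack must
	 * remain valid.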
6553 */ 6554 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6555 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6556 old_caps, sc->hn_caps); 6557 error = ENXIO; 6558 goto failed; 6559 } 6560 6561 /* 6562 * Allocate sub-channels for multi-TX/RX rings. 6563 * 6564 * NOTE: 6565 * The # of RX rings that can be used is equivalent to the # of 6566 * channels to be requested. 6567 */ 6568 nsubch = sc->hn_rx_ring_cnt - 1; 6569 error = hn_synth_alloc_subchans(sc, &nsubch); 6570 if (error) 6571 goto failed; 6572 /* NOTE: _Full_ synthetic parts detach is required now. */ 6573 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6574 6575 /* 6576 * Set the # of TX/RX rings that could be used according to 6577 * the # of channels that NVS offered. 6578 */ 6579 nchan = nsubch + 1; 6580 hn_set_ring_inuse(sc, nchan); 6581 if (nchan == 1) { 6582 /* Only the primary channel can be used; done */ 6583 goto back; 6584 } 6585 6586 /* 6587 * Attach the sub-channels. 6588 * 6589 * NOTE: hn_set_ring_inuse() _must_ have been called. 6590 */ 6591 error = hn_attach_subchans(sc); 6592 if (error) 6593 goto failed; 6594 6595 /* 6596 * Configure RSS key and indirect table _after_ all sub-channels 6597 * are attached. 6598 */ 6599 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6600 /* 6601 * RSS key is not set yet; set it to the default RSS key. 6602 */ 6603 if (bootverbose) 6604 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6605 #ifdef RSS 6606 rss_getkey(rss->rss_key); 6607 #else 6608 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6609 #endif 6610 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6611 } 6612 6613 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6614 /* 6615 * RSS indirect table is not set yet; set it up in round- 6616 * robin fashion. 6617 */ 6618 if (bootverbose) { 6619 if_printf(sc->hn_ifp, "setup default RSS indirect " 6620 "table\n"); 6621 } 6622 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6623 uint32_t subidx; 6624 6625 #ifdef RSS 6626 subidx = rss_get_indirection_to_bucket(i); 6627 #else 6628 subidx = i; 6629 #endif 6630 rss->rss_ind[i] = subidx % nchan; 6631 } 6632 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6633 } else { 6634 /* 6635 * # of usable channels may be changed, so we have to 6636 * make sure that all entries in RSS indirect table 6637 * are valid. 6638 * 6639 * NOTE: hn_set_ring_inuse() _must_ have been called. 6640 */ 6641 hn_rss_ind_fixup(sc); 6642 } 6643 6644 sc->hn_rss_hash = sc->hn_rss_hcap; 6645 if ((sc->hn_flags & HN_FLAG_RXVF) || 6646 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6647 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6648 hn_vf_rss_fixup(sc, false); 6649 } 6650 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6651 if (error) 6652 goto failed; 6653 back: 6654 /* 6655 * Fixup transmission aggregation setup. 6656 */ 6657 hn_set_txagg(sc); 6658 hn_rndis_init_fixat(sc, nchan); 6659 return (0); 6660 6661 failed: 6662 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6663 hn_rndis_init_fixat(sc, nchan); 6664 hn_synth_detach(sc); 6665 } else { 6666 if (attached & ATTACHED_RNDIS) { 6667 hn_rndis_init_fixat(sc, nchan); 6668 hn_rndis_detach(sc); 6669 } 6670 if (attached & ATTACHED_NVS) 6671 hn_nvs_detach(sc); 6672 hn_chan_detach(sc, sc->hn_prichan); 6673 /* Restore old capabilities. */ 6674 sc->hn_caps = old_caps; 6675 } 6676 return (error); 6677 6678 #undef ATTACHED_RNDIS 6679 #undef ATTACHED_NVS 6680 } 6681 6682 /* 6683 * NOTE: 6684 * The interface must have been suspended though hn_suspend(), before 6685 * this function get called. 
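 * hn_suspend() quiesces both the data and the management paths, so
 * nothing should still be using the channels while they are torn
 * down below.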
6686 */ 6687 static void 6688 hn_synth_detach(struct hn_softc *sc) 6689 { 6690 6691 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6692 ("synthetic parts were not attached")); 6693 6694 /* Detach the RNDIS first. */ 6695 hn_rndis_detach(sc); 6696 6697 /* Detach NVS. */ 6698 hn_nvs_detach(sc); 6699 6700 /* Detach all of the channels. */ 6701 hn_detach_allchans(sc); 6702 6703 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6704 /* 6705 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6706 */ 6707 int error; 6708 6709 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6710 sc->hn_rxbuf_gpadl); 6711 if (error) { 6712 if_printf(sc->hn_ifp, 6713 "rxbuf gpadl disconn failed: %d\n", error); 6714 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6715 } 6716 sc->hn_rxbuf_gpadl = 0; 6717 } 6718 6719 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6720 /* 6721 * Host is post-Win2016, disconnect chimney sending buffer from 6722 * primary channel here. 6723 */ 6724 int error; 6725 6726 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6727 sc->hn_chim_gpadl); 6728 if (error) { 6729 if_printf(sc->hn_ifp, 6730 "chim gpadl disconn failed: %d\n", error); 6731 sc->hn_flags |= HN_FLAG_CHIM_REF; 6732 } 6733 sc->hn_chim_gpadl = 0; 6734 } 6735 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6736 } 6737 6738 static void 6739 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6740 { 6741 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6742 ("invalid ring count %d", ring_cnt)); 6743 6744 if (sc->hn_tx_ring_cnt > ring_cnt) 6745 sc->hn_tx_ring_inuse = ring_cnt; 6746 else 6747 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6748 sc->hn_rx_ring_inuse = ring_cnt; 6749 6750 #ifdef RSS 6751 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6752 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6753 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6754 rss_getnumbuckets()); 6755 } 6756 #endif 6757 6758 if (bootverbose) { 6759 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6760 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6761 } 6762 } 6763 6764 static void 6765 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6766 { 6767 6768 /* 6769 * NOTE: 6770 * The TX bufring will not be drained by the hypervisor, 6771 * if the primary channel is revoked. 6772 */ 6773 while (!vmbus_chan_rx_empty(chan) || 6774 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6775 !vmbus_chan_tx_empty(chan))) 6776 pause("waitch", 1); 6777 vmbus_chan_intr_drain(chan); 6778 } 6779 6780 static void 6781 hn_disable_rx(struct hn_softc *sc) 6782 { 6783 6784 /* 6785 * Disable RX by clearing RX filter forcefully. 6786 */ 6787 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6788 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6789 6790 /* 6791 * Give RNDIS enough time to flush all pending data packets. 6792 */ 6793 pause("waitrx", (200 * hz) / 1000); 6794 } 6795 6796 /* 6797 * NOTE: 6798 * RX/TX _must_ have been suspended/disabled, before this function 6799 * is called. 6800 */ 6801 static void 6802 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6803 { 6804 struct vmbus_channel **subch = NULL; 6805 int nsubch; 6806 6807 /* 6808 * Drain RX/TX bufrings and interrupts. 
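	 * Each channel is drained by hn_chan_drain(): wait for the RX
	 * bufring (and, unless the primary channel has been revoked,
	 * the TX bufring) to empty, then drain the channel's interrupt
	 * taskqueue.  The sub-channels are drained before the primary
	 * channel.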
6809 */ 6810 nsubch = nchan - 1; 6811 if (nsubch > 0) 6812 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6813 6814 if (subch != NULL) { 6815 int i; 6816 6817 for (i = 0; i < nsubch; ++i) 6818 hn_chan_drain(sc, subch[i]); 6819 } 6820 hn_chan_drain(sc, sc->hn_prichan); 6821 6822 if (subch != NULL) 6823 vmbus_subchan_rel(subch, nsubch); 6824 } 6825 6826 static void 6827 hn_suspend_data(struct hn_softc *sc) 6828 { 6829 struct hn_tx_ring *txr; 6830 int i; 6831 6832 HN_LOCK_ASSERT(sc); 6833 6834 /* 6835 * Suspend TX. 6836 */ 6837 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6838 txr = &sc->hn_tx_ring[i]; 6839 6840 mtx_lock(&txr->hn_tx_lock); 6841 txr->hn_suspended = 1; 6842 mtx_unlock(&txr->hn_tx_lock); 6843 /* No one is able send more packets now. */ 6844 6845 /* 6846 * Wait for all pending sends to finish. 6847 * 6848 * NOTE: 6849 * We will _not_ receive all pending send-done, if the 6850 * primary channel is revoked. 6851 */ 6852 while (hn_tx_ring_pending(txr) && 6853 !vmbus_chan_is_revoked(sc->hn_prichan)) 6854 pause("hnwtx", 1 /* 1 tick */); 6855 } 6856 6857 /* 6858 * Disable RX. 6859 */ 6860 hn_disable_rx(sc); 6861 6862 /* 6863 * Drain RX/TX. 6864 */ 6865 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6866 6867 /* 6868 * Drain any pending TX tasks. 6869 * 6870 * NOTE: 6871 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6872 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6873 */ 6874 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6875 txr = &sc->hn_tx_ring[i]; 6876 6877 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6878 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6879 } 6880 } 6881 6882 static void 6883 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6884 { 6885 6886 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6887 } 6888 6889 static void 6890 hn_suspend_mgmt(struct hn_softc *sc) 6891 { 6892 struct task task; 6893 6894 HN_LOCK_ASSERT(sc); 6895 6896 /* 6897 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6898 * through hn_mgmt_taskq. 6899 */ 6900 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6901 vmbus_chan_run_task(sc->hn_prichan, &task); 6902 6903 /* 6904 * Make sure that all pending management tasks are completed. 6905 */ 6906 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6907 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6908 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6909 } 6910 6911 static void 6912 hn_suspend(struct hn_softc *sc) 6913 { 6914 6915 /* Disable polling. */ 6916 hn_polling(sc, 0); 6917 6918 /* 6919 * If the non-transparent mode VF is activated, the synthetic 6920 * device is receiving packets, so the data path of the 6921 * synthetic device must be suspended. 6922 */ 6923 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6924 (sc->hn_flags & HN_FLAG_RXVF)) 6925 hn_suspend_data(sc); 6926 hn_suspend_mgmt(sc); 6927 } 6928 6929 static void 6930 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6931 { 6932 int i; 6933 6934 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6935 ("invalid TX ring count %d", tx_ring_cnt)); 6936 6937 for (i = 0; i < tx_ring_cnt; ++i) { 6938 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6939 6940 mtx_lock(&txr->hn_tx_lock); 6941 txr->hn_suspended = 0; 6942 mtx_unlock(&txr->hn_tx_lock); 6943 } 6944 } 6945 6946 static void 6947 hn_resume_data(struct hn_softc *sc) 6948 { 6949 int i; 6950 6951 HN_LOCK_ASSERT(sc); 6952 6953 /* 6954 * Re-enable RX. 
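	 * hn_rxfilter_config() reprograms the RNDIS RX filter, which
	 * hn_disable_rx() cleared to NDIS_PACKET_TYPE_NONE during
	 * suspend.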
6955 */ 6956 hn_rxfilter_config(sc); 6957 6958 /* 6959 * Make sure to clear suspend status on "all" TX rings, 6960 * since hn_tx_ring_inuse can be changed after 6961 * hn_suspend_data(). 6962 */ 6963 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6964 6965 #ifdef HN_IFSTART_SUPPORT 6966 if (!hn_use_if_start) 6967 #endif 6968 { 6969 /* 6970 * Flush unused drbrs, since hn_tx_ring_inuse may be 6971 * reduced. 6972 */ 6973 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6974 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6975 } 6976 6977 /* 6978 * Kick start TX. 6979 */ 6980 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6981 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6982 6983 /* 6984 * Use txeof task, so that any pending oactive can be 6985 * cleared properly. 6986 */ 6987 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6988 } 6989 } 6990 6991 static void 6992 hn_resume_mgmt(struct hn_softc *sc) 6993 { 6994 6995 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6996 6997 /* 6998 * Kick off network change detection, if it was pending. 6999 * If no network change was pending, start link status 7000 * checks, which is more lightweight than network change 7001 * detection. 7002 */ 7003 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 7004 hn_change_network(sc); 7005 else 7006 hn_update_link_status(sc); 7007 } 7008 7009 static void 7010 hn_resume(struct hn_softc *sc) 7011 { 7012 7013 /* 7014 * If the non-transparent mode VF is activated, the synthetic 7015 * device have to receive packets, so the data path of the 7016 * synthetic device must be resumed. 7017 */ 7018 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7019 (sc->hn_flags & HN_FLAG_RXVF)) 7020 hn_resume_data(sc); 7021 7022 /* 7023 * Don't resume link status change if VF is attached/activated. 7024 * - In the non-transparent VF mode, the synthetic device marks 7025 * link down until the VF is deactivated; i.e. VF is down. 7026 * - In transparent VF mode, VF's media status is used until 7027 * the VF is detached. 7028 */ 7029 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7030 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7031 hn_resume_mgmt(sc); 7032 7033 /* 7034 * Re-enable polling if this interface is running and 7035 * the polling is requested. 7036 */ 7037 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7038 hn_polling(sc, sc->hn_pollhz); 7039 } 7040 7041 static void 7042 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7043 { 7044 const struct rndis_status_msg *msg; 7045 int ofs; 7046 7047 if (dlen < sizeof(*msg)) { 7048 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7049 return; 7050 } 7051 msg = data; 7052 7053 switch (msg->rm_status) { 7054 case RNDIS_STATUS_MEDIA_CONNECT: 7055 case RNDIS_STATUS_MEDIA_DISCONNECT: 7056 hn_update_link_status(sc); 7057 break; 7058 7059 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7060 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7061 /* Not really useful; ignore. 
*/ 7062 break; 7063 7064 case RNDIS_STATUS_NETWORK_CHANGE: 7065 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7066 if (dlen < ofs + msg->rm_stbuflen || 7067 msg->rm_stbuflen < sizeof(uint32_t)) { 7068 if_printf(sc->hn_ifp, "network changed\n"); 7069 } else { 7070 uint32_t change; 7071 7072 memcpy(&change, ((const uint8_t *)msg) + ofs, 7073 sizeof(change)); 7074 if_printf(sc->hn_ifp, "network changed, change %u\n", 7075 change); 7076 } 7077 hn_change_network(sc); 7078 break; 7079 7080 default: 7081 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7082 msg->rm_status); 7083 break; 7084 } 7085 } 7086 7087 static int 7088 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7089 { 7090 const struct rndis_pktinfo *pi = info_data; 7091 uint32_t mask = 0; 7092 7093 while (info_dlen != 0) { 7094 const void *data; 7095 uint32_t dlen; 7096 7097 if (__predict_false(info_dlen < sizeof(*pi))) 7098 return (EINVAL); 7099 if (__predict_false(info_dlen < pi->rm_size)) 7100 return (EINVAL); 7101 info_dlen -= pi->rm_size; 7102 7103 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7104 return (EINVAL); 7105 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7106 return (EINVAL); 7107 dlen = pi->rm_size - pi->rm_pktinfooffset; 7108 data = pi->rm_data; 7109 7110 if (pi->rm_internal == 1) { 7111 switch (pi->rm_type) { 7112 case NDIS_PKTINFO_IT_PKTINFO_ID: 7113 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7114 return (EINVAL); 7115 info->pktinfo_id = 7116 (const struct packet_info_id *)data; 7117 mask |= HN_RXINFO_PKTINFO_ID; 7118 break; 7119 7120 default: 7121 goto next; 7122 } 7123 } else { 7124 switch (pi->rm_type) { 7125 case NDIS_PKTINFO_TYPE_VLAN: 7126 if (__predict_false(dlen 7127 < NDIS_VLAN_INFO_SIZE)) 7128 return (EINVAL); 7129 info->vlan_info = (const uint32_t *)data; 7130 mask |= HN_RXINFO_VLAN; 7131 break; 7132 7133 case NDIS_PKTINFO_TYPE_CSUM: 7134 if (__predict_false(dlen 7135 < NDIS_RXCSUM_INFO_SIZE)) 7136 return (EINVAL); 7137 info->csum_info = (const uint32_t *)data; 7138 mask |= HN_RXINFO_CSUM; 7139 break; 7140 7141 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7142 if (__predict_false(dlen 7143 < HN_NDIS_HASH_VALUE_SIZE)) 7144 return (EINVAL); 7145 info->hash_value = (const uint32_t *)data; 7146 mask |= HN_RXINFO_HASHVAL; 7147 break; 7148 7149 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7150 if (__predict_false(dlen 7151 < HN_NDIS_HASH_INFO_SIZE)) 7152 return (EINVAL); 7153 info->hash_info = (const uint32_t *)data; 7154 mask |= HN_RXINFO_HASHINF; 7155 break; 7156 7157 default: 7158 goto next; 7159 } 7160 } 7161 7162 if (mask == HN_RXINFO_ALL) { 7163 /* All found; done */ 7164 break; 7165 } 7166 next: 7167 pi = (const struct rndis_pktinfo *) 7168 ((const uint8_t *)pi + pi->rm_size); 7169 } 7170 7171 /* 7172 * Final fixup. 7173 * - If there is no hash value, invalidate the hash info. 
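	 *   (A hash type without an accompanying hash value is of no
	 *    use to the RX path.)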
7174 */ 7175 if ((mask & HN_RXINFO_HASHVAL) == 0) 7176 info->hash_info = NULL; 7177 return (0); 7178 } 7179 7180 static __inline bool 7181 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7182 { 7183 7184 if (off < check_off) { 7185 if (__predict_true(off + len <= check_off)) 7186 return (false); 7187 } else if (off > check_off) { 7188 if (__predict_true(check_off + check_len <= off)) 7189 return (false); 7190 } 7191 return (true); 7192 } 7193 7194 static __inline void 7195 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7196 uint32_t len, struct hn_rxinfo *info) 7197 { 7198 uint32_t cnt = rxr->rsc.cnt; 7199 7200 if (cnt) { 7201 rxr->rsc.pktlen += len; 7202 } else { 7203 rxr->rsc.vlan_info = info->vlan_info; 7204 rxr->rsc.csum_info = info->csum_info; 7205 rxr->rsc.hash_info = info->hash_info; 7206 rxr->rsc.hash_value = info->hash_value; 7207 rxr->rsc.pktlen = len; 7208 } 7209 7210 rxr->rsc.frag_data[cnt] = data; 7211 rxr->rsc.frag_len[cnt] = len; 7212 rxr->rsc.cnt++; 7213 } 7214 7215 static void 7216 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7217 { 7218 const struct rndis_packet_msg *pkt; 7219 struct hn_rxinfo info; 7220 int data_off, pktinfo_off, data_len, pktinfo_len; 7221 bool rsc_more= false; 7222 7223 /* 7224 * Check length. 7225 */ 7226 if (__predict_false(dlen < sizeof(*pkt))) { 7227 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7228 return; 7229 } 7230 pkt = data; 7231 7232 if (__predict_false(dlen < pkt->rm_len)) { 7233 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7234 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7235 return; 7236 } 7237 if (__predict_false(pkt->rm_len < 7238 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7239 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7240 "msglen %u, data %u, oob %u, pktinfo %u\n", 7241 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7242 pkt->rm_pktinfolen); 7243 return; 7244 } 7245 if (__predict_false(pkt->rm_datalen == 0)) { 7246 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7247 return; 7248 } 7249 7250 /* 7251 * Check offests. 7252 */ 7253 #define IS_OFFSET_INVALID(ofs) \ 7254 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7255 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7256 7257 /* XXX Hyper-V does not meet data offset alignment requirement */ 7258 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7259 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7260 "data offset %u\n", pkt->rm_dataoffset); 7261 return; 7262 } 7263 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7264 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7265 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7266 "oob offset %u\n", pkt->rm_oobdataoffset); 7267 return; 7268 } 7269 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7270 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7271 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7272 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7273 return; 7274 } 7275 7276 #undef IS_OFFSET_INVALID 7277 7278 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7279 data_len = pkt->rm_datalen; 7280 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7281 pktinfo_len = pkt->rm_pktinfolen; 7282 7283 /* 7284 * Check OOB coverage. 
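	 * OOB (out-of-band) data is not consumed by this driver; when
	 * it is present it must stay within the message and must not
	 * overlap the data or pktinfo regions.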
7285 */ 7286 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7287 int oob_off, oob_len; 7288 7289 if_printf(rxr->hn_ifp, "got oobdata\n"); 7290 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7291 oob_len = pkt->rm_oobdatalen; 7292 7293 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7294 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7295 "oob overflow, msglen %u, oob abs %d len %d\n", 7296 pkt->rm_len, oob_off, oob_len); 7297 return; 7298 } 7299 7300 /* 7301 * Check against data. 7302 */ 7303 if (hn_rndis_check_overlap(oob_off, oob_len, 7304 data_off, data_len)) { 7305 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7306 "oob overlaps data, oob abs %d len %d, " 7307 "data abs %d len %d\n", 7308 oob_off, oob_len, data_off, data_len); 7309 return; 7310 } 7311 7312 /* 7313 * Check against pktinfo. 7314 */ 7315 if (pktinfo_len != 0 && 7316 hn_rndis_check_overlap(oob_off, oob_len, 7317 pktinfo_off, pktinfo_len)) { 7318 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7319 "oob overlaps pktinfo, oob abs %d len %d, " 7320 "pktinfo abs %d len %d\n", 7321 oob_off, oob_len, pktinfo_off, pktinfo_len); 7322 return; 7323 } 7324 } 7325 7326 /* 7327 * Check per-packet-info coverage and find useful per-packet-info. 7328 */ 7329 info.vlan_info = NULL; 7330 info.csum_info = NULL; 7331 info.hash_info = NULL; 7332 info.pktinfo_id = NULL; 7333 7334 if (__predict_true(pktinfo_len != 0)) { 7335 bool overlap; 7336 int error; 7337 7338 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7339 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7340 "pktinfo overflow, msglen %u, " 7341 "pktinfo abs %d len %d\n", 7342 pkt->rm_len, pktinfo_off, pktinfo_len); 7343 return; 7344 } 7345 7346 /* 7347 * Check packet info coverage. 7348 */ 7349 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7350 data_off, data_len); 7351 if (__predict_false(overlap)) { 7352 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7353 "pktinfo overlap data, pktinfo abs %d len %d, " 7354 "data abs %d len %d\n", 7355 pktinfo_off, pktinfo_len, data_off, data_len); 7356 return; 7357 } 7358 7359 /* 7360 * Find useful per-packet-info. 
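	 * hn_rndis_rxinfo() walks the per-packet-info records and
	 * collects the VLAN, checksum, hash and packet-id metadata
	 * into 'info'.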
7361 */ 7362 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7363 pktinfo_len, &info); 7364 if (__predict_false(error)) { 7365 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7366 "pktinfo\n"); 7367 return; 7368 } 7369 } 7370 7371 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7372 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7373 "data overflow, msglen %u, data abs %d len %d\n", 7374 pkt->rm_len, data_off, data_len); 7375 return; 7376 } 7377 7378 /* Identify RSC fragments, drop invalid packets */ 7379 if ((info.pktinfo_id != NULL) && 7380 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7381 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7382 rxr->rsc.cnt = 0; 7383 rxr->hn_rsc_pkts++; 7384 } else if (rxr->rsc.cnt == 0) 7385 goto drop; 7386 7387 rsc_more = true; 7388 7389 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7390 rsc_more = false; 7391 7392 if (rsc_more && rxr->rsc.is_last) 7393 goto drop; 7394 } else { 7395 rxr->rsc.cnt = 0; 7396 } 7397 7398 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7399 goto drop; 7400 7401 /* Store data in per rx ring structure */ 7402 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7403 data_len, &info); 7404 7405 if (rsc_more) 7406 return; 7407 7408 hn_rxpkt(rxr); 7409 rxr->rsc.cnt = 0; 7410 return; 7411 drop: 7412 rxr->hn_rsc_drop++; 7413 return; 7414 } 7415 7416 static __inline void 7417 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7418 { 7419 const struct rndis_msghdr *hdr; 7420 7421 if (__predict_false(dlen < sizeof(*hdr))) { 7422 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7423 return; 7424 } 7425 hdr = data; 7426 7427 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7428 /* Hot data path. */ 7429 hn_rndis_rx_data(rxr, data, dlen); 7430 /* Done! */ 7431 return; 7432 } 7433 7434 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7435 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7436 else 7437 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7438 } 7439 7440 static void 7441 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7442 { 7443 const struct hn_nvs_hdr *hdr; 7444 7445 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7446 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7447 return; 7448 } 7449 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7450 7451 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7452 /* Useless; ignore */ 7453 return; 7454 } 7455 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7456 } 7457 7458 static void 7459 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7460 const struct vmbus_chanpkt_hdr *pkt) 7461 { 7462 struct hn_nvs_sendctx *sndc; 7463 7464 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7465 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7466 VMBUS_CHANPKT_DATALEN(pkt)); 7467 /* 7468 * NOTE: 7469 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7470 * its callback. 
7471 */ 7472 } 7473 7474 static void 7475 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7476 const struct vmbus_chanpkt_hdr *pkthdr) 7477 { 7478 struct epoch_tracker et; 7479 const struct vmbus_chanpkt_rxbuf *pkt; 7480 const struct hn_nvs_hdr *nvs_hdr; 7481 int count, i, hlen; 7482 7483 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7484 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7485 return; 7486 } 7487 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7488 7489 /* Make sure that this is a RNDIS message. */ 7490 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7491 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7492 nvs_hdr->nvs_type); 7493 return; 7494 } 7495 7496 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7497 if (__predict_false(hlen < sizeof(*pkt))) { 7498 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7499 return; 7500 } 7501 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7502 7503 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7504 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7505 pkt->cp_rxbuf_id); 7506 return; 7507 } 7508 7509 count = pkt->cp_rxbuf_cnt; 7510 if (__predict_false(hlen < 7511 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7512 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7513 return; 7514 } 7515 7516 NET_EPOCH_ENTER(et); 7517 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7518 for (i = 0; i < count; ++i) { 7519 int ofs, len; 7520 7521 ofs = pkt->cp_rxbuf[i].rb_ofs; 7522 len = pkt->cp_rxbuf[i].rb_len; 7523 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7524 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7525 "ofs %d, len %d\n", i, ofs, len); 7526 continue; 7527 } 7528 7529 rxr->rsc.is_last = (i == (count - 1)); 7530 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7531 } 7532 NET_EPOCH_EXIT(et); 7533 7534 /* 7535 * Ack the consumed RXBUF associated w/ this channel packet, 7536 * so that this RXBUF can be recycled by the hypervisor. 7537 */ 7538 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7539 } 7540 7541 static void 7542 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7543 uint64_t tid) 7544 { 7545 struct hn_nvs_rndis_ack ack; 7546 int retries, error; 7547 7548 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7549 ack.nvs_status = HN_NVS_STATUS_OK; 7550 7551 retries = 0; 7552 again: 7553 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7554 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7555 if (__predict_false(error == EAGAIN)) { 7556 /* 7557 * NOTE: 7558 * This should _not_ happen in real world, since the 7559 * consumption of the TX bufring from the TX path is 7560 * controlled. 7561 */ 7562 if (rxr->hn_ack_failed == 0) 7563 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7564 rxr->hn_ack_failed++; 7565 retries++; 7566 if (retries < 10) { 7567 DELAY(100); 7568 goto again; 7569 } 7570 /* RXBUF leaks! */ 7571 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7572 } 7573 } 7574 7575 static void 7576 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7577 { 7578 struct hn_rx_ring *rxr = xrxr; 7579 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7580 7581 for (;;) { 7582 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7583 int error, pktlen; 7584 7585 pktlen = rxr->hn_pktbuf_len; 7586 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7587 if (__predict_false(error == ENOBUFS)) { 7588 void *nbuf; 7589 int nlen; 7590 7591 /* 7592 * Expand channel packet buffer. 
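	 * The buffer is doubled until it can hold the pending
	 * packet ('pktlen'), and the receive is then retried.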
7593 * 7594 * XXX 7595 * Use M_WAITOK here, since allocation failure 7596 * is fatal. 7597 */ 7598 nlen = rxr->hn_pktbuf_len * 2; 7599 while (nlen < pktlen) 7600 nlen *= 2; 7601 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7602 7603 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7604 rxr->hn_pktbuf_len, nlen); 7605 7606 free(rxr->hn_pktbuf, M_DEVBUF); 7607 rxr->hn_pktbuf = nbuf; 7608 rxr->hn_pktbuf_len = nlen; 7609 /* Retry! */ 7610 continue; 7611 } else if (__predict_false(error == EAGAIN)) { 7612 /* No more channel packets; done! */ 7613 break; 7614 } 7615 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7616 7617 switch (pkt->cph_type) { 7618 case VMBUS_CHANPKT_TYPE_COMP: 7619 hn_nvs_handle_comp(sc, chan, pkt); 7620 break; 7621 7622 case VMBUS_CHANPKT_TYPE_RXBUF: 7623 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7624 break; 7625 7626 case VMBUS_CHANPKT_TYPE_INBAND: 7627 hn_nvs_handle_notify(sc, pkt); 7628 break; 7629 7630 default: 7631 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7632 pkt->cph_type); 7633 break; 7634 } 7635 } 7636 hn_chan_rollup(rxr, rxr->hn_txr); 7637 } 7638 7639 static void 7640 hn_sysinit(void *arg __unused) 7641 { 7642 int i; 7643 7644 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7645 7646 #ifdef HN_IFSTART_SUPPORT 7647 /* 7648 * Don't use ifnet.if_start if transparent VF mode is requested; 7649 * mainly due to the IFF_DRV_OACTIVE flag. 7650 */ 7651 if (hn_xpnt_vf && hn_use_if_start) { 7652 hn_use_if_start = 0; 7653 printf("hn: tranparent VF mode, if_transmit will be used, " 7654 "instead of if_start\n"); 7655 } 7656 #endif 7657 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7658 printf("hn: invalid transparent VF attach routing " 7659 "wait timeout %d, reset to %d\n", 7660 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7661 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7662 } 7663 7664 /* 7665 * Initialize VF map. 7666 */ 7667 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7668 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7669 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7670 M_WAITOK | M_ZERO); 7671 7672 /* 7673 * Fix the # of TX taskqueues. 7674 */ 7675 if (hn_tx_taskq_cnt <= 0) 7676 hn_tx_taskq_cnt = 1; 7677 else if (hn_tx_taskq_cnt > mp_ncpus) 7678 hn_tx_taskq_cnt = mp_ncpus; 7679 7680 /* 7681 * Fix the TX taskqueue mode. 
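	 * Unknown values fall back to HN_TX_TASKQ_M_INDEP.  The shared
	 * taskqueues created below are only used when the mode is
	 * HN_TX_TASKQ_M_GLOBAL.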
7682 */ 7683 switch (hn_tx_taskq_mode) { 7684 case HN_TX_TASKQ_M_INDEP: 7685 case HN_TX_TASKQ_M_GLOBAL: 7686 case HN_TX_TASKQ_M_EVTTQ: 7687 break; 7688 default: 7689 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7690 break; 7691 } 7692 7693 if (vm_guest != VM_GUEST_HV) 7694 return; 7695 7696 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7697 return; 7698 7699 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7700 M_DEVBUF, M_WAITOK); 7701 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7702 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7703 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7704 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7705 "hn tx%d", i); 7706 } 7707 } 7708 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7709 7710 static void 7711 hn_sysuninit(void *arg __unused) 7712 { 7713 7714 if (hn_tx_taskque != NULL) { 7715 int i; 7716 7717 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7718 taskqueue_free(hn_tx_taskque[i]); 7719 free(hn_tx_taskque, M_DEVBUF); 7720 } 7721 7722 if (hn_vfmap != NULL) 7723 free(hn_vfmap, M_DEVBUF); 7724 rm_destroy(&hn_vfmap_lock); 7725 7726 counter_u64_free(hn_udpcs_fixup); 7727 } 7728 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7729