1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 struct ifnet *vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(struct ifnet *); 276 #endif 277 static int hn_transmit(struct ifnet *, struct mbuf *); 278 static void hn_xmit_qflush(struct ifnet *); 279 static int hn_ifmedia_upd(struct 
ifnet *); 280 static void hn_ifmedia_sts(struct ifnet *, 281 struct ifmediareq *); 282 283 static void hn_ifnet_event(void *, struct ifnet *, int); 284 static void hn_ifaddr_event(void *, struct ifnet *); 285 static void hn_ifnet_attevent(void *, struct ifnet *); 286 static void hn_ifnet_detevent(void *, struct ifnet *); 287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const struct ifnet *); 291 static void hn_rxvf_change(struct hn_softc *, 292 struct ifnet *, bool); 293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 #if __FreeBSD_version >= 1100099 329 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 331 #endif 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 #if __FreeBSD_version < 1100095 335 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 336 #else 337 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 338 #endif 339 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 346 #ifndef RSS 347 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 349 #endif 350 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 357 static int 
hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 363 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 364 365 static void hn_stop(struct hn_softc *, bool); 366 static void hn_init_locked(struct hn_softc *); 367 static int hn_chan_attach(struct hn_softc *, 368 struct vmbus_channel *); 369 static void hn_chan_detach(struct hn_softc *, 370 struct vmbus_channel *); 371 static int hn_attach_subchans(struct hn_softc *); 372 static void hn_detach_allchans(struct hn_softc *); 373 static void hn_chan_rollup(struct hn_rx_ring *, 374 struct hn_tx_ring *); 375 static void hn_set_ring_inuse(struct hn_softc *, int); 376 static int hn_synth_attach(struct hn_softc *, int); 377 static void hn_synth_detach(struct hn_softc *); 378 static int hn_synth_alloc_subchans(struct hn_softc *, 379 int *); 380 static bool hn_synth_attachable(const struct hn_softc *); 381 static void hn_suspend(struct hn_softc *); 382 static void hn_suspend_data(struct hn_softc *); 383 static void hn_suspend_mgmt(struct hn_softc *); 384 static void hn_resume(struct hn_softc *); 385 static void hn_resume_data(struct hn_softc *); 386 static void hn_resume_mgmt(struct hn_softc *); 387 static void hn_suspend_mgmt_taskfunc(void *, int); 388 static void hn_chan_drain(struct hn_softc *, 389 struct vmbus_channel *); 390 static void hn_disable_rx(struct hn_softc *); 391 static void hn_drain_rxtx(struct hn_softc *, int); 392 static void hn_polling(struct hn_softc *, u_int); 393 static void hn_chan_polling(struct vmbus_channel *, u_int); 394 static void hn_mtu_change_fixup(struct hn_softc *); 395 396 static void hn_update_link_status(struct hn_softc *); 397 static void hn_change_network(struct hn_softc *); 398 static void hn_link_taskfunc(void *, int); 399 static void hn_netchg_init_taskfunc(void *, int); 400 static void hn_netchg_status_taskfunc(void *, int); 401 static void hn_link_status(struct hn_softc *); 402 403 static int hn_create_rx_data(struct hn_softc *, int); 404 static void hn_destroy_rx_data(struct hn_softc *); 405 static int hn_check_iplen(const struct mbuf *, int); 406 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 407 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 408 static int hn_rxfilter_config(struct hn_softc *); 409 static int hn_rss_reconfig(struct hn_softc *); 410 static void hn_rss_ind_fixup(struct hn_softc *); 411 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 412 static int hn_rxpkt(struct hn_rx_ring *); 413 static uint32_t hn_rss_type_fromndis(uint32_t); 414 static uint32_t hn_rss_type_tondis(uint32_t); 415 416 static int hn_tx_ring_create(struct hn_softc *, int); 417 static void hn_tx_ring_destroy(struct hn_tx_ring *); 418 static int hn_create_tx_data(struct hn_softc *, int); 419 static void hn_fixup_tx_data(struct hn_softc *); 420 static void hn_fixup_rx_data(struct hn_softc *); 421 static void hn_destroy_tx_data(struct hn_softc *); 422 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 423 static void hn_txdesc_gc(struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 426 struct hn_txdesc *, struct mbuf **); 427 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 428 struct hn_txdesc *); 429 static void hn_set_chim_size(struct hn_softc *, int); 430 
static void hn_set_tso_maxsize(struct hn_softc *, int, int); 431 static bool hn_tx_ring_pending(struct hn_tx_ring *); 432 static void hn_tx_ring_qflush(struct hn_tx_ring *); 433 static void hn_resume_tx(struct hn_softc *, int); 434 static void hn_set_txagg(struct hn_softc *); 435 static void *hn_try_txagg(struct ifnet *, 436 struct hn_tx_ring *, struct hn_txdesc *, 437 int); 438 static int hn_get_txswq_depth(const struct hn_tx_ring *); 439 static void hn_txpkt_done(struct hn_nvs_sendctx *, 440 struct hn_softc *, struct vmbus_channel *, 441 const void *, int); 442 static int hn_txpkt_sglist(struct hn_tx_ring *, 443 struct hn_txdesc *); 444 static int hn_txpkt_chim(struct hn_tx_ring *, 445 struct hn_txdesc *); 446 static int hn_xmit(struct hn_tx_ring *, int); 447 static void hn_xmit_taskfunc(void *, int); 448 static void hn_xmit_txeof(struct hn_tx_ring *); 449 static void hn_xmit_txeof_taskfunc(void *, int); 450 #ifdef HN_IFSTART_SUPPORT 451 static int hn_start_locked(struct hn_tx_ring *, int); 452 static void hn_start_taskfunc(void *, int); 453 static void hn_start_txeof(struct hn_tx_ring *); 454 static void hn_start_txeof_taskfunc(void *, int); 455 #endif 456 457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 458 "Hyper-V network interface"); 459 460 /* Trust tcp segements verification on host side. */ 461 static int hn_trust_hosttcp = 1; 462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 463 &hn_trust_hosttcp, 0, 464 "Trust tcp segement verification on host side, " 465 "when csum info is missing (global setting)"); 466 467 /* Trust udp datagrams verification on host side. */ 468 static int hn_trust_hostudp = 1; 469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 470 &hn_trust_hostudp, 0, 471 "Trust udp datagram verification on host side, " 472 "when csum info is missing (global setting)"); 473 474 /* Trust ip packets verification on host side. */ 475 static int hn_trust_hostip = 1; 476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 477 &hn_trust_hostip, 0, 478 "Trust ip packet verification on host side, " 479 "when csum info is missing (global setting)"); 480 481 /* 482 * Offload UDP/IPv4 checksum. 483 */ 484 static int hn_enable_udp4cs = 1; 485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 486 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 487 488 /* 489 * Offload UDP/IPv6 checksum. 490 */ 491 static int hn_enable_udp6cs = 1; 492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 493 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 494 495 /* Stats. */ 496 static counter_u64_t hn_udpcs_fixup; 497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 498 &hn_udpcs_fixup, "# of UDP checksum fixup"); 499 500 /* 501 * See hn_set_hlen(). 502 * 503 * This value is for Azure. For Hyper-V, set this above 504 * 65536 to disable UDP datagram checksum fixup. 
505 */ 506 static int hn_udpcs_fixup_mtu = 1420; 507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 508 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 509 510 /* Limit TSO burst size */ 511 static int hn_tso_maxlen = IP_MAXPACKET; 512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 513 &hn_tso_maxlen, 0, "TSO burst limit"); 514 515 /* Limit chimney send size */ 516 static int hn_tx_chimney_size = 0; 517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 518 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 519 520 /* Limit the size of packet for direct transmission */ 521 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 523 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 524 525 /* # of LRO entries per RX ring */ 526 #if defined(INET) || defined(INET6) 527 #if __FreeBSD_version >= 1100095 528 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 529 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 530 &hn_lro_entry_count, 0, "LRO entry count"); 531 #endif 532 #endif 533 534 static int hn_tx_taskq_cnt = 1; 535 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 536 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 537 538 #define HN_TX_TASKQ_M_INDEP 0 539 #define HN_TX_TASKQ_M_GLOBAL 1 540 #define HN_TX_TASKQ_M_EVTTQ 2 541 542 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 543 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 544 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 545 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 546 547 #ifndef HN_USE_TXDESC_BUFRING 548 static int hn_use_txdesc_bufring = 0; 549 #else 550 static int hn_use_txdesc_bufring = 1; 551 #endif 552 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 553 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 554 555 #ifdef HN_IFSTART_SUPPORT 556 /* Use ifnet.if_start instead of ifnet.if_transmit */ 557 static int hn_use_if_start = 0; 558 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 559 &hn_use_if_start, 0, "Use if_start TX method"); 560 #endif 561 562 /* # of channels to use */ 563 static int hn_chan_cnt = 0; 564 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 565 &hn_chan_cnt, 0, 566 "# of channels to use; each channel has one RX ring and one TX ring"); 567 568 /* # of transmit rings to use */ 569 static int hn_tx_ring_cnt = 0; 570 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 571 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 572 573 /* Software TX ring deptch */ 574 static int hn_tx_swq_depth = 0; 575 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 576 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 577 578 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 579 #if __FreeBSD_version >= 1100095 580 static u_int hn_lro_mbufq_depth = 0; 581 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 582 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 583 #endif 584 585 /* Packet transmission aggregation size limit */ 586 static int hn_tx_agg_size = -1; 587 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 588 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 589 590 /* Packet transmission aggregation count limit */ 591 static int hn_tx_agg_pkts = -1; 592 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 593 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 594 595 /* VF list */ 596 SYSCTL_PROC(_hw_hn, 
OID_AUTO, vflist, 597 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 598 hn_vflist_sysctl, "A", 599 "VF list"); 600 601 /* VF mapping */ 602 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, 603 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 604 hn_vfmap_sysctl, "A", 605 "VF mapping"); 606 607 /* Transparent VF */ 608 static int hn_xpnt_vf = 1; 609 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 610 &hn_xpnt_vf, 0, "Transparent VF mod"); 611 612 /* Accurate BPF support for Transparent VF */ 613 static int hn_xpnt_vf_accbpf = 0; 614 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 615 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 616 617 /* Extra wait for transparent VF attach routing; unit seconds. */ 618 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 619 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 620 &hn_xpnt_vf_attwait, 0, 621 "Extra wait for transparent VF attach routing; unit: seconds"); 622 623 static u_int hn_cpu_index; /* next CPU for channel */ 624 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 625 626 static struct rmlock hn_vfmap_lock; 627 static int hn_vfmap_size; 628 static struct ifnet **hn_vfmap; 629 630 #ifndef RSS 631 static const uint8_t 632 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 633 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 634 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 635 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 636 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 637 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 638 }; 639 #endif /* !RSS */ 640 641 static const struct hyperv_guid hn_guid = { 642 .hv_guid = { 643 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 644 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 645 }; 646 647 static device_method_t hn_methods[] = { 648 /* Device interface */ 649 DEVMETHOD(device_probe, hn_probe), 650 DEVMETHOD(device_attach, hn_attach), 651 DEVMETHOD(device_detach, hn_detach), 652 DEVMETHOD(device_shutdown, hn_shutdown), 653 DEVMETHOD_END 654 }; 655 656 static driver_t hn_driver = { 657 "hn", 658 hn_methods, 659 sizeof(struct hn_softc) 660 }; 661 662 static devclass_t hn_devclass; 663 664 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); 665 MODULE_VERSION(hn, 1); 666 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 667 668 #if __FreeBSD_version >= 1100099 669 static void 670 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 671 { 672 int i; 673 674 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 675 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 676 } 677 #endif 678 679 static int 680 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 681 { 682 683 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 684 txd->chim_size == 0, ("invalid rndis sglist txd")); 685 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 686 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 687 } 688 689 static int 690 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 691 { 692 struct hn_nvs_rndis rndis; 693 694 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 695 txd->chim_size > 0, ("invalid rndis chim txd")); 696 697 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 698 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 699 rndis.nvs_chim_idx = txd->chim_index; 700 rndis.nvs_chim_sz = txd->chim_size; 701 702 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 703 &rndis, sizeof(rndis), &txd->send_ctx)); 704 } 705 706 static __inline uint32_t 707 hn_chim_alloc(struct hn_softc *sc) 708 { 709 int i, bmap_cnt = 
sc->hn_chim_bmap_cnt; 710 u_long *bmap = sc->hn_chim_bmap; 711 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 712 713 for (i = 0; i < bmap_cnt; ++i) { 714 int idx; 715 716 idx = ffsl(~bmap[i]); 717 if (idx == 0) 718 continue; 719 720 --idx; /* ffsl is 1-based */ 721 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 722 ("invalid i %d and idx %d", i, idx)); 723 724 if (atomic_testandset_long(&bmap[i], idx)) 725 continue; 726 727 ret = i * LONG_BIT + idx; 728 break; 729 } 730 return (ret); 731 } 732 733 static __inline void 734 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 735 { 736 u_long mask; 737 uint32_t idx; 738 739 idx = chim_idx / LONG_BIT; 740 KASSERT(idx < sc->hn_chim_bmap_cnt, 741 ("invalid chimney index 0x%x", chim_idx)); 742 743 mask = 1UL << (chim_idx % LONG_BIT); 744 KASSERT(sc->hn_chim_bmap[idx] & mask, 745 ("index bitmap 0x%lx, chimney index %u, " 746 "bitmap idx %d, bitmask 0x%lx", 747 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 748 749 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 750 } 751 752 #if defined(INET6) || defined(INET) 753 754 #define PULLUP_HDR(m, len) \ 755 do { \ 756 if (__predict_false((m)->m_len < (len))) { \ 757 (m) = m_pullup((m), (len)); \ 758 if ((m) == NULL) \ 759 return (NULL); \ 760 } \ 761 } while (0) 762 763 /* 764 * NOTE: If this function failed, the m_head would be freed. 765 */ 766 static __inline struct mbuf * 767 hn_tso_fixup(struct mbuf *m_head) 768 { 769 struct ether_vlan_header *evl; 770 struct tcphdr *th; 771 int ehlen; 772 773 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 774 775 PULLUP_HDR(m_head, sizeof(*evl)); 776 evl = mtod(m_head, struct ether_vlan_header *); 777 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 778 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 779 else 780 ehlen = ETHER_HDR_LEN; 781 m_head->m_pkthdr.l2hlen = ehlen; 782 783 #ifdef INET 784 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 785 struct ip *ip; 786 int iphlen; 787 788 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 789 ip = mtodo(m_head, ehlen); 790 iphlen = ip->ip_hl << 2; 791 m_head->m_pkthdr.l3hlen = iphlen; 792 793 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 794 th = mtodo(m_head, ehlen + iphlen); 795 796 ip->ip_len = 0; 797 ip->ip_sum = 0; 798 th->th_sum = in_pseudo(ip->ip_src.s_addr, 799 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 800 } 801 #endif 802 #if defined(INET6) && defined(INET) 803 else 804 #endif 805 #ifdef INET6 806 { 807 struct ip6_hdr *ip6; 808 809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 810 ip6 = mtodo(m_head, ehlen); 811 if (ip6->ip6_nxt != IPPROTO_TCP) { 812 m_freem(m_head); 813 return (NULL); 814 } 815 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 816 817 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 818 th = mtodo(m_head, ehlen + sizeof(*ip6)); 819 820 ip6->ip6_plen = 0; 821 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 822 } 823 #endif 824 return (m_head); 825 } 826 827 /* 828 * NOTE: If this function failed, the m_head would be freed. 
829 */ 830 static __inline struct mbuf * 831 hn_set_hlen(struct mbuf *m_head) 832 { 833 const struct ether_vlan_header *evl; 834 int ehlen; 835 836 PULLUP_HDR(m_head, sizeof(*evl)); 837 evl = mtod(m_head, const struct ether_vlan_header *); 838 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 839 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 840 else 841 ehlen = ETHER_HDR_LEN; 842 m_head->m_pkthdr.l2hlen = ehlen; 843 844 #ifdef INET 845 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 846 const struct ip *ip; 847 int iphlen; 848 849 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 850 ip = mtodo(m_head, ehlen); 851 iphlen = ip->ip_hl << 2; 852 m_head->m_pkthdr.l3hlen = iphlen; 853 854 /* 855 * UDP checksum offload does not work in Azure, if the 856 * following conditions meet: 857 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 858 * - IP_DF is not set in the IP hdr. 859 * 860 * Fallback to software checksum for these UDP datagrams. 861 */ 862 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 863 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 864 (ntohs(ip->ip_off) & IP_DF) == 0) { 865 uint16_t off = ehlen + iphlen; 866 867 counter_u64_add(hn_udpcs_fixup, 1); 868 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 869 *(uint16_t *)(m_head->m_data + off + 870 m_head->m_pkthdr.csum_data) = in_cksum_skip( 871 m_head, m_head->m_pkthdr.len, off); 872 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 873 } 874 } 875 #endif 876 #if defined(INET6) && defined(INET) 877 else 878 #endif 879 #ifdef INET6 880 { 881 const struct ip6_hdr *ip6; 882 883 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 884 ip6 = mtodo(m_head, ehlen); 885 if (ip6->ip6_nxt != IPPROTO_TCP && 886 ip6->ip6_nxt != IPPROTO_UDP) { 887 m_freem(m_head); 888 return (NULL); 889 } 890 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 891 } 892 #endif 893 return (m_head); 894 } 895 896 /* 897 * NOTE: If this function failed, the m_head would be freed. 898 */ 899 static __inline struct mbuf * 900 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 901 { 902 const struct tcphdr *th; 903 int ehlen, iphlen; 904 905 *tcpsyn = 0; 906 ehlen = m_head->m_pkthdr.l2hlen; 907 iphlen = m_head->m_pkthdr.l3hlen; 908 909 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 910 th = mtodo(m_head, ehlen + iphlen); 911 if (th->th_flags & TH_SYN) 912 *tcpsyn = 1; 913 return (m_head); 914 } 915 916 #undef PULLUP_HDR 917 918 #endif /* INET6 || INET */ 919 920 static int 921 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 922 { 923 int error = 0; 924 925 HN_LOCK_ASSERT(sc); 926 927 if (sc->hn_rx_filter != filter) { 928 error = hn_rndis_set_rxfilter(sc, filter); 929 if (!error) 930 sc->hn_rx_filter = filter; 931 } 932 return (error); 933 } 934 935 static int 936 hn_rxfilter_config(struct hn_softc *sc) 937 { 938 struct ifnet *ifp = sc->hn_ifp; 939 uint32_t filter; 940 941 HN_LOCK_ASSERT(sc); 942 943 /* 944 * If the non-transparent mode VF is activated, we don't know how 945 * its RX filter is configured, so stick the synthetic device in 946 * the promiscous mode. 
947 */ 948 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 949 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 950 } else { 951 filter = NDIS_PACKET_TYPE_DIRECTED; 952 if (ifp->if_flags & IFF_BROADCAST) 953 filter |= NDIS_PACKET_TYPE_BROADCAST; 954 /* TODO: support multicast list */ 955 if ((ifp->if_flags & IFF_ALLMULTI) || 956 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 957 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 958 } 959 return (hn_set_rxfilter(sc, filter)); 960 } 961 962 static void 963 hn_set_txagg(struct hn_softc *sc) 964 { 965 uint32_t size, pkts; 966 int i; 967 968 /* 969 * Setup aggregation size. 970 */ 971 if (sc->hn_agg_size < 0) 972 size = UINT32_MAX; 973 else 974 size = sc->hn_agg_size; 975 976 if (sc->hn_rndis_agg_size < size) 977 size = sc->hn_rndis_agg_size; 978 979 /* NOTE: We only aggregate packets using chimney sending buffers. */ 980 if (size > (uint32_t)sc->hn_chim_szmax) 981 size = sc->hn_chim_szmax; 982 983 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 984 /* Disable */ 985 size = 0; 986 pkts = 0; 987 goto done; 988 } 989 990 /* NOTE: Type of the per TX ring setting is 'int'. */ 991 if (size > INT_MAX) 992 size = INT_MAX; 993 994 /* 995 * Setup aggregation packet count. 996 */ 997 if (sc->hn_agg_pkts < 0) 998 pkts = UINT32_MAX; 999 else 1000 pkts = sc->hn_agg_pkts; 1001 1002 if (sc->hn_rndis_agg_pkts < pkts) 1003 pkts = sc->hn_rndis_agg_pkts; 1004 1005 if (pkts <= 1) { 1006 /* Disable */ 1007 size = 0; 1008 pkts = 0; 1009 goto done; 1010 } 1011 1012 /* NOTE: Type of the per TX ring setting is 'short'. */ 1013 if (pkts > SHRT_MAX) 1014 pkts = SHRT_MAX; 1015 1016 done: 1017 /* NOTE: Type of the per TX ring setting is 'short'. */ 1018 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1019 /* Disable */ 1020 size = 0; 1021 pkts = 0; 1022 } 1023 1024 if (bootverbose) { 1025 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1026 size, pkts, sc->hn_rndis_agg_align); 1027 } 1028 1029 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1030 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1031 1032 mtx_lock(&txr->hn_tx_lock); 1033 txr->hn_agg_szmax = size; 1034 txr->hn_agg_pktmax = pkts; 1035 txr->hn_agg_align = sc->hn_rndis_agg_align; 1036 mtx_unlock(&txr->hn_tx_lock); 1037 } 1038 } 1039 1040 static int 1041 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1042 { 1043 1044 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1045 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1046 return txr->hn_txdesc_cnt; 1047 return hn_tx_swq_depth; 1048 } 1049 1050 static int 1051 hn_rss_reconfig(struct hn_softc *sc) 1052 { 1053 int error; 1054 1055 HN_LOCK_ASSERT(sc); 1056 1057 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1058 return (ENXIO); 1059 1060 /* 1061 * Disable RSS first. 1062 * 1063 * NOTE: 1064 * Direct reconfiguration by setting the UNCHG flags does 1065 * _not_ work properly. 1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "disable RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1072 return (error); 1073 } 1074 1075 /* 1076 * Reenable the RSS w/ the updated RSS key or indirect 1077 * table. 
1078 */ 1079 if (bootverbose) 1080 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1081 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1082 if (error) { 1083 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1084 return (error); 1085 } 1086 return (0); 1087 } 1088 1089 static void 1090 hn_rss_ind_fixup(struct hn_softc *sc) 1091 { 1092 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1093 int i, nchan; 1094 1095 nchan = sc->hn_rx_ring_inuse; 1096 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1097 1098 /* 1099 * Check indirect table to make sure that all channels in it 1100 * can be used. 1101 */ 1102 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1103 if (rss->rss_ind[i] >= nchan) { 1104 if_printf(sc->hn_ifp, 1105 "RSS indirect table %d fixup: %u -> %d\n", 1106 i, rss->rss_ind[i], nchan - 1); 1107 rss->rss_ind[i] = nchan - 1; 1108 } 1109 } 1110 } 1111 1112 static int 1113 hn_ifmedia_upd(struct ifnet *ifp __unused) 1114 { 1115 1116 return EOPNOTSUPP; 1117 } 1118 1119 static void 1120 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1121 { 1122 struct hn_softc *sc = ifp->if_softc; 1123 1124 ifmr->ifm_status = IFM_AVALID; 1125 ifmr->ifm_active = IFM_ETHER; 1126 1127 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1128 ifmr->ifm_active |= IFM_NONE; 1129 return; 1130 } 1131 ifmr->ifm_status |= IFM_ACTIVE; 1132 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1133 } 1134 1135 static void 1136 hn_rxvf_set_task(void *xarg, int pending __unused) 1137 { 1138 struct hn_rxvf_setarg *arg = xarg; 1139 1140 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1141 } 1142 1143 static void 1144 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1145 { 1146 struct hn_rx_ring *rxr; 1147 struct hn_rxvf_setarg arg; 1148 struct task task; 1149 int i; 1150 1151 HN_LOCK_ASSERT(sc); 1152 1153 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1154 1155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1156 rxr = &sc->hn_rx_ring[i]; 1157 1158 if (i < sc->hn_rx_ring_inuse) { 1159 arg.rxr = rxr; 1160 arg.vf_ifp = vf_ifp; 1161 vmbus_chan_run_task(rxr->hn_chan, &task); 1162 } else { 1163 rxr->hn_rxvf_ifp = vf_ifp; 1164 } 1165 } 1166 } 1167 1168 static bool 1169 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1170 { 1171 const struct ifnet *hn_ifp; 1172 1173 hn_ifp = sc->hn_ifp; 1174 1175 if (ifp == hn_ifp) 1176 return (false); 1177 1178 if (ifp->if_alloctype != IFT_ETHER) 1179 return (false); 1180 1181 /* Ignore lagg/vlan interfaces */ 1182 if (strcmp(ifp->if_dname, "lagg") == 0 || 1183 strcmp(ifp->if_dname, "vlan") == 0) 1184 return (false); 1185 1186 /* 1187 * During detach events ifp->if_addr might be NULL. 
1188 * Make sure the bcmp() below doesn't panic on that: 1189 */ 1190 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1191 return (false); 1192 1193 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1194 return (false); 1195 1196 return (true); 1197 } 1198 1199 static void 1200 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1201 { 1202 struct ifnet *hn_ifp; 1203 1204 HN_LOCK(sc); 1205 1206 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1207 goto out; 1208 1209 if (!hn_ismyvf(sc, ifp)) 1210 goto out; 1211 hn_ifp = sc->hn_ifp; 1212 1213 if (rxvf) { 1214 if (sc->hn_flags & HN_FLAG_RXVF) 1215 goto out; 1216 1217 sc->hn_flags |= HN_FLAG_RXVF; 1218 hn_rxfilter_config(sc); 1219 } else { 1220 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1221 goto out; 1222 1223 sc->hn_flags &= ~HN_FLAG_RXVF; 1224 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1225 hn_rxfilter_config(sc); 1226 else 1227 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1228 } 1229 1230 hn_nvs_set_datapath(sc, 1231 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1232 1233 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1234 1235 if (rxvf) { 1236 hn_vf_rss_fixup(sc, true); 1237 hn_suspend_mgmt(sc); 1238 sc->hn_link_flags &= 1239 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1240 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1241 } else { 1242 hn_vf_rss_restore(sc); 1243 hn_resume_mgmt(sc); 1244 } 1245 1246 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1247 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1248 1249 if (bootverbose) { 1250 if_printf(hn_ifp, "datapath is switched %s %s\n", 1251 rxvf ? "to" : "from", ifp->if_xname); 1252 } 1253 out: 1254 HN_UNLOCK(sc); 1255 } 1256 1257 static void 1258 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1259 { 1260 1261 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1262 return; 1263 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1264 } 1265 1266 static void 1267 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1268 { 1269 1270 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1271 } 1272 1273 static int 1274 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1275 { 1276 struct ifnet *ifp, *vf_ifp; 1277 uint64_t tmp; 1278 int error; 1279 1280 HN_LOCK_ASSERT(sc); 1281 ifp = sc->hn_ifp; 1282 vf_ifp = sc->hn_vf_ifp; 1283 1284 /* 1285 * Fix up requested capabilities w/ supported capabilities, 1286 * since the supported capabilities could have been changed. 1287 */ 1288 ifr->ifr_reqcap &= ifp->if_capabilities; 1289 /* Pass SIOCSIFCAP to VF. */ 1290 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1291 1292 /* 1293 * NOTE: 1294 * The error will be propagated to the callers, however, it 1295 * is _not_ useful here. 1296 */ 1297 1298 /* 1299 * Merge VF's enabled capabilities. 
1300 */ 1301 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1302 1303 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1304 if (ifp->if_capenable & IFCAP_TXCSUM) 1305 ifp->if_hwassist |= tmp; 1306 else 1307 ifp->if_hwassist &= ~tmp; 1308 1309 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1310 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1311 ifp->if_hwassist |= tmp; 1312 else 1313 ifp->if_hwassist &= ~tmp; 1314 1315 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1316 if (ifp->if_capenable & IFCAP_TSO4) 1317 ifp->if_hwassist |= tmp; 1318 else 1319 ifp->if_hwassist &= ~tmp; 1320 1321 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1322 if (ifp->if_capenable & IFCAP_TSO6) 1323 ifp->if_hwassist |= tmp; 1324 else 1325 ifp->if_hwassist &= ~tmp; 1326 1327 return (error); 1328 } 1329 1330 static int 1331 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1332 { 1333 struct ifnet *vf_ifp; 1334 struct ifreq ifr; 1335 1336 HN_LOCK_ASSERT(sc); 1337 vf_ifp = sc->hn_vf_ifp; 1338 1339 memset(&ifr, 0, sizeof(ifr)); 1340 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1341 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1342 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1343 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1344 } 1345 1346 static void 1347 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1348 { 1349 struct ifnet *ifp = sc->hn_ifp; 1350 int allmulti = 0; 1351 1352 HN_LOCK_ASSERT(sc); 1353 1354 /* XXX vlan(4) style mcast addr maintenance */ 1355 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 1356 allmulti = IFF_ALLMULTI; 1357 1358 /* Always set the VF's if_flags */ 1359 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1360 } 1361 1362 static void 1363 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1364 { 1365 struct rm_priotracker pt; 1366 struct ifnet *hn_ifp = NULL; 1367 struct mbuf *mn; 1368 1369 /* 1370 * XXX racy, if hn(4) ever detached. 1371 */ 1372 rm_rlock(&hn_vfmap_lock, &pt); 1373 if (vf_ifp->if_index < hn_vfmap_size) 1374 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1375 rm_runlock(&hn_vfmap_lock, &pt); 1376 1377 if (hn_ifp != NULL) { 1378 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1379 /* 1380 * Allow tapping on the VF. 1381 */ 1382 ETHER_BPF_MTAP(vf_ifp, mn); 1383 1384 /* 1385 * Update VF stats. 1386 */ 1387 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1388 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1389 mn->m_pkthdr.len); 1390 } 1391 /* 1392 * XXX IFCOUNTER_IMCAST 1393 * This stat updating is kinda invasive, since it 1394 * requires two checks on the mbuf: the length check 1395 * and the ethernet header check. As of this write, 1396 * all multicast packets go directly to hn(4), which 1397 * makes imcast stat updating in the VF a try in vian. 1398 */ 1399 1400 /* 1401 * Fix up rcvif and increase hn(4)'s ipackets. 1402 */ 1403 mn->m_pkthdr.rcvif = hn_ifp; 1404 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1405 } 1406 /* 1407 * Go through hn(4)'s if_input. 1408 */ 1409 hn_ifp->if_input(hn_ifp, m); 1410 } else { 1411 /* 1412 * In the middle of the transition; free this 1413 * mbuf chain. 
1414 */ 1415 while (m != NULL) { 1416 mn = m->m_nextpkt; 1417 m->m_nextpkt = NULL; 1418 m_freem(m); 1419 m = mn; 1420 } 1421 } 1422 } 1423 1424 static void 1425 hn_mtu_change_fixup(struct hn_softc *sc) 1426 { 1427 struct ifnet *ifp; 1428 1429 HN_LOCK_ASSERT(sc); 1430 ifp = sc->hn_ifp; 1431 1432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1433 #if __FreeBSD_version >= 1100099 1434 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1435 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1436 #endif 1437 } 1438 1439 static uint32_t 1440 hn_rss_type_fromndis(uint32_t rss_hash) 1441 { 1442 uint32_t types = 0; 1443 1444 if (rss_hash & NDIS_HASH_IPV4) 1445 types |= RSS_TYPE_IPV4; 1446 if (rss_hash & NDIS_HASH_TCP_IPV4) 1447 types |= RSS_TYPE_TCP_IPV4; 1448 if (rss_hash & NDIS_HASH_IPV6) 1449 types |= RSS_TYPE_IPV6; 1450 if (rss_hash & NDIS_HASH_IPV6_EX) 1451 types |= RSS_TYPE_IPV6_EX; 1452 if (rss_hash & NDIS_HASH_TCP_IPV6) 1453 types |= RSS_TYPE_TCP_IPV6; 1454 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1455 types |= RSS_TYPE_TCP_IPV6_EX; 1456 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1457 types |= RSS_TYPE_UDP_IPV4; 1458 return (types); 1459 } 1460 1461 static uint32_t 1462 hn_rss_type_tondis(uint32_t types) 1463 { 1464 uint32_t rss_hash = 0; 1465 1466 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1467 ("UDP6 and UDP6EX are not supported")); 1468 1469 if (types & RSS_TYPE_IPV4) 1470 rss_hash |= NDIS_HASH_IPV4; 1471 if (types & RSS_TYPE_TCP_IPV4) 1472 rss_hash |= NDIS_HASH_TCP_IPV4; 1473 if (types & RSS_TYPE_IPV6) 1474 rss_hash |= NDIS_HASH_IPV6; 1475 if (types & RSS_TYPE_IPV6_EX) 1476 rss_hash |= NDIS_HASH_IPV6_EX; 1477 if (types & RSS_TYPE_TCP_IPV6) 1478 rss_hash |= NDIS_HASH_TCP_IPV6; 1479 if (types & RSS_TYPE_TCP_IPV6_EX) 1480 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1481 if (types & RSS_TYPE_UDP_IPV4) 1482 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1483 return (rss_hash); 1484 } 1485 1486 static void 1487 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1488 { 1489 int i; 1490 1491 HN_LOCK_ASSERT(sc); 1492 1493 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1494 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1495 } 1496 1497 static void 1498 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1499 { 1500 struct ifnet *ifp, *vf_ifp; 1501 struct ifrsshash ifrh; 1502 struct ifrsskey ifrk; 1503 int error; 1504 uint32_t my_types, diff_types, mbuf_types = 0; 1505 1506 HN_LOCK_ASSERT(sc); 1507 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1508 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1509 1510 if (sc->hn_rx_ring_inuse == 1) { 1511 /* No RSS on synthetic parts; done. */ 1512 return; 1513 } 1514 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1515 /* Synthetic parts do not support Toeplitz; done. */ 1516 return; 1517 } 1518 1519 ifp = sc->hn_ifp; 1520 vf_ifp = sc->hn_vf_ifp; 1521 1522 /* 1523 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1524 * supported. 
1525 */ 1526 memset(&ifrk, 0, sizeof(ifrk)); 1527 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name)); 1528 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk); 1529 if (error) { 1530 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1531 vf_ifp->if_xname, error); 1532 goto done; 1533 } 1534 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1535 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1536 vf_ifp->if_xname, ifrk.ifrk_func); 1537 goto done; 1538 } 1539 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1540 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1541 vf_ifp->if_xname, ifrk.ifrk_keylen); 1542 goto done; 1543 } 1544 1545 /* 1546 * Extract VF's RSS hash. Only Toeplitz is supported. 1547 */ 1548 memset(&ifrh, 0, sizeof(ifrh)); 1549 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name)); 1550 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh); 1551 if (error) { 1552 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", 1553 vf_ifp->if_xname, error); 1554 goto done; 1555 } 1556 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1557 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1558 vf_ifp->if_xname, ifrh.ifrh_func); 1559 goto done; 1560 } 1561 1562 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1563 if ((ifrh.ifrh_types & my_types) == 0) { 1564 /* This disables RSS; ignore it then */ 1565 if_printf(ifp, "%s intersection of RSS types failed. " 1566 "VF %#x, mine %#x\n", vf_ifp->if_xname, 1567 ifrh.ifrh_types, my_types); 1568 goto done; 1569 } 1570 1571 diff_types = my_types ^ ifrh.ifrh_types; 1572 my_types &= ifrh.ifrh_types; 1573 mbuf_types = my_types; 1574 1575 /* 1576 * Detect RSS hash value/type confliction. 1577 * 1578 * NOTE: 1579 * We don't disable the hash type, but stop delivery the hash 1580 * value/type through mbufs on RX path. 1581 * 1582 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1583 * hash is delivered with type of TCP_IPV4. This means if 1584 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1585 * least to hn_mbuf_hash. However, given that _all_ of the 1586 * NICs implement TCP_IPV4, this will _not_ impose any issues 1587 * here. 1588 */ 1589 if ((my_types & RSS_TYPE_IPV4) && 1590 (diff_types & ifrh.ifrh_types & 1591 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1592 /* Conflict; disable IPV4 hash type/value delivery. */ 1593 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1594 mbuf_types &= ~RSS_TYPE_IPV4; 1595 } 1596 if ((my_types & RSS_TYPE_IPV6) && 1597 (diff_types & ifrh.ifrh_types & 1598 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1599 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1600 RSS_TYPE_IPV6_EX))) { 1601 /* Conflict; disable IPV6 hash type/value delivery. */ 1602 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1603 mbuf_types &= ~RSS_TYPE_IPV6; 1604 } 1605 if ((my_types & RSS_TYPE_IPV6_EX) && 1606 (diff_types & ifrh.ifrh_types & 1607 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1608 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1609 RSS_TYPE_IPV6))) { 1610 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1611 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1612 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1613 } 1614 if ((my_types & RSS_TYPE_TCP_IPV6) && 1615 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1616 /* Conflict; disable TCP_IPV6 hash type/value delivery. 
*/ 1617 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1618 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1619 } 1620 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1621 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1622 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1623 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1624 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1625 } 1626 if ((my_types & RSS_TYPE_UDP_IPV6) && 1627 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1628 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1629 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1630 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1631 } 1632 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1633 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1634 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1635 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1636 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1637 } 1638 1639 /* 1640 * Indirect table does not matter. 1641 */ 1642 1643 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1644 hn_rss_type_tondis(my_types); 1645 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1646 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1647 1648 if (reconf) { 1649 error = hn_rss_reconfig(sc); 1650 if (error) { 1651 /* XXX roll-back? */ 1652 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1653 /* XXX keep going. */ 1654 } 1655 } 1656 done: 1657 /* Hash deliverability for mbufs. */ 1658 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1659 } 1660 1661 static void 1662 hn_vf_rss_restore(struct hn_softc *sc) 1663 { 1664 1665 HN_LOCK_ASSERT(sc); 1666 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1667 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1668 1669 if (sc->hn_rx_ring_inuse == 1) 1670 goto done; 1671 1672 /* 1673 * Restore hash types. Key does _not_ matter. 1674 */ 1675 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1676 int error; 1677 1678 sc->hn_rss_hash = sc->hn_rss_hcap; 1679 error = hn_rss_reconfig(sc); 1680 if (error) { 1681 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1682 error); 1683 /* XXX keep going. */ 1684 } 1685 } 1686 done: 1687 /* Hash deliverability for mbufs. */ 1688 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1689 } 1690 1691 static void 1692 hn_xpnt_vf_setready(struct hn_softc *sc) 1693 { 1694 struct ifnet *ifp, *vf_ifp; 1695 struct ifreq ifr; 1696 1697 HN_LOCK_ASSERT(sc); 1698 ifp = sc->hn_ifp; 1699 vf_ifp = sc->hn_vf_ifp; 1700 1701 /* 1702 * Mark the VF ready. 1703 */ 1704 sc->hn_vf_rdytick = 0; 1705 1706 /* 1707 * Save information for restoration. 1708 */ 1709 sc->hn_saved_caps = ifp->if_capabilities; 1710 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1711 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1712 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1713 1714 /* 1715 * Intersect supported/enabled capabilities. 1716 * 1717 * NOTE: 1718 * if_hwassist is not changed here. 1719 */ 1720 ifp->if_capabilities &= vf_ifp->if_capabilities; 1721 ifp->if_capenable &= ifp->if_capabilities; 1722 1723 /* 1724 * Fix TSO settings. 1725 */ 1726 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1727 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1728 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1729 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1730 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1731 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1732 1733 /* 1734 * Change VF's enabled capabilities. 
1735 */ 1736 memset(&ifr, 0, sizeof(ifr)); 1737 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1738 ifr.ifr_reqcap = ifp->if_capenable; 1739 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1740 1741 if (ifp->if_mtu != ETHERMTU) { 1742 int error; 1743 1744 /* 1745 * Change VF's MTU. 1746 */ 1747 memset(&ifr, 0, sizeof(ifr)); 1748 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1749 ifr.ifr_mtu = ifp->if_mtu; 1750 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1751 if (error) { 1752 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1753 vf_ifp->if_xname, ifp->if_mtu); 1754 if (ifp->if_mtu > ETHERMTU) { 1755 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1756 1757 /* 1758 * XXX 1759 * No need to adjust the synthetic parts' MTU; 1760 * failure of the adjustment will cause us 1761 * infinite headache. 1762 */ 1763 ifp->if_mtu = ETHERMTU; 1764 hn_mtu_change_fixup(sc); 1765 } 1766 } 1767 } 1768 } 1769 1770 static bool 1771 hn_xpnt_vf_isready(struct hn_softc *sc) 1772 { 1773 1774 HN_LOCK_ASSERT(sc); 1775 1776 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1777 return (false); 1778 1779 if (sc->hn_vf_rdytick == 0) 1780 return (true); 1781 1782 if (sc->hn_vf_rdytick > ticks) 1783 return (false); 1784 1785 /* Mark VF as ready. */ 1786 hn_xpnt_vf_setready(sc); 1787 return (true); 1788 } 1789 1790 static void 1791 hn_xpnt_vf_setenable(struct hn_softc *sc) 1792 { 1793 int i; 1794 1795 HN_LOCK_ASSERT(sc); 1796 1797 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1798 rm_wlock(&sc->hn_vf_lock); 1799 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1800 rm_wunlock(&sc->hn_vf_lock); 1801 1802 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1803 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1804 } 1805 1806 static void 1807 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1808 { 1809 int i; 1810 1811 HN_LOCK_ASSERT(sc); 1812 1813 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1814 rm_wlock(&sc->hn_vf_lock); 1815 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1816 if (clear_vf) 1817 sc->hn_vf_ifp = NULL; 1818 rm_wunlock(&sc->hn_vf_lock); 1819 1820 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1821 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1822 } 1823 1824 static void 1825 hn_xpnt_vf_init(struct hn_softc *sc) 1826 { 1827 int error; 1828 1829 HN_LOCK_ASSERT(sc); 1830 1831 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1832 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1833 1834 if (bootverbose) { 1835 if_printf(sc->hn_ifp, "try bringing up %s\n", 1836 sc->hn_vf_ifp->if_xname); 1837 } 1838 1839 /* 1840 * Bring the VF up. 1841 */ 1842 hn_xpnt_vf_saveifflags(sc); 1843 sc->hn_vf_ifp->if_flags |= IFF_UP; 1844 error = hn_xpnt_vf_iocsetflags(sc); 1845 if (error) { 1846 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1847 sc->hn_vf_ifp->if_xname, error); 1848 return; 1849 } 1850 1851 /* 1852 * NOTE: 1853 * Datapath setting must happen _after_ bringing the VF up. 1854 */ 1855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1856 1857 /* 1858 * NOTE: 1859 * Fixup RSS related bits _after_ the VF is brought up, since 1860 * many VFs generate RSS key during it's initialization. 1861 */ 1862 hn_vf_rss_fixup(sc, true); 1863 1864 /* Mark transparent mode VF as enabled. 
*/ 1865 hn_xpnt_vf_setenable(sc); 1866 } 1867 1868 static void 1869 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1870 { 1871 struct hn_softc *sc = xsc; 1872 1873 HN_LOCK(sc); 1874 1875 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1876 goto done; 1877 if (sc->hn_vf_ifp == NULL) 1878 goto done; 1879 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1880 goto done; 1881 1882 if (sc->hn_vf_rdytick != 0) { 1883 /* Mark VF as ready. */ 1884 hn_xpnt_vf_setready(sc); 1885 } 1886 1887 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1888 /* 1889 * Delayed VF initialization. 1890 */ 1891 if (bootverbose) { 1892 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1893 sc->hn_vf_ifp->if_xname); 1894 } 1895 hn_xpnt_vf_init(sc); 1896 } 1897 done: 1898 HN_UNLOCK(sc); 1899 } 1900 1901 static void 1902 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1903 { 1904 struct hn_softc *sc = xsc; 1905 1906 HN_LOCK(sc); 1907 1908 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1909 goto done; 1910 1911 if (!hn_ismyvf(sc, ifp)) 1912 goto done; 1913 1914 if (sc->hn_vf_ifp != NULL) { 1915 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1916 sc->hn_vf_ifp->if_xname); 1917 goto done; 1918 } 1919 1920 if (hn_xpnt_vf && ifp->if_start != NULL) { 1921 /* 1922 * ifnet.if_start is _not_ supported by transparent 1923 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1924 */ 1925 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1926 "in transparent VF mode.\n", ifp->if_xname); 1927 goto done; 1928 } 1929 1930 rm_wlock(&hn_vfmap_lock); 1931 1932 if (ifp->if_index >= hn_vfmap_size) { 1933 struct ifnet **newmap; 1934 int newsize; 1935 1936 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1937 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1938 M_WAITOK | M_ZERO); 1939 1940 memcpy(newmap, hn_vfmap, 1941 sizeof(struct ifnet *) * hn_vfmap_size); 1942 free(hn_vfmap, M_DEVBUF); 1943 hn_vfmap = newmap; 1944 hn_vfmap_size = newsize; 1945 } 1946 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1947 ("%s: ifindex %d was mapped to %s", 1948 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1949 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1950 1951 rm_wunlock(&hn_vfmap_lock); 1952 1953 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1954 rm_wlock(&sc->hn_vf_lock); 1955 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1956 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1957 sc->hn_vf_ifp = ifp; 1958 rm_wunlock(&sc->hn_vf_lock); 1959 1960 if (hn_xpnt_vf) { 1961 int wait_ticks; 1962 1963 /* 1964 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1965 * Save vf_ifp's current if_input for later restoration. 1966 */ 1967 sc->hn_vf_input = ifp->if_input; 1968 ifp->if_input = hn_xpnt_vf_input; 1969 1970 /* 1971 * Stop link status management; use the VF's. 1972 */ 1973 hn_suspend_mgmt(sc); 1974 1975 /* 1976 * Give VF sometime to complete its attach routing. 1977 */ 1978 wait_ticks = hn_xpnt_vf_attwait * hz; 1979 sc->hn_vf_rdytick = ticks + wait_ticks; 1980 1981 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1982 wait_ticks); 1983 } 1984 done: 1985 HN_UNLOCK(sc); 1986 } 1987 1988 static void 1989 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1990 { 1991 struct hn_softc *sc = xsc; 1992 1993 HN_LOCK(sc); 1994 1995 if (sc->hn_vf_ifp == NULL) 1996 goto done; 1997 1998 if (!hn_ismyvf(sc, ifp)) 1999 goto done; 2000 2001 if (hn_xpnt_vf) { 2002 /* 2003 * Make sure that the delayed initialization is not running. 
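 * (the hn_vf_init timeout task is drained right below)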
2004 * 2005 * NOTE: 2006 * - This lock _must_ be released, since the hn_vf_init task 2007 * will try holding this lock. 2008 * - It is safe to release this lock here, since the 2009 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2010 * 2011 * XXX racy, if hn(4) ever detached. 2012 */ 2013 HN_UNLOCK(sc); 2014 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2015 HN_LOCK(sc); 2016 2017 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2018 sc->hn_ifp->if_xname)); 2019 ifp->if_input = sc->hn_vf_input; 2020 sc->hn_vf_input = NULL; 2021 2022 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2023 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2024 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2025 2026 if (sc->hn_vf_rdytick == 0) { 2027 /* 2028 * The VF was ready; restore some settings. 2029 */ 2030 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2031 /* 2032 * NOTE: 2033 * There is _no_ need to fixup if_capenable and 2034 * if_hwassist, since the if_capabilities before 2035 * restoration was an intersection of the VF's 2036 * if_capabilites and the synthetic device's 2037 * if_capabilites. 2038 */ 2039 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2040 sc->hn_ifp->if_hw_tsomaxsegcount = 2041 sc->hn_saved_tsosegcnt; 2042 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2043 } 2044 2045 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2046 /* 2047 * Restore RSS settings. 2048 */ 2049 hn_vf_rss_restore(sc); 2050 2051 /* 2052 * Resume link status management, which was suspended 2053 * by hn_ifnet_attevent(). 2054 */ 2055 hn_resume_mgmt(sc); 2056 } 2057 } 2058 2059 /* Mark transparent mode VF as disabled. */ 2060 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2061 2062 rm_wlock(&hn_vfmap_lock); 2063 2064 KASSERT(ifp->if_index < hn_vfmap_size, 2065 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2066 if (hn_vfmap[ifp->if_index] != NULL) { 2067 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2068 ("%s: ifindex %d was mapped to %s", 2069 ifp->if_xname, ifp->if_index, 2070 hn_vfmap[ifp->if_index]->if_xname)); 2071 hn_vfmap[ifp->if_index] = NULL; 2072 } 2073 2074 rm_wunlock(&hn_vfmap_lock); 2075 done: 2076 HN_UNLOCK(sc); 2077 } 2078 2079 static void 2080 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2081 { 2082 struct hn_softc *sc = xsc; 2083 2084 if (sc->hn_vf_ifp == ifp) 2085 if_link_state_change(sc->hn_ifp, link_state); 2086 } 2087 2088 static int 2089 hn_probe(device_t dev) 2090 { 2091 2092 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2093 device_set_desc(dev, "Hyper-V Network Interface"); 2094 return BUS_PROBE_DEFAULT; 2095 } 2096 return ENXIO; 2097 } 2098 2099 static int 2100 hn_attach(device_t dev) 2101 { 2102 struct hn_softc *sc = device_get_softc(dev); 2103 struct sysctl_oid_list *child; 2104 struct sysctl_ctx_list *ctx; 2105 uint8_t eaddr[ETHER_ADDR_LEN]; 2106 struct ifnet *ifp = NULL; 2107 int error, ring_cnt, tx_ring_cnt; 2108 uint32_t mtu; 2109 2110 sc->hn_dev = dev; 2111 sc->hn_prichan = vmbus_get_channel(dev); 2112 HN_LOCK_INIT(sc); 2113 rm_init(&sc->hn_vf_lock, "hnvf"); 2114 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2115 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2116 2117 /* 2118 * Initialize these tunables once. 2119 */ 2120 sc->hn_agg_size = hn_tx_agg_size; 2121 sc->hn_agg_pkts = hn_tx_agg_pkts; 2122 2123 /* 2124 * Setup taskqueue for transmission. 
2125 */ 2126 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2127 int i; 2128 2129 sc->hn_tx_taskqs = 2130 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2131 M_DEVBUF, M_WAITOK); 2132 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2133 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2134 M_WAITOK, taskqueue_thread_enqueue, 2135 &sc->hn_tx_taskqs[i]); 2136 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2137 "%s tx%d", device_get_nameunit(dev), i); 2138 } 2139 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2140 sc->hn_tx_taskqs = hn_tx_taskque; 2141 } 2142 2143 /* 2144 * Setup taskqueue for mangement tasks, e.g. link status. 2145 */ 2146 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2147 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2148 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2149 device_get_nameunit(dev)); 2150 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2151 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2152 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2153 hn_netchg_status_taskfunc, sc); 2154 2155 if (hn_xpnt_vf) { 2156 /* 2157 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2158 */ 2159 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2160 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2161 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2162 device_get_nameunit(dev)); 2163 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2164 hn_xpnt_vf_init_taskfunc, sc); 2165 } 2166 2167 /* 2168 * Allocate ifnet and setup its name earlier, so that if_printf 2169 * can be used by functions, which will be called after 2170 * ether_ifattach(). 2171 */ 2172 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2173 ifp->if_softc = sc; 2174 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2175 2176 /* 2177 * Initialize ifmedia earlier so that it can be unconditionally 2178 * destroyed, if error happened later on. 2179 */ 2180 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2181 2182 /* 2183 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2184 * to use (tx_ring_cnt). 2185 * 2186 * NOTE: 2187 * The # of RX rings to use is same as the # of channels to use. 2188 */ 2189 ring_cnt = hn_chan_cnt; 2190 if (ring_cnt <= 0) { 2191 /* Default */ 2192 ring_cnt = mp_ncpus; 2193 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2194 ring_cnt = HN_RING_CNT_DEF_MAX; 2195 } else if (ring_cnt > mp_ncpus) { 2196 ring_cnt = mp_ncpus; 2197 } 2198 #ifdef RSS 2199 if (ring_cnt > rss_getnumbuckets()) 2200 ring_cnt = rss_getnumbuckets(); 2201 #endif 2202 2203 tx_ring_cnt = hn_tx_ring_cnt; 2204 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2205 tx_ring_cnt = ring_cnt; 2206 #ifdef HN_IFSTART_SUPPORT 2207 if (hn_use_if_start) { 2208 /* ifnet.if_start only needs one TX ring. */ 2209 tx_ring_cnt = 1; 2210 } 2211 #endif 2212 2213 /* 2214 * Set the leader CPU for channels. 2215 */ 2216 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2217 2218 /* 2219 * Create enough TX/RX rings, even if only limited number of 2220 * channels can be allocated. 2221 */ 2222 error = hn_create_tx_data(sc, tx_ring_cnt); 2223 if (error) 2224 goto failed; 2225 error = hn_create_rx_data(sc, ring_cnt); 2226 if (error) 2227 goto failed; 2228 2229 /* 2230 * Create transaction context for NVS and RNDIS transactions. 
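 * The xact context is used for request/response style control
 * messages; its request and response buffers are sized by
 * HN_XACT_REQ_SIZE and HN_XACT_RESP_SIZE.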
2231 */ 2232 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2233 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2234 if (sc->hn_xact == NULL) { 2235 error = ENXIO; 2236 goto failed; 2237 } 2238 2239 /* 2240 * Install orphan handler for the revocation of this device's 2241 * primary channel. 2242 * 2243 * NOTE: 2244 * The processing order is critical here: 2245 * Install the orphan handler, _before_ testing whether this 2246 * device's primary channel has been revoked or not. 2247 */ 2248 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2249 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2250 error = ENXIO; 2251 goto failed; 2252 } 2253 2254 /* 2255 * Attach the synthetic parts, i.e. NVS and RNDIS. 2256 */ 2257 error = hn_synth_attach(sc, ETHERMTU); 2258 if (error) 2259 goto failed; 2260 2261 error = hn_rndis_get_eaddr(sc, eaddr); 2262 if (error) 2263 goto failed; 2264 2265 error = hn_rndis_get_mtu(sc, &mtu); 2266 if (error) 2267 mtu = ETHERMTU; 2268 else if (bootverbose) 2269 device_printf(dev, "RNDIS mtu %u\n", mtu); 2270 2271 #if __FreeBSD_version >= 1100099 2272 if (sc->hn_rx_ring_inuse > 1) { 2273 /* 2274 * Reduce TCP segment aggregation limit for multiple 2275 * RX rings to increase ACK timeliness. 2276 */ 2277 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2278 } 2279 #endif 2280 2281 /* 2282 * Fixup TX/RX stuffs after synthetic parts are attached. 2283 */ 2284 hn_fixup_tx_data(sc); 2285 hn_fixup_rx_data(sc); 2286 2287 ctx = device_get_sysctl_ctx(dev); 2288 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2290 &sc->hn_nvs_ver, 0, "NVS version"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2293 hn_ndis_version_sysctl, "A", "NDIS version"); 2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2296 hn_caps_sysctl, "A", "capabilities"); 2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2299 hn_hwassist_sysctl, "A", "hwassist"); 2300 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2301 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2303 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2304 "max # of TSO segments"); 2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2306 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2307 "max size of TSO segment"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2309 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2310 hn_rxfilter_sysctl, "A", "rxfilter"); 2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2312 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2313 hn_rss_hash_sysctl, "A", "RSS hash"); 2314 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2315 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2316 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2319 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2320 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2321 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2322 #ifndef RSS 2323 /* 2324 * Don't allow RSS key/indirect table changes, if RSS is defined. 
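 * i.e. the rss_key and rss_ind sysctls below are only created when
 * the kernel is built without "options RSS".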
2325 */ 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2327 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_rss_key_sysctl, "IU", "RSS key"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2330 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2332 #endif 2333 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2334 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2335 "RNDIS offered packet transmission aggregation size limit"); 2336 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2337 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2338 "RNDIS offered packet transmission aggregation count limit"); 2339 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2340 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2341 "RNDIS packet transmission aggregation alignment"); 2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2343 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2344 hn_txagg_size_sysctl, "I", 2345 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2348 hn_txagg_pkts_sysctl, "I", 2349 "Packet transmission aggregation packets, " 2350 "0 -- disable, -1 -- auto"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2352 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_polling_sysctl, "I", 2354 "Polling frequency: [100,1000000], 0 disable polling"); 2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2356 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2357 hn_vf_sysctl, "A", "Virtual Function's name"); 2358 if (!hn_xpnt_vf) { 2359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2360 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2361 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2362 } else { 2363 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2364 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2365 hn_xpnt_vf_enabled_sysctl, "I", 2366 "Transparent VF enabled"); 2367 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2368 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2369 hn_xpnt_vf_accbpf_sysctl, "I", 2370 "Accurate BPF for transparent VF"); 2371 } 2372 2373 /* 2374 * Setup the ifmedia, which has been initialized earlier. 2375 */ 2376 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2377 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2378 /* XXX ifmedia_set really should do this for us */ 2379 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2380 2381 /* 2382 * Setup the ifnet for this interface. 2383 */ 2384 2385 ifp->if_baudrate = IF_Gbps(10); 2386 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2387 ifp->if_ioctl = hn_ioctl; 2388 ifp->if_init = hn_init; 2389 #ifdef HN_IFSTART_SUPPORT 2390 if (hn_use_if_start) { 2391 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2392 2393 ifp->if_start = hn_start; 2394 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2395 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2396 IFQ_SET_READY(&ifp->if_snd); 2397 } else 2398 #endif 2399 { 2400 ifp->if_transmit = hn_transmit; 2401 ifp->if_qflush = hn_xmit_qflush; 2402 } 2403 2404 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2405 #ifdef foo 2406 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2407 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2408 #endif 2409 if (sc->hn_caps & HN_CAP_VLAN) { 2410 /* XXX not sure about VLAN_MTU. 
*/ 2411 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2412 } 2413 2414 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2415 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2416 ifp->if_capabilities |= IFCAP_TXCSUM; 2417 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2418 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2419 if (sc->hn_caps & HN_CAP_TSO4) { 2420 ifp->if_capabilities |= IFCAP_TSO4; 2421 ifp->if_hwassist |= CSUM_IP_TSO; 2422 } 2423 if (sc->hn_caps & HN_CAP_TSO6) { 2424 ifp->if_capabilities |= IFCAP_TSO6; 2425 ifp->if_hwassist |= CSUM_IP6_TSO; 2426 } 2427 2428 /* Enable all available capabilities by default. */ 2429 ifp->if_capenable = ifp->if_capabilities; 2430 2431 /* 2432 * Disable IPv6 TSO and TXCSUM by default, they still can 2433 * be enabled through SIOCSIFCAP. 2434 */ 2435 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2436 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2437 2438 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2439 /* 2440 * Lock hn_set_tso_maxsize() to simplify its 2441 * internal logic. 2442 */ 2443 HN_LOCK(sc); 2444 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2445 HN_UNLOCK(sc); 2446 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2447 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2448 } 2449 2450 ether_ifattach(ifp, eaddr); 2451 2452 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2453 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2454 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2455 } 2456 if (mtu < ETHERMTU) { 2457 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2458 ifp->if_mtu = mtu; 2459 } 2460 2461 /* Inform the upper layer about the long frame support. */ 2462 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2463 2464 /* 2465 * Kick off link status check. 2466 */ 2467 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2468 hn_update_link_status(sc); 2469 2470 if (!hn_xpnt_vf) { 2471 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2472 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2473 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2474 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2475 } else { 2476 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2477 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2478 } 2479 2480 /* 2481 * NOTE: 2482 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2483 * since interface's LLADDR is needed; interface LLADDR is not 2484 * available when ifnet_arrival event is triggered. 2485 */ 2486 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2487 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2488 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2489 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2490 2491 return (0); 2492 failed: 2493 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2494 hn_synth_detach(sc); 2495 hn_detach(dev); 2496 return (error); 2497 } 2498 2499 static int 2500 hn_detach(device_t dev) 2501 { 2502 struct hn_softc *sc = device_get_softc(dev); 2503 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2504 2505 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2506 /* 2507 * In case that the vmbus missed the orphan handler 2508 * installation. 
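 * Orphaning the xact context here aborts any transaction that may
 * still be waiting for a reply on the revoked primary channel, so
 * detach does not block on it.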
2509 */ 2510 vmbus_xact_ctx_orphan(sc->hn_xact); 2511 } 2512 2513 if (sc->hn_ifaddr_evthand != NULL) 2514 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2515 if (sc->hn_ifnet_evthand != NULL) 2516 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2517 if (sc->hn_ifnet_atthand != NULL) { 2518 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2519 sc->hn_ifnet_atthand); 2520 } 2521 if (sc->hn_ifnet_dethand != NULL) { 2522 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2523 sc->hn_ifnet_dethand); 2524 } 2525 if (sc->hn_ifnet_lnkhand != NULL) 2526 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2527 2528 vf_ifp = sc->hn_vf_ifp; 2529 __compiler_membar(); 2530 if (vf_ifp != NULL) 2531 hn_ifnet_detevent(sc, vf_ifp); 2532 2533 if (device_is_attached(dev)) { 2534 HN_LOCK(sc); 2535 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2536 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2537 hn_stop(sc, true); 2538 /* 2539 * NOTE: 2540 * hn_stop() only suspends data, so managment 2541 * stuffs have to be suspended manually here. 2542 */ 2543 hn_suspend_mgmt(sc); 2544 hn_synth_detach(sc); 2545 } 2546 HN_UNLOCK(sc); 2547 ether_ifdetach(ifp); 2548 } 2549 2550 ifmedia_removeall(&sc->hn_media); 2551 hn_destroy_rx_data(sc); 2552 hn_destroy_tx_data(sc); 2553 2554 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2555 int i; 2556 2557 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2558 taskqueue_free(sc->hn_tx_taskqs[i]); 2559 free(sc->hn_tx_taskqs, M_DEVBUF); 2560 } 2561 taskqueue_free(sc->hn_mgmt_taskq0); 2562 if (sc->hn_vf_taskq != NULL) 2563 taskqueue_free(sc->hn_vf_taskq); 2564 2565 if (sc->hn_xact != NULL) { 2566 /* 2567 * Uninstall the orphan handler _before_ the xact is 2568 * destructed. 2569 */ 2570 vmbus_chan_unset_orphan(sc->hn_prichan); 2571 vmbus_xact_ctx_destroy(sc->hn_xact); 2572 } 2573 2574 if_free(ifp); 2575 2576 HN_LOCK_DESTROY(sc); 2577 rm_destroy(&sc->hn_vf_lock); 2578 return (0); 2579 } 2580 2581 static int 2582 hn_shutdown(device_t dev) 2583 { 2584 2585 return (0); 2586 } 2587 2588 static void 2589 hn_link_status(struct hn_softc *sc) 2590 { 2591 uint32_t link_status; 2592 int error; 2593 2594 error = hn_rndis_get_linkstatus(sc, &link_status); 2595 if (error) { 2596 /* XXX what to do? */ 2597 return; 2598 } 2599 2600 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2601 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2602 else 2603 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2604 if_link_state_change(sc->hn_ifp, 2605 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2606 LINK_STATE_UP : LINK_STATE_DOWN); 2607 } 2608 2609 static void 2610 hn_link_taskfunc(void *xsc, int pending __unused) 2611 { 2612 struct hn_softc *sc = xsc; 2613 2614 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2615 return; 2616 hn_link_status(sc); 2617 } 2618 2619 static void 2620 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2621 { 2622 struct hn_softc *sc = xsc; 2623 2624 /* Prevent any link status checks from running. */ 2625 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2626 2627 /* 2628 * Fake up a [link down --> link up] state change; 5 seconds 2629 * delay is used, which closely simulates miibus reaction 2630 * upon link down event. 
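 * The link is forced down immediately below; hn_netchg_status_taskfunc()
 * re-checks the real link state and clears HN_LINK_FLAG_NETCHG once
 * the delay has expired.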
2631 */ 2632 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2633 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2634 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2635 &sc->hn_netchg_status, 5 * hz); 2636 } 2637 2638 static void 2639 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2640 { 2641 struct hn_softc *sc = xsc; 2642 2643 /* Re-allow link status checks. */ 2644 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2645 hn_link_status(sc); 2646 } 2647 2648 static void 2649 hn_update_link_status(struct hn_softc *sc) 2650 { 2651 2652 if (sc->hn_mgmt_taskq != NULL) 2653 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2654 } 2655 2656 static void 2657 hn_change_network(struct hn_softc *sc) 2658 { 2659 2660 if (sc->hn_mgmt_taskq != NULL) 2661 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2662 } 2663 2664 static __inline int 2665 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2666 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2667 { 2668 struct mbuf *m = *m_head; 2669 int error; 2670 2671 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2672 2673 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2674 m, segs, nsegs, BUS_DMA_NOWAIT); 2675 if (error == EFBIG) { 2676 struct mbuf *m_new; 2677 2678 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2679 if (m_new == NULL) 2680 return ENOBUFS; 2681 else 2682 *m_head = m = m_new; 2683 txr->hn_tx_collapsed++; 2684 2685 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2686 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2687 } 2688 if (!error) { 2689 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2690 BUS_DMASYNC_PREWRITE); 2691 txd->flags |= HN_TXD_FLAG_DMAMAP; 2692 } 2693 return error; 2694 } 2695 2696 static __inline int 2697 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2698 { 2699 2700 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2701 ("put an onlist txd %#x", txd->flags)); 2702 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2703 ("put an onagg txd %#x", txd->flags)); 2704 2705 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2706 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2707 return 0; 2708 2709 if (!STAILQ_EMPTY(&txd->agg_list)) { 2710 struct hn_txdesc *tmp_txd; 2711 2712 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2713 int freed; 2714 2715 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2716 ("resursive aggregation on aggregated txdesc")); 2717 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2718 ("not aggregated txdesc")); 2719 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2720 ("aggregated txdesc uses dmamap")); 2721 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2722 ("aggregated txdesc consumes " 2723 "chimney sending buffer")); 2724 KASSERT(tmp_txd->chim_size == 0, 2725 ("aggregated txdesc has non-zero " 2726 "chimney sending size")); 2727 2728 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2729 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2730 freed = hn_txdesc_put(txr, tmp_txd); 2731 KASSERT(freed, ("failed to free aggregated txdesc")); 2732 } 2733 } 2734 2735 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2736 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2737 ("chim txd uses dmamap")); 2738 hn_chim_free(txr->hn_sc, txd->chim_index); 2739 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2740 txd->chim_size = 0; 2741 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2742 bus_dmamap_sync(txr->hn_tx_data_dtag, 2743 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2744 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2745 txd->data_dmap); 2746 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2747 } 2748 2749 if (txd->m != NULL) { 2750 m_freem(txd->m); 2751 txd->m = NULL; 2752 } 2753 2754 txd->flags |= HN_TXD_FLAG_ONLIST; 2755 #ifndef HN_USE_TXDESC_BUFRING 2756 mtx_lock_spin(&txr->hn_txlist_spin); 2757 KASSERT(txr->hn_txdesc_avail >= 0 && 2758 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2759 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2760 txr->hn_txdesc_avail++; 2761 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2762 mtx_unlock_spin(&txr->hn_txlist_spin); 2763 #else /* HN_USE_TXDESC_BUFRING */ 2764 #ifdef HN_DEBUG 2765 atomic_add_int(&txr->hn_txdesc_avail, 1); 2766 #endif 2767 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2768 #endif /* !HN_USE_TXDESC_BUFRING */ 2769 2770 return 1; 2771 } 2772 2773 static __inline struct hn_txdesc * 2774 hn_txdesc_get(struct hn_tx_ring *txr) 2775 { 2776 struct hn_txdesc *txd; 2777 2778 #ifndef HN_USE_TXDESC_BUFRING 2779 mtx_lock_spin(&txr->hn_txlist_spin); 2780 txd = SLIST_FIRST(&txr->hn_txlist); 2781 if (txd != NULL) { 2782 KASSERT(txr->hn_txdesc_avail > 0, 2783 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2784 txr->hn_txdesc_avail--; 2785 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2786 } 2787 mtx_unlock_spin(&txr->hn_txlist_spin); 2788 #else 2789 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2790 #endif 2791 2792 if (txd != NULL) { 2793 #ifdef HN_USE_TXDESC_BUFRING 2794 #ifdef HN_DEBUG 2795 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2796 #endif 2797 #endif /* HN_USE_TXDESC_BUFRING */ 2798 KASSERT(txd->m == NULL && txd->refs == 0 && 2799 STAILQ_EMPTY(&txd->agg_list) && 2800 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2801 txd->chim_size == 0 && 2802 (txd->flags & HN_TXD_FLAG_ONLIST) && 2803 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2804 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2805 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2806 txd->refs = 1; 2807 } 2808 return txd; 2809 } 2810 2811 static __inline void 2812 hn_txdesc_hold(struct hn_txdesc *txd) 2813 { 2814 2815 /* 0->1 transition will never work */ 2816 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2817 atomic_add_int(&txd->refs, 1); 2818 } 2819 2820 static __inline void 2821 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2822 { 2823 2824 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2825 ("recursive aggregation on aggregating txdesc")); 2826 2827 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2828 ("already aggregated")); 2829 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2830 ("recursive aggregation on to-be-aggregated txdesc")); 2831 2832 txd->flags |= HN_TXD_FLAG_ONAGG; 2833 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2834 } 2835 2836 static bool 2837 hn_tx_ring_pending(struct hn_tx_ring *txr) 2838 { 2839 bool pending = false; 2840 2841 #ifndef HN_USE_TXDESC_BUFRING 2842 mtx_lock_spin(&txr->hn_txlist_spin); 2843 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2844 pending = true; 2845 mtx_unlock_spin(&txr->hn_txlist_spin); 2846 #else 2847 if (!buf_ring_full(txr->hn_txdesc_br)) 2848 pending = true; 2849 #endif 2850 return (pending); 2851 } 2852 2853 static __inline void 2854 hn_txeof(struct hn_tx_ring *txr) 2855 { 2856 txr->hn_has_txeof = 0; 2857 txr->hn_txeof(txr); 2858 } 2859 2860 static void 2861 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2862 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2863 { 2864 struct hn_txdesc *txd = sndc->hn_cbarg; 2865 struct 
hn_tx_ring *txr; 2866 2867 txr = txd->txr; 2868 KASSERT(txr->hn_chan == chan, 2869 ("channel mismatch, on chan%u, should be chan%u", 2870 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2871 2872 txr->hn_has_txeof = 1; 2873 hn_txdesc_put(txr, txd); 2874 2875 ++txr->hn_txdone_cnt; 2876 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2877 txr->hn_txdone_cnt = 0; 2878 if (txr->hn_oactive) 2879 hn_txeof(txr); 2880 } 2881 } 2882 2883 static void 2884 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2885 { 2886 #if defined(INET) || defined(INET6) 2887 struct epoch_tracker et; 2888 2889 NET_EPOCH_ENTER(et); 2890 tcp_lro_flush_all(&rxr->hn_lro); 2891 NET_EPOCH_EXIT(et); 2892 #endif 2893 2894 /* 2895 * NOTE: 2896 * 'txr' could be NULL, if multiple channels and 2897 * ifnet.if_start method are enabled. 2898 */ 2899 if (txr == NULL || !txr->hn_has_txeof) 2900 return; 2901 2902 txr->hn_txdone_cnt = 0; 2903 hn_txeof(txr); 2904 } 2905 2906 static __inline uint32_t 2907 hn_rndis_pktmsg_offset(uint32_t ofs) 2908 { 2909 2910 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2911 ("invalid RNDIS packet msg offset %u", ofs)); 2912 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2913 } 2914 2915 static __inline void * 2916 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2917 size_t pi_dlen, uint32_t pi_type) 2918 { 2919 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2920 struct rndis_pktinfo *pi; 2921 2922 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2923 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2924 2925 /* 2926 * Per-packet-info does not move; it only grows. 2927 * 2928 * NOTE: 2929 * rm_pktinfooffset in this phase counts from the beginning 2930 * of rndis_packet_msg. 2931 */ 2932 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2933 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2934 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2935 pkt->rm_pktinfolen); 2936 pkt->rm_pktinfolen += pi_size; 2937 2938 pi->rm_size = pi_size; 2939 pi->rm_type = pi_type; 2940 pi->rm_internal = 0; 2941 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2942 2943 return (pi->rm_data); 2944 } 2945 2946 static __inline int 2947 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2948 { 2949 struct hn_txdesc *txd; 2950 struct mbuf *m; 2951 int error, pkts; 2952 2953 txd = txr->hn_agg_txd; 2954 KASSERT(txd != NULL, ("no aggregate txdesc")); 2955 2956 /* 2957 * Since hn_txpkt() will reset this temporary stat, save 2958 * it now, so that oerrors can be updated properly, if 2959 * hn_txpkt() ever fails. 2960 */ 2961 pkts = txr->hn_stat_pkts; 2962 2963 /* 2964 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2965 * failure, save it for later freeing, if hn_txpkt() ever 2966 * fails. 2967 */ 2968 m = txd->m; 2969 error = hn_txpkt(ifp, txr, txd); 2970 if (__predict_false(error)) { 2971 /* txd is freed, but m is not. */ 2972 m_freem(m); 2973 2974 txr->hn_flush_failed++; 2975 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2976 } 2977 2978 /* Reset all aggregation states. 
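 * The next packet will either start a new aggregation in
 * hn_try_txagg() or be transmitted on its own.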
*/ 2979 txr->hn_agg_txd = NULL; 2980 txr->hn_agg_szleft = 0; 2981 txr->hn_agg_pktleft = 0; 2982 txr->hn_agg_prevpkt = NULL; 2983 2984 return (error); 2985 } 2986 2987 static void * 2988 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2989 int pktsize) 2990 { 2991 void *chim; 2992 2993 if (txr->hn_agg_txd != NULL) { 2994 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2995 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2996 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2997 int olen; 2998 2999 /* 3000 * Update the previous RNDIS packet's total length, 3001 * it can be increased due to the mandatory alignment 3002 * padding for this RNDIS packet. And update the 3003 * aggregating txdesc's chimney sending buffer size 3004 * accordingly. 3005 * 3006 * XXX 3007 * Zero-out the padding, as required by the RNDIS spec. 3008 */ 3009 olen = pkt->rm_len; 3010 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3011 agg_txd->chim_size += pkt->rm_len - olen; 3012 3013 /* Link this txdesc to the parent. */ 3014 hn_txdesc_agg(agg_txd, txd); 3015 3016 chim = (uint8_t *)pkt + pkt->rm_len; 3017 /* Save the current packet for later fixup. */ 3018 txr->hn_agg_prevpkt = chim; 3019 3020 txr->hn_agg_pktleft--; 3021 txr->hn_agg_szleft -= pktsize; 3022 if (txr->hn_agg_szleft <= 3023 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3024 /* 3025 * Probably can't aggregate more packets, 3026 * flush this aggregating txdesc proactively. 3027 */ 3028 txr->hn_agg_pktleft = 0; 3029 } 3030 /* Done! */ 3031 return (chim); 3032 } 3033 hn_flush_txagg(ifp, txr); 3034 } 3035 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3036 3037 txr->hn_tx_chimney_tried++; 3038 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3039 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3040 return (NULL); 3041 txr->hn_tx_chimney++; 3042 3043 chim = txr->hn_sc->hn_chim + 3044 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3045 3046 if (txr->hn_agg_pktmax > 1 && 3047 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3048 txr->hn_agg_txd = txd; 3049 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3050 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3051 txr->hn_agg_prevpkt = chim; 3052 } 3053 return (chim); 3054 } 3055 3056 /* 3057 * NOTE: 3058 * If this function fails, then both txd and m_head0 will be freed. 3059 */ 3060 static int 3061 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3062 struct mbuf **m_head0) 3063 { 3064 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3065 int error, nsegs, i; 3066 struct mbuf *m_head = *m_head0; 3067 struct rndis_packet_msg *pkt; 3068 uint32_t *pi_data; 3069 void *chim = NULL; 3070 int pkt_hlen, pkt_size; 3071 3072 pkt = txd->rndis_pkt; 3073 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3074 if (pkt_size < txr->hn_chim_size) { 3075 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3076 if (chim != NULL) 3077 pkt = chim; 3078 } else { 3079 if (txr->hn_agg_txd != NULL) 3080 hn_flush_txagg(ifp, txr); 3081 } 3082 3083 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3084 pkt->rm_len = m_head->m_pkthdr.len; 3085 pkt->rm_dataoffset = 0; 3086 pkt->rm_datalen = m_head->m_pkthdr.len; 3087 pkt->rm_oobdataoffset = 0; 3088 pkt->rm_oobdatalen = 0; 3089 pkt->rm_oobdataelements = 0; 3090 pkt->rm_pktinfooffset = sizeof(*pkt); 3091 pkt->rm_pktinfolen = 0; 3092 pkt->rm_vchandle = 0; 3093 pkt->rm_reserved = 0; 3094 3095 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3096 /* 3097 * Set the hash value for this packet. 
3098 */ 3099 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3100 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3101 3102 if (M_HASHTYPE_ISHASH(m_head)) 3103 /* 3104 * The flowid field contains the hash value host 3105 * set in the rx queue if it is a ip forwarding pkt. 3106 * Set the same hash value so host can send on the 3107 * cpu it was received. 3108 */ 3109 *pi_data = m_head->m_pkthdr.flowid; 3110 else 3111 /* 3112 * Otherwise just put the tx queue index. 3113 */ 3114 *pi_data = txr->hn_tx_idx; 3115 } 3116 3117 if (m_head->m_flags & M_VLANTAG) { 3118 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3119 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3120 *pi_data = NDIS_VLAN_INFO_MAKE( 3121 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3122 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3123 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3124 } 3125 3126 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3127 #if defined(INET6) || defined(INET) 3128 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3129 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3130 #ifdef INET 3131 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3132 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3133 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3134 m_head->m_pkthdr.tso_segsz); 3135 } 3136 #endif 3137 #if defined(INET6) && defined(INET) 3138 else 3139 #endif 3140 #ifdef INET6 3141 { 3142 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3143 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3144 m_head->m_pkthdr.tso_segsz); 3145 } 3146 #endif 3147 #endif /* INET6 || INET */ 3148 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3149 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3150 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3151 if (m_head->m_pkthdr.csum_flags & 3152 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3153 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3154 } else { 3155 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3156 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3157 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3158 } 3159 3160 if (m_head->m_pkthdr.csum_flags & 3161 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3162 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3163 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3164 } else if (m_head->m_pkthdr.csum_flags & 3165 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3166 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3167 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3168 } 3169 } 3170 3171 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3172 /* Fixup RNDIS packet message total length */ 3173 pkt->rm_len += pkt_hlen; 3174 /* Convert RNDIS packet message offsets */ 3175 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3176 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3177 3178 /* 3179 * Fast path: Chimney sending. 
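 * The RNDIS message and the packet payload are copied directly into
 * the reserved chimney (pre-allocated send buffer) slot, so no GPA
 * page list is needed; hn_gpa_cnt is set to 0 below.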
3180 */ 3181 if (chim != NULL) { 3182 struct hn_txdesc *tgt_txd = txd; 3183 3184 if (txr->hn_agg_txd != NULL) { 3185 tgt_txd = txr->hn_agg_txd; 3186 #ifdef INVARIANTS 3187 *m_head0 = NULL; 3188 #endif 3189 } 3190 3191 KASSERT(pkt == chim, 3192 ("RNDIS pkt not in chimney sending buffer")); 3193 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3194 ("chimney sending buffer is not used")); 3195 tgt_txd->chim_size += pkt->rm_len; 3196 3197 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3198 ((uint8_t *)chim) + pkt_hlen); 3199 3200 txr->hn_gpa_cnt = 0; 3201 txr->hn_sendpkt = hn_txpkt_chim; 3202 goto done; 3203 } 3204 3205 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3206 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3207 ("chimney buffer is used")); 3208 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3209 3210 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3211 if (__predict_false(error)) { 3212 int freed; 3213 3214 /* 3215 * This mbuf is not linked w/ the txd yet, so free it now. 3216 */ 3217 m_freem(m_head); 3218 *m_head0 = NULL; 3219 3220 freed = hn_txdesc_put(txr, txd); 3221 KASSERT(freed != 0, 3222 ("fail to free txd upon txdma error")); 3223 3224 txr->hn_txdma_failed++; 3225 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3226 return error; 3227 } 3228 *m_head0 = m_head; 3229 3230 /* +1 RNDIS packet message */ 3231 txr->hn_gpa_cnt = nsegs + 1; 3232 3233 /* send packet with page buffer */ 3234 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3235 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3236 txr->hn_gpa[0].gpa_len = pkt_hlen; 3237 3238 /* 3239 * Fill the page buffers with mbuf info after the page 3240 * buffer for RNDIS packet message. 3241 */ 3242 for (i = 0; i < nsegs; ++i) { 3243 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3244 3245 gpa->gpa_page = atop(segs[i].ds_addr); 3246 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3247 gpa->gpa_len = segs[i].ds_len; 3248 } 3249 3250 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3251 txd->chim_size = 0; 3252 txr->hn_sendpkt = hn_txpkt_sglist; 3253 done: 3254 txd->m = m_head; 3255 3256 /* Set the completion routine */ 3257 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3258 3259 /* Update temporary stats for later use. */ 3260 txr->hn_stat_pkts++; 3261 txr->hn_stat_size += m_head->m_pkthdr.len; 3262 if (m_head->m_flags & M_MCAST) 3263 txr->hn_stat_mcasts++; 3264 3265 return 0; 3266 } 3267 3268 /* 3269 * NOTE: 3270 * If this function fails, then txd will be freed, but the mbuf 3271 * associated w/ the txd will _not_ be freed. 3272 */ 3273 static int 3274 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3275 { 3276 int error, send_failed = 0, has_bpf; 3277 3278 again: 3279 has_bpf = bpf_peers_present(ifp->if_bpf); 3280 if (has_bpf) { 3281 /* 3282 * Make sure that this txd and any aggregated txds are not 3283 * freed before ETHER_BPF_MTAP. 
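 * hn_txdesc_hold() takes an extra reference here; it is released by
 * the hn_txdesc_put() after the BPF taps.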
3284 */ 3285 hn_txdesc_hold(txd); 3286 } 3287 error = txr->hn_sendpkt(txr, txd); 3288 if (!error) { 3289 if (has_bpf) { 3290 const struct hn_txdesc *tmp_txd; 3291 3292 ETHER_BPF_MTAP(ifp, txd->m); 3293 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3294 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3295 } 3296 3297 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3298 #ifdef HN_IFSTART_SUPPORT 3299 if (!hn_use_if_start) 3300 #endif 3301 { 3302 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3303 txr->hn_stat_size); 3304 if (txr->hn_stat_mcasts != 0) { 3305 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3306 txr->hn_stat_mcasts); 3307 } 3308 } 3309 txr->hn_pkts += txr->hn_stat_pkts; 3310 txr->hn_sends++; 3311 } 3312 if (has_bpf) 3313 hn_txdesc_put(txr, txd); 3314 3315 if (__predict_false(error)) { 3316 int freed; 3317 3318 /* 3319 * This should "really rarely" happen. 3320 * 3321 * XXX Too many RX to be acked or too many sideband 3322 * commands to run? Ask netvsc_channel_rollup() 3323 * to kick start later. 3324 */ 3325 txr->hn_has_txeof = 1; 3326 if (!send_failed) { 3327 txr->hn_send_failed++; 3328 send_failed = 1; 3329 /* 3330 * Try sending again after set hn_has_txeof; 3331 * in case that we missed the last 3332 * netvsc_channel_rollup(). 3333 */ 3334 goto again; 3335 } 3336 if_printf(ifp, "send failed\n"); 3337 3338 /* 3339 * Caller will perform further processing on the 3340 * associated mbuf, so don't free it in hn_txdesc_put(); 3341 * only unload it from the DMA map in hn_txdesc_put(), 3342 * if it was loaded. 3343 */ 3344 txd->m = NULL; 3345 freed = hn_txdesc_put(txr, txd); 3346 KASSERT(freed != 0, 3347 ("fail to free txd upon send error")); 3348 3349 txr->hn_send_failed++; 3350 } 3351 3352 /* Reset temporary stats, after this sending is done. */ 3353 txr->hn_stat_size = 0; 3354 txr->hn_stat_pkts = 0; 3355 txr->hn_stat_mcasts = 0; 3356 3357 return (error); 3358 } 3359 3360 /* 3361 * Append the specified data to the indicated mbuf chain, 3362 * Extend the mbuf chain if the new data does not fit in 3363 * existing space. 3364 * 3365 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3366 * There should be an equivalent in the kernel mbuf code, 3367 * but there does not appear to be one yet. 3368 * 3369 * Differs from m_append() in that additional mbufs are 3370 * allocated with cluster size MJUMPAGESIZE, and filled 3371 * accordingly. 3372 * 3373 * Return the last mbuf in the chain or NULL if failed to 3374 * allocate new mbuf. 3375 */ 3376 static struct mbuf * 3377 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3378 { 3379 struct mbuf *m, *n; 3380 int remainder, space; 3381 3382 for (m = m0; m->m_next != NULL; m = m->m_next) 3383 ; 3384 remainder = len; 3385 space = M_TRAILINGSPACE(m); 3386 if (space > 0) { 3387 /* 3388 * Copy into available space. 3389 */ 3390 if (space > remainder) 3391 space = remainder; 3392 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3393 m->m_len += space; 3394 cp += space; 3395 remainder -= space; 3396 } 3397 while (remainder > 0) { 3398 /* 3399 * Allocate a new mbuf; could check space 3400 * and allocate a cluster instead. 
3401 */ 3402 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3403 if (n == NULL) 3404 return NULL; 3405 n->m_len = min(MJUMPAGESIZE, remainder); 3406 bcopy(cp, mtod(n, caddr_t), n->m_len); 3407 cp += n->m_len; 3408 remainder -= n->m_len; 3409 m->m_next = n; 3410 m = n; 3411 } 3412 3413 return m; 3414 } 3415 3416 #if defined(INET) || defined(INET6) 3417 static __inline int 3418 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3419 { 3420 #if __FreeBSD_version >= 1100095 3421 if (hn_lro_mbufq_depth) { 3422 tcp_lro_queue_mbuf(lc, m); 3423 return 0; 3424 } 3425 #endif 3426 return tcp_lro_rx(lc, m, 0); 3427 } 3428 #endif 3429 3430 static int 3431 hn_rxpkt(struct hn_rx_ring *rxr) 3432 { 3433 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3434 struct mbuf *m_new, *n; 3435 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3436 int hash_type = M_HASHTYPE_NONE; 3437 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3438 int i; 3439 3440 ifp = hn_ifp; 3441 if (rxr->hn_rxvf_ifp != NULL) { 3442 /* 3443 * Non-transparent mode VF; pretend this packet is from 3444 * the VF. 3445 */ 3446 ifp = rxr->hn_rxvf_ifp; 3447 is_vf = 1; 3448 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3449 /* Transparent mode VF. */ 3450 is_vf = 1; 3451 } 3452 3453 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3454 /* 3455 * NOTE: 3456 * See the NOTE of hn_rndis_init_fixat(). This 3457 * function can be reached, immediately after the 3458 * RNDIS is initialized but before the ifnet is 3459 * setup on the hn_attach() path; drop the unexpected 3460 * packets. 3461 */ 3462 return (0); 3463 } 3464 3465 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3466 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3467 return (0); 3468 } 3469 3470 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3471 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3472 if (m_new == NULL) { 3473 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3474 return (0); 3475 } 3476 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3477 rxr->rsc.frag_len[0]); 3478 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3479 } else { 3480 /* 3481 * Get an mbuf with a cluster. For packets 2K or less, 3482 * get a standard 2K cluster. For anything larger, get a 3483 * 4K cluster. Any buffers larger than 4K can cause problems 3484 * if looped around to the Hyper-V TX channel, so avoid them. 
3485 */ 3486 size = MCLBYTES; 3487 if (rxr->rsc.pktlen > MCLBYTES) { 3488 /* 4096 */ 3489 size = MJUMPAGESIZE; 3490 } 3491 3492 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3493 if (m_new == NULL) { 3494 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3495 return (0); 3496 } 3497 3498 n = m_new; 3499 for (i = 0; i < rxr->rsc.cnt; i++) { 3500 n = hv_m_append(n, rxr->rsc.frag_len[i], 3501 rxr->rsc.frag_data[i]); 3502 if (n == NULL) { 3503 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3504 return (0); 3505 } else { 3506 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3507 } 3508 } 3509 } 3510 if (rxr->rsc.pktlen <= MHLEN) 3511 rxr->hn_small_pkts++; 3512 3513 m_new->m_pkthdr.rcvif = ifp; 3514 3515 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3516 do_csum = 0; 3517 3518 /* receive side checksum offload */ 3519 if (rxr->rsc.csum_info != NULL) { 3520 /* IP csum offload */ 3521 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3522 m_new->m_pkthdr.csum_flags |= 3523 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3524 rxr->hn_csum_ip++; 3525 } 3526 3527 /* TCP/UDP csum offload */ 3528 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3529 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3530 m_new->m_pkthdr.csum_flags |= 3531 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3532 m_new->m_pkthdr.csum_data = 0xffff; 3533 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3534 rxr->hn_csum_tcp++; 3535 else 3536 rxr->hn_csum_udp++; 3537 } 3538 3539 /* 3540 * XXX 3541 * As of this write (Oct 28th, 2016), host side will turn 3542 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3543 * the do_lro setting here is actually _not_ accurate. We 3544 * depend on the RSS hash type check to reset do_lro. 3545 */ 3546 if ((*(rxr->rsc.csum_info) & 3547 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3548 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3549 do_lro = 1; 3550 } else { 3551 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3552 if (l3proto == ETHERTYPE_IP) { 3553 if (l4proto == IPPROTO_TCP) { 3554 if (do_csum && 3555 (rxr->hn_trust_hcsum & 3556 HN_TRUST_HCSUM_TCP)) { 3557 rxr->hn_csum_trusted++; 3558 m_new->m_pkthdr.csum_flags |= 3559 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3560 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3561 m_new->m_pkthdr.csum_data = 0xffff; 3562 } 3563 do_lro = 1; 3564 } else if (l4proto == IPPROTO_UDP) { 3565 if (do_csum && 3566 (rxr->hn_trust_hcsum & 3567 HN_TRUST_HCSUM_UDP)) { 3568 rxr->hn_csum_trusted++; 3569 m_new->m_pkthdr.csum_flags |= 3570 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3571 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3572 m_new->m_pkthdr.csum_data = 0xffff; 3573 } 3574 } else if (l4proto != IPPROTO_DONE && do_csum && 3575 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3576 rxr->hn_csum_trusted++; 3577 m_new->m_pkthdr.csum_flags |= 3578 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3579 } 3580 } 3581 } 3582 3583 if (rxr->rsc.vlan_info != NULL) { 3584 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3585 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3586 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3587 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3588 m_new->m_flags |= M_VLANTAG; 3589 } 3590 3591 /* 3592 * If VF is activated (tranparent/non-transparent mode does not 3593 * matter here). 3594 * 3595 * - Disable LRO 3596 * 3597 * hn(4) will only receive broadcast packets, multicast packets, 3598 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3599 * packet types. 
3600 * 3601 * For non-transparent, we definitely _cannot_ enable LRO at 3602 * all, since the LRO flush will use hn(4) as the receiving 3603 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3604 */ 3605 if (is_vf) 3606 do_lro = 0; 3607 3608 /* 3609 * If VF is activated (tranparent/non-transparent mode does not 3610 * matter here), do _not_ mess with unsupported hash types or 3611 * functions. 3612 */ 3613 if (rxr->rsc.hash_info != NULL) { 3614 rxr->hn_rss_pkts++; 3615 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3616 if (!is_vf) 3617 hash_type = M_HASHTYPE_OPAQUE_HASH; 3618 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3619 NDIS_HASH_FUNCTION_TOEPLITZ) { 3620 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3621 rxr->hn_mbuf_hash); 3622 3623 /* 3624 * NOTE: 3625 * do_lro is resetted, if the hash types are not TCP 3626 * related. See the comment in the above csum_flags 3627 * setup section. 3628 */ 3629 switch (type) { 3630 case NDIS_HASH_IPV4: 3631 hash_type = M_HASHTYPE_RSS_IPV4; 3632 do_lro = 0; 3633 break; 3634 3635 case NDIS_HASH_TCP_IPV4: 3636 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3637 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3638 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3639 3640 if (is_vf) 3641 def_htype = M_HASHTYPE_NONE; 3642 3643 /* 3644 * UDP 4-tuple hash is delivered as 3645 * TCP 4-tuple hash. 3646 */ 3647 if (l3proto == ETHERTYPE_MAX) { 3648 hn_rxpkt_proto(m_new, 3649 &l3proto, &l4proto); 3650 } 3651 if (l3proto == ETHERTYPE_IP) { 3652 if (l4proto == IPPROTO_UDP && 3653 (rxr->hn_mbuf_hash & 3654 NDIS_HASH_UDP_IPV4_X)) { 3655 hash_type = 3656 M_HASHTYPE_RSS_UDP_IPV4; 3657 do_lro = 0; 3658 } else if (l4proto != 3659 IPPROTO_TCP) { 3660 hash_type = def_htype; 3661 do_lro = 0; 3662 } 3663 } else { 3664 hash_type = def_htype; 3665 do_lro = 0; 3666 } 3667 } 3668 break; 3669 3670 case NDIS_HASH_IPV6: 3671 hash_type = M_HASHTYPE_RSS_IPV6; 3672 do_lro = 0; 3673 break; 3674 3675 case NDIS_HASH_IPV6_EX: 3676 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3677 do_lro = 0; 3678 break; 3679 3680 case NDIS_HASH_TCP_IPV6: 3681 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3682 break; 3683 3684 case NDIS_HASH_TCP_IPV6_EX: 3685 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3686 break; 3687 } 3688 } 3689 } else if (!is_vf) { 3690 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3691 hash_type = M_HASHTYPE_OPAQUE; 3692 } 3693 M_HASHTYPE_SET(m_new, hash_type); 3694 3695 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3696 if (hn_ifp != ifp) { 3697 const struct ether_header *eh; 3698 3699 /* 3700 * Non-transparent mode VF is activated. 3701 */ 3702 3703 /* 3704 * Allow tapping on hn(4). 3705 */ 3706 ETHER_BPF_MTAP(hn_ifp, m_new); 3707 3708 /* 3709 * Update hn(4)'s stats. 3710 */ 3711 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3712 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3713 /* Checked at the beginning of this function. */ 3714 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3715 eh = mtod(m_new, struct ether_header *); 3716 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3717 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3718 } 3719 rxr->hn_pkts++; 3720 3721 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3722 #if defined(INET) || defined(INET6) 3723 struct lro_ctrl *lro = &rxr->hn_lro; 3724 3725 if (lro->lro_cnt) { 3726 rxr->hn_lro_tried++; 3727 if (hn_lro_rx(lro, m_new) == 0) { 3728 /* DONE! 
*/ 3729 return 0; 3730 } 3731 } 3732 #endif 3733 } 3734 ifp->if_input(ifp, m_new); 3735 3736 return (0); 3737 } 3738 3739 static int 3740 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3741 { 3742 struct hn_softc *sc = ifp->if_softc; 3743 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3744 struct ifnet *vf_ifp; 3745 int mask, error = 0; 3746 struct ifrsskey *ifrk; 3747 struct ifrsshash *ifrh; 3748 uint32_t mtu; 3749 3750 switch (cmd) { 3751 case SIOCSIFMTU: 3752 if (ifr->ifr_mtu > HN_MTU_MAX) { 3753 error = EINVAL; 3754 break; 3755 } 3756 3757 HN_LOCK(sc); 3758 3759 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3760 HN_UNLOCK(sc); 3761 break; 3762 } 3763 3764 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3765 /* Can't change MTU */ 3766 HN_UNLOCK(sc); 3767 error = EOPNOTSUPP; 3768 break; 3769 } 3770 3771 if (ifp->if_mtu == ifr->ifr_mtu) { 3772 HN_UNLOCK(sc); 3773 break; 3774 } 3775 3776 if (hn_xpnt_vf_isready(sc)) { 3777 vf_ifp = sc->hn_vf_ifp; 3778 ifr_vf = *ifr; 3779 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3780 sizeof(ifr_vf.ifr_name)); 3781 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3782 (caddr_t)&ifr_vf); 3783 if (error) { 3784 HN_UNLOCK(sc); 3785 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3786 vf_ifp->if_xname, ifr->ifr_mtu, error); 3787 break; 3788 } 3789 } 3790 3791 /* 3792 * Suspend this interface before the synthetic parts 3793 * are ripped. 3794 */ 3795 hn_suspend(sc); 3796 3797 /* 3798 * Detach the synthetics parts, i.e. NVS and RNDIS. 3799 */ 3800 hn_synth_detach(sc); 3801 3802 /* 3803 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3804 * with the new MTU setting. 3805 */ 3806 error = hn_synth_attach(sc, ifr->ifr_mtu); 3807 if (error) { 3808 HN_UNLOCK(sc); 3809 break; 3810 } 3811 3812 error = hn_rndis_get_mtu(sc, &mtu); 3813 if (error) 3814 mtu = ifr->ifr_mtu; 3815 else if (bootverbose) 3816 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3817 3818 /* 3819 * Commit the requested MTU, after the synthetic parts 3820 * have been successfully attached. 3821 */ 3822 if (mtu >= ifr->ifr_mtu) { 3823 mtu = ifr->ifr_mtu; 3824 } else { 3825 if_printf(ifp, "fixup mtu %d -> %u\n", 3826 ifr->ifr_mtu, mtu); 3827 } 3828 ifp->if_mtu = mtu; 3829 3830 /* 3831 * Synthetic parts' reattach may change the chimney 3832 * sending size; update it. 3833 */ 3834 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3835 hn_set_chim_size(sc, sc->hn_chim_szmax); 3836 3837 /* 3838 * Make sure that various parameters based on MTU are 3839 * still valid, after the MTU change. 3840 */ 3841 hn_mtu_change_fixup(sc); 3842 3843 /* 3844 * All done! Resume the interface now. 3845 */ 3846 hn_resume(sc); 3847 3848 if ((sc->hn_flags & HN_FLAG_RXVF) || 3849 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3850 /* 3851 * Since we have reattached the NVS part, 3852 * change the datapath to VF again; in case 3853 * that it is lost, after the NVS was detached. 3854 */ 3855 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3856 } 3857 3858 HN_UNLOCK(sc); 3859 break; 3860 3861 case SIOCSIFFLAGS: 3862 HN_LOCK(sc); 3863 3864 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3865 HN_UNLOCK(sc); 3866 break; 3867 } 3868 3869 if (hn_xpnt_vf_isready(sc)) 3870 hn_xpnt_vf_saveifflags(sc); 3871 3872 if (ifp->if_flags & IFF_UP) { 3873 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3874 /* 3875 * Caller meight hold mutex, e.g. 3876 * bpf; use busy-wait for the RNDIS 3877 * reply. 
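 * HN_NO_SLEEPING() switches the RNDIS request path to busy-wait and
 * HN_SLEEPING_OK() restores normal sleeping once the RX filter has
 * been updated.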
3878 */ 3879 HN_NO_SLEEPING(sc); 3880 hn_rxfilter_config(sc); 3881 HN_SLEEPING_OK(sc); 3882 3883 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3884 error = hn_xpnt_vf_iocsetflags(sc); 3885 } else { 3886 hn_init_locked(sc); 3887 } 3888 } else { 3889 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3890 hn_stop(sc, false); 3891 } 3892 sc->hn_if_flags = ifp->if_flags; 3893 3894 HN_UNLOCK(sc); 3895 break; 3896 3897 case SIOCSIFCAP: 3898 HN_LOCK(sc); 3899 3900 if (hn_xpnt_vf_isready(sc)) { 3901 ifr_vf = *ifr; 3902 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3903 sizeof(ifr_vf.ifr_name)); 3904 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3905 HN_UNLOCK(sc); 3906 break; 3907 } 3908 3909 /* 3910 * Fix up requested capabilities w/ supported capabilities, 3911 * since the supported capabilities could have been changed. 3912 */ 3913 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3914 ifp->if_capenable; 3915 3916 if (mask & IFCAP_TXCSUM) { 3917 ifp->if_capenable ^= IFCAP_TXCSUM; 3918 if (ifp->if_capenable & IFCAP_TXCSUM) 3919 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3920 else 3921 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3922 } 3923 if (mask & IFCAP_TXCSUM_IPV6) { 3924 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3925 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3926 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3927 else 3928 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3929 } 3930 3931 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3932 if (mask & IFCAP_RXCSUM) 3933 ifp->if_capenable ^= IFCAP_RXCSUM; 3934 #ifdef foo 3935 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3936 if (mask & IFCAP_RXCSUM_IPV6) 3937 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3938 #endif 3939 3940 if (mask & IFCAP_LRO) 3941 ifp->if_capenable ^= IFCAP_LRO; 3942 3943 if (mask & IFCAP_TSO4) { 3944 ifp->if_capenable ^= IFCAP_TSO4; 3945 if (ifp->if_capenable & IFCAP_TSO4) 3946 ifp->if_hwassist |= CSUM_IP_TSO; 3947 else 3948 ifp->if_hwassist &= ~CSUM_IP_TSO; 3949 } 3950 if (mask & IFCAP_TSO6) { 3951 ifp->if_capenable ^= IFCAP_TSO6; 3952 if (ifp->if_capenable & IFCAP_TSO6) 3953 ifp->if_hwassist |= CSUM_IP6_TSO; 3954 else 3955 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3956 } 3957 3958 HN_UNLOCK(sc); 3959 break; 3960 3961 case SIOCADDMULTI: 3962 case SIOCDELMULTI: 3963 HN_LOCK(sc); 3964 3965 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3966 HN_UNLOCK(sc); 3967 break; 3968 } 3969 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3970 /* 3971 * Multicast uses mutex; use busy-wait for 3972 * the RNDIS reply. 3973 */ 3974 HN_NO_SLEEPING(sc); 3975 hn_rxfilter_config(sc); 3976 HN_SLEEPING_OK(sc); 3977 } 3978 3979 /* XXX vlan(4) style mcast addr maintenance */ 3980 if (hn_xpnt_vf_isready(sc)) { 3981 int old_if_flags; 3982 3983 old_if_flags = sc->hn_vf_ifp->if_flags; 3984 hn_xpnt_vf_saveifflags(sc); 3985 3986 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3987 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3988 IFF_ALLMULTI)) 3989 error = hn_xpnt_vf_iocsetflags(sc); 3990 } 3991 3992 HN_UNLOCK(sc); 3993 break; 3994 3995 case SIOCSIFMEDIA: 3996 case SIOCGIFMEDIA: 3997 HN_LOCK(sc); 3998 if (hn_xpnt_vf_isready(sc)) { 3999 /* 4000 * SIOCGIFMEDIA expects ifmediareq, so don't 4001 * create and pass ifr_vf to the VF here; just 4002 * replace the ifr_name. 4003 */ 4004 vf_ifp = sc->hn_vf_ifp; 4005 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4006 sizeof(ifr->ifr_name)); 4007 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4008 /* Restore the ifr_name. 
*/ 4009 strlcpy(ifr->ifr_name, ifp->if_xname, 4010 sizeof(ifr->ifr_name)); 4011 HN_UNLOCK(sc); 4012 break; 4013 } 4014 HN_UNLOCK(sc); 4015 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4016 break; 4017 4018 case SIOCGIFRSSHASH: 4019 ifrh = (struct ifrsshash *)data; 4020 HN_LOCK(sc); 4021 if (sc->hn_rx_ring_inuse == 1) { 4022 HN_UNLOCK(sc); 4023 ifrh->ifrh_func = RSS_FUNC_NONE; 4024 ifrh->ifrh_types = 0; 4025 break; 4026 } 4027 4028 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4029 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4030 else 4031 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4032 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4033 HN_UNLOCK(sc); 4034 break; 4035 4036 case SIOCGIFRSSKEY: 4037 ifrk = (struct ifrsskey *)data; 4038 HN_LOCK(sc); 4039 if (sc->hn_rx_ring_inuse == 1) { 4040 HN_UNLOCK(sc); 4041 ifrk->ifrk_func = RSS_FUNC_NONE; 4042 ifrk->ifrk_keylen = 0; 4043 break; 4044 } 4045 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4046 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4047 else 4048 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4049 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4050 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4051 NDIS_HASH_KEYSIZE_TOEPLITZ); 4052 HN_UNLOCK(sc); 4053 break; 4054 4055 default: 4056 error = ether_ioctl(ifp, cmd, data); 4057 break; 4058 } 4059 return (error); 4060 } 4061 4062 static void 4063 hn_stop(struct hn_softc *sc, bool detaching) 4064 { 4065 struct ifnet *ifp = sc->hn_ifp; 4066 int i; 4067 4068 HN_LOCK_ASSERT(sc); 4069 4070 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4071 ("synthetic parts were not attached")); 4072 4073 /* Clear RUNNING bit ASAP. */ 4074 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4075 4076 /* Disable polling. */ 4077 hn_polling(sc, 0); 4078 4079 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4080 KASSERT(sc->hn_vf_ifp != NULL, 4081 ("%s: VF is not attached", ifp->if_xname)); 4082 4083 /* Mark transparent mode VF as disabled. */ 4084 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4085 4086 /* 4087 * NOTE: 4088 * Datapath setting must happen _before_ bringing 4089 * the VF down. 4090 */ 4091 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4092 4093 /* 4094 * Bring the VF down. 4095 */ 4096 hn_xpnt_vf_saveifflags(sc); 4097 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4098 hn_xpnt_vf_iocsetflags(sc); 4099 } 4100 4101 /* Suspend data transfers. */ 4102 hn_suspend_data(sc); 4103 4104 /* Clear OACTIVE bit. */ 4105 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4106 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4107 sc->hn_tx_ring[i].hn_oactive = 0; 4108 4109 /* 4110 * If the non-transparent mode VF is active, make sure 4111 * that the RX filter still allows packet reception. 4112 */ 4113 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4114 hn_rxfilter_config(sc); 4115 } 4116 4117 static void 4118 hn_init_locked(struct hn_softc *sc) 4119 { 4120 struct ifnet *ifp = sc->hn_ifp; 4121 int i; 4122 4123 HN_LOCK_ASSERT(sc); 4124 4125 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4126 return; 4127 4128 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4129 return; 4130 4131 /* Configure RX filter */ 4132 hn_rxfilter_config(sc); 4133 4134 /* Clear OACTIVE bit. */ 4135 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4136 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4137 sc->hn_tx_ring[i].hn_oactive = 0; 4138 4139 /* Clear TX 'suspended' bit. */ 4140 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4141 4142 if (hn_xpnt_vf_isready(sc)) { 4143 /* Initialize transparent VF. 
*/ 4144 hn_xpnt_vf_init(sc); 4145 } 4146 4147 /* Everything is ready; unleash! */ 4148 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4149 4150 /* Re-enable polling if requested. */ 4151 if (sc->hn_pollhz > 0) 4152 hn_polling(sc, sc->hn_pollhz); 4153 } 4154 4155 static void 4156 hn_init(void *xsc) 4157 { 4158 struct hn_softc *sc = xsc; 4159 4160 HN_LOCK(sc); 4161 hn_init_locked(sc); 4162 HN_UNLOCK(sc); 4163 } 4164 4165 #if __FreeBSD_version >= 1100099 4166 4167 static int 4168 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4169 { 4170 struct hn_softc *sc = arg1; 4171 unsigned int lenlim; 4172 int error; 4173 4174 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4175 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4176 if (error || req->newptr == NULL) 4177 return error; 4178 4179 HN_LOCK(sc); 4180 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4181 lenlim > TCP_LRO_LENGTH_MAX) { 4182 HN_UNLOCK(sc); 4183 return EINVAL; 4184 } 4185 hn_set_lro_lenlim(sc, lenlim); 4186 HN_UNLOCK(sc); 4187 4188 return 0; 4189 } 4190 4191 static int 4192 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4193 { 4194 struct hn_softc *sc = arg1; 4195 int ackcnt, error, i; 4196 4197 /* 4198 * lro_ackcnt_lim is append count limit, 4199 * +1 to turn it into aggregation limit. 4200 */ 4201 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4202 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4203 if (error || req->newptr == NULL) 4204 return error; 4205 4206 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4207 return EINVAL; 4208 4209 /* 4210 * Convert aggregation limit back to append 4211 * count limit. 4212 */ 4213 --ackcnt; 4214 HN_LOCK(sc); 4215 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4216 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4217 HN_UNLOCK(sc); 4218 return 0; 4219 } 4220 4221 #endif 4222 4223 static int 4224 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4225 { 4226 struct hn_softc *sc = arg1; 4227 int hcsum = arg2; 4228 int on, error, i; 4229 4230 on = 0; 4231 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4232 on = 1; 4233 4234 error = sysctl_handle_int(oidp, &on, 0, req); 4235 if (error || req->newptr == NULL) 4236 return error; 4237 4238 HN_LOCK(sc); 4239 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4240 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4241 4242 if (on) 4243 rxr->hn_trust_hcsum |= hcsum; 4244 else 4245 rxr->hn_trust_hcsum &= ~hcsum; 4246 } 4247 HN_UNLOCK(sc); 4248 return 0; 4249 } 4250 4251 static int 4252 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4253 { 4254 struct hn_softc *sc = arg1; 4255 int chim_size, error; 4256 4257 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4258 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4259 if (error || req->newptr == NULL) 4260 return error; 4261 4262 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4263 return EINVAL; 4264 4265 HN_LOCK(sc); 4266 hn_set_chim_size(sc, chim_size); 4267 HN_UNLOCK(sc); 4268 return 0; 4269 } 4270 4271 #if __FreeBSD_version < 1100095 4272 static int 4273 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4274 { 4275 struct hn_softc *sc = arg1; 4276 int ofs = arg2, i, error; 4277 struct hn_rx_ring *rxr; 4278 uint64_t stat; 4279 4280 stat = 0; 4281 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4282 rxr = &sc->hn_rx_ring[i]; 4283 stat += *((int *)((uint8_t *)rxr + ofs)); 4284 } 4285 4286 error = sysctl_handle_64(oidp, &stat, 0, req); 4287 if (error || req->newptr == NULL) 4288 return error; 4289 4290 /* Zero out this stat. 
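Any write through this sysctl resets the accumulated value on every RX ring.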
*/ 4291 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4292 rxr = &sc->hn_rx_ring[i]; 4293 *((int *)((uint8_t *)rxr + ofs)) = 0; 4294 } 4295 return 0; 4296 } 4297 #else 4298 static int 4299 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4300 { 4301 struct hn_softc *sc = arg1; 4302 int ofs = arg2, i, error; 4303 struct hn_rx_ring *rxr; 4304 uint64_t stat; 4305 4306 stat = 0; 4307 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4308 rxr = &sc->hn_rx_ring[i]; 4309 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4310 } 4311 4312 error = sysctl_handle_64(oidp, &stat, 0, req); 4313 if (error || req->newptr == NULL) 4314 return error; 4315 4316 /* Zero out this stat. */ 4317 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4318 rxr = &sc->hn_rx_ring[i]; 4319 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4320 } 4321 return 0; 4322 } 4323 4324 #endif 4325 4326 static int 4327 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4328 { 4329 struct hn_softc *sc = arg1; 4330 int ofs = arg2, i, error; 4331 struct hn_rx_ring *rxr; 4332 u_long stat; 4333 4334 stat = 0; 4335 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4336 rxr = &sc->hn_rx_ring[i]; 4337 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4338 } 4339 4340 error = sysctl_handle_long(oidp, &stat, 0, req); 4341 if (error || req->newptr == NULL) 4342 return error; 4343 4344 /* Zero out this stat. */ 4345 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4346 rxr = &sc->hn_rx_ring[i]; 4347 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4348 } 4349 return 0; 4350 } 4351 4352 static int 4353 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4354 { 4355 struct hn_softc *sc = arg1; 4356 int ofs = arg2, i, error; 4357 struct hn_tx_ring *txr; 4358 u_long stat; 4359 4360 stat = 0; 4361 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4362 txr = &sc->hn_tx_ring[i]; 4363 stat += *((u_long *)((uint8_t *)txr + ofs)); 4364 } 4365 4366 error = sysctl_handle_long(oidp, &stat, 0, req); 4367 if (error || req->newptr == NULL) 4368 return error; 4369 4370 /* Zero out this stat. 
*/ 4371 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4372 txr = &sc->hn_tx_ring[i]; 4373 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4374 } 4375 return 0; 4376 } 4377 4378 static int 4379 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4380 { 4381 struct hn_softc *sc = arg1; 4382 int ofs = arg2, i, error, conf; 4383 struct hn_tx_ring *txr; 4384 4385 txr = &sc->hn_tx_ring[0]; 4386 conf = *((int *)((uint8_t *)txr + ofs)); 4387 4388 error = sysctl_handle_int(oidp, &conf, 0, req); 4389 if (error || req->newptr == NULL) 4390 return error; 4391 4392 HN_LOCK(sc); 4393 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4394 txr = &sc->hn_tx_ring[i]; 4395 *((int *)((uint8_t *)txr + ofs)) = conf; 4396 } 4397 HN_UNLOCK(sc); 4398 4399 return 0; 4400 } 4401 4402 static int 4403 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4404 { 4405 struct hn_softc *sc = arg1; 4406 int error, size; 4407 4408 size = sc->hn_agg_size; 4409 error = sysctl_handle_int(oidp, &size, 0, req); 4410 if (error || req->newptr == NULL) 4411 return (error); 4412 4413 HN_LOCK(sc); 4414 sc->hn_agg_size = size; 4415 hn_set_txagg(sc); 4416 HN_UNLOCK(sc); 4417 4418 return (0); 4419 } 4420 4421 static int 4422 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4423 { 4424 struct hn_softc *sc = arg1; 4425 int error, pkts; 4426 4427 pkts = sc->hn_agg_pkts; 4428 error = sysctl_handle_int(oidp, &pkts, 0, req); 4429 if (error || req->newptr == NULL) 4430 return (error); 4431 4432 HN_LOCK(sc); 4433 sc->hn_agg_pkts = pkts; 4434 hn_set_txagg(sc); 4435 HN_UNLOCK(sc); 4436 4437 return (0); 4438 } 4439 4440 static int 4441 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4442 { 4443 struct hn_softc *sc = arg1; 4444 int pkts; 4445 4446 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4447 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4448 } 4449 4450 static int 4451 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4452 { 4453 struct hn_softc *sc = arg1; 4454 int align; 4455 4456 align = sc->hn_tx_ring[0].hn_agg_align; 4457 return (sysctl_handle_int(oidp, &align, 0, req)); 4458 } 4459 4460 static void 4461 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4462 { 4463 if (pollhz == 0) 4464 vmbus_chan_poll_disable(chan); 4465 else 4466 vmbus_chan_poll_enable(chan, pollhz); 4467 } 4468 4469 static void 4470 hn_polling(struct hn_softc *sc, u_int pollhz) 4471 { 4472 int nsubch = sc->hn_rx_ring_inuse - 1; 4473 4474 HN_LOCK_ASSERT(sc); 4475 4476 if (nsubch > 0) { 4477 struct vmbus_channel **subch; 4478 int i; 4479 4480 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4481 for (i = 0; i < nsubch; ++i) 4482 hn_chan_polling(subch[i], pollhz); 4483 vmbus_subchan_rel(subch, nsubch); 4484 } 4485 hn_chan_polling(sc->hn_prichan, pollhz); 4486 } 4487 4488 static int 4489 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4490 { 4491 struct hn_softc *sc = arg1; 4492 int pollhz, error; 4493 4494 pollhz = sc->hn_pollhz; 4495 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4496 if (error || req->newptr == NULL) 4497 return (error); 4498 4499 if (pollhz != 0 && 4500 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4501 return (EINVAL); 4502 4503 HN_LOCK(sc); 4504 if (sc->hn_pollhz != pollhz) { 4505 sc->hn_pollhz = pollhz; 4506 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4507 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4508 hn_polling(sc, sc->hn_pollhz); 4509 } 4510 HN_UNLOCK(sc); 4511 4512 return (0); 4513 } 4514 4515 static int 4516 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4517 { 4518 struct hn_softc *sc = arg1; 4519 char verstr[16]; 4520 4521 snprintf(verstr, sizeof(verstr), "%u.%u", 4522 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4523 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4524 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4525 } 4526 4527 static int 4528 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4529 { 4530 struct hn_softc *sc = arg1; 4531 char caps_str[128]; 4532 uint32_t caps; 4533 4534 HN_LOCK(sc); 4535 caps = sc->hn_caps; 4536 HN_UNLOCK(sc); 4537 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4538 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4539 } 4540 4541 static int 4542 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4543 { 4544 struct hn_softc *sc = arg1; 4545 char assist_str[128]; 4546 uint32_t hwassist; 4547 4548 HN_LOCK(sc); 4549 hwassist = sc->hn_ifp->if_hwassist; 4550 HN_UNLOCK(sc); 4551 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4552 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4553 } 4554 4555 static int 4556 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4557 { 4558 struct hn_softc *sc = arg1; 4559 char filter_str[128]; 4560 uint32_t filter; 4561 4562 HN_LOCK(sc); 4563 filter = sc->hn_rx_filter; 4564 HN_UNLOCK(sc); 4565 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4566 NDIS_PACKET_TYPES); 4567 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4568 } 4569 4570 #ifndef RSS 4571 4572 static int 4573 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4574 { 4575 struct hn_softc *sc = arg1; 4576 int error; 4577 4578 HN_LOCK(sc); 4579 4580 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4581 if (error || req->newptr == NULL) 4582 goto back; 4583 4584 if ((sc->hn_flags & HN_FLAG_RXVF) || 4585 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4586 /* 4587 * RSS key is synchronized w/ VF's, don't allow users 4588 * to change it. 4589 */ 4590 error = EBUSY; 4591 goto back; 4592 } 4593 4594 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4595 if (error) 4596 goto back; 4597 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4598 4599 if (sc->hn_rx_ring_inuse > 1) { 4600 error = hn_rss_reconfig(sc); 4601 } else { 4602 /* Not RSS capable, at least for now; just save the RSS key. */ 4603 error = 0; 4604 } 4605 back: 4606 HN_UNLOCK(sc); 4607 return (error); 4608 } 4609 4610 static int 4611 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4612 { 4613 struct hn_softc *sc = arg1; 4614 int error; 4615 4616 HN_LOCK(sc); 4617 4618 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4619 if (error || req->newptr == NULL) 4620 goto back; 4621 4622 /* 4623 * Don't allow RSS indirect table change, if this interface is not 4624 * RSS capable currently. 
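* With a single RX ring in use there is nothing for the indirect table to redirect to.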
4625 */ 4626 if (sc->hn_rx_ring_inuse == 1) { 4627 error = EOPNOTSUPP; 4628 goto back; 4629 } 4630 4631 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4632 if (error) 4633 goto back; 4634 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4635 4636 hn_rss_ind_fixup(sc); 4637 error = hn_rss_reconfig(sc); 4638 back: 4639 HN_UNLOCK(sc); 4640 return (error); 4641 } 4642 4643 #endif /* !RSS */ 4644 4645 static int 4646 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4647 { 4648 struct hn_softc *sc = arg1; 4649 char hash_str[128]; 4650 uint32_t hash; 4651 4652 HN_LOCK(sc); 4653 hash = sc->hn_rss_hash; 4654 HN_UNLOCK(sc); 4655 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4656 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4657 } 4658 4659 static int 4660 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4661 { 4662 struct hn_softc *sc = arg1; 4663 char hash_str[128]; 4664 uint32_t hash; 4665 4666 HN_LOCK(sc); 4667 hash = sc->hn_rss_hcap; 4668 HN_UNLOCK(sc); 4669 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4670 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4671 } 4672 4673 static int 4674 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4675 { 4676 struct hn_softc *sc = arg1; 4677 char hash_str[128]; 4678 uint32_t hash; 4679 4680 HN_LOCK(sc); 4681 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4682 HN_UNLOCK(sc); 4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4685 } 4686 4687 static int 4688 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4689 { 4690 struct hn_softc *sc = arg1; 4691 char vf_name[IFNAMSIZ + 1]; 4692 struct ifnet *vf_ifp; 4693 4694 HN_LOCK(sc); 4695 vf_name[0] = '\0'; 4696 vf_ifp = sc->hn_vf_ifp; 4697 if (vf_ifp != NULL) 4698 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4699 HN_UNLOCK(sc); 4700 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4701 } 4702 4703 static int 4704 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4705 { 4706 struct hn_softc *sc = arg1; 4707 char vf_name[IFNAMSIZ + 1]; 4708 struct ifnet *vf_ifp; 4709 4710 HN_LOCK(sc); 4711 vf_name[0] = '\0'; 4712 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4713 if (vf_ifp != NULL) 4714 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4715 HN_UNLOCK(sc); 4716 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4717 } 4718 4719 static int 4720 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4721 { 4722 struct rm_priotracker pt; 4723 struct sbuf *sb; 4724 int error, i; 4725 bool first; 4726 4727 error = sysctl_wire_old_buffer(req, 0); 4728 if (error != 0) 4729 return (error); 4730 4731 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4732 if (sb == NULL) 4733 return (ENOMEM); 4734 4735 rm_rlock(&hn_vfmap_lock, &pt); 4736 4737 first = true; 4738 for (i = 0; i < hn_vfmap_size; ++i) { 4739 struct ifnet *ifp; 4740 4741 if (hn_vfmap[i] == NULL) 4742 continue; 4743 4744 ifp = ifnet_byindex(i); 4745 if (ifp != NULL) { 4746 if (first) 4747 sbuf_printf(sb, "%s", ifp->if_xname); 4748 else 4749 sbuf_printf(sb, " %s", ifp->if_xname); 4750 first = false; 4751 } 4752 } 4753 4754 rm_runlock(&hn_vfmap_lock, &pt); 4755 4756 error = sbuf_finish(sb); 4757 sbuf_delete(sb); 4758 return (error); 4759 } 4760 4761 static int 4762 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4763 { 4764 struct rm_priotracker pt; 4765 struct sbuf *sb; 4766 int error, i; 4767 bool first; 4768 4769 error = sysctl_wire_old_buffer(req, 0); 4770 if (error != 0) 4771 return (error); 4772 4773 sb = 
sbuf_new_for_sysctl(NULL, NULL, 128, req); 4774 if (sb == NULL) 4775 return (ENOMEM); 4776 4777 rm_rlock(&hn_vfmap_lock, &pt); 4778 4779 first = true; 4780 for (i = 0; i < hn_vfmap_size; ++i) { 4781 struct ifnet *ifp, *hn_ifp; 4782 4783 hn_ifp = hn_vfmap[i]; 4784 if (hn_ifp == NULL) 4785 continue; 4786 4787 ifp = ifnet_byindex(i); 4788 if (ifp != NULL) { 4789 if (first) { 4790 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4791 hn_ifp->if_xname); 4792 } else { 4793 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4794 hn_ifp->if_xname); 4795 } 4796 first = false; 4797 } 4798 } 4799 4800 rm_runlock(&hn_vfmap_lock, &pt); 4801 4802 error = sbuf_finish(sb); 4803 sbuf_delete(sb); 4804 return (error); 4805 } 4806 4807 static int 4808 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4809 { 4810 struct hn_softc *sc = arg1; 4811 int error, onoff = 0; 4812 4813 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4814 onoff = 1; 4815 error = sysctl_handle_int(oidp, &onoff, 0, req); 4816 if (error || req->newptr == NULL) 4817 return (error); 4818 4819 HN_LOCK(sc); 4820 /* NOTE: hn_vf_lock for hn_transmit() */ 4821 rm_wlock(&sc->hn_vf_lock); 4822 if (onoff) 4823 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4824 else 4825 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4826 rm_wunlock(&sc->hn_vf_lock); 4827 HN_UNLOCK(sc); 4828 4829 return (0); 4830 } 4831 4832 static int 4833 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4834 { 4835 struct hn_softc *sc = arg1; 4836 int enabled = 0; 4837 4838 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4839 enabled = 1; 4840 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4841 } 4842 4843 static int 4844 hn_check_iplen(const struct mbuf *m, int hoff) 4845 { 4846 const struct ip *ip; 4847 int len, iphlen, iplen; 4848 const struct tcphdr *th; 4849 int thoff; /* TCP data offset */ 4850 4851 len = hoff + sizeof(struct ip); 4852 4853 /* The packet must be at least the size of an IP header. */ 4854 if (m->m_pkthdr.len < len) 4855 return IPPROTO_DONE; 4856 4857 /* The fixed IP header must reside completely in the first mbuf. */ 4858 if (m->m_len < len) 4859 return IPPROTO_DONE; 4860 4861 ip = mtodo(m, hoff); 4862 4863 /* Bound check the packet's stated IP header length. */ 4864 iphlen = ip->ip_hl << 2; 4865 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4866 return IPPROTO_DONE; 4867 4868 /* The full IP header must reside completely in the one mbuf. */ 4869 if (m->m_len < hoff + iphlen) 4870 return IPPROTO_DONE; 4871 4872 iplen = ntohs(ip->ip_len); 4873 4874 /* 4875 * Check that the amount of data in the buffers is at 4876 * least as much as the IP header would have us expect. 4877 */ 4878 if (m->m_pkthdr.len < hoff + iplen) 4879 return IPPROTO_DONE; 4880 4881 /* 4882 * Ignore IP fragments. 4883 */ 4884 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4885 return IPPROTO_DONE; 4886 4887 /* 4888 * The TCP/IP or UDP/IP header must be entirely contained within 4889 * the first fragment of a packet.
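* The checks below enforce this and return IPPROTO_DONE if it does not hold.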
4890 */ 4891 switch (ip->ip_p) { 4892 case IPPROTO_TCP: 4893 if (iplen < iphlen + sizeof(struct tcphdr)) 4894 return IPPROTO_DONE; 4895 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4896 return IPPROTO_DONE; 4897 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4898 thoff = th->th_off << 2; 4899 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4900 return IPPROTO_DONE; 4901 if (m->m_len < hoff + iphlen + thoff) 4902 return IPPROTO_DONE; 4903 break; 4904 case IPPROTO_UDP: 4905 if (iplen < iphlen + sizeof(struct udphdr)) 4906 return IPPROTO_DONE; 4907 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4908 return IPPROTO_DONE; 4909 break; 4910 default: 4911 if (iplen < iphlen) 4912 return IPPROTO_DONE; 4913 break; 4914 } 4915 return ip->ip_p; 4916 } 4917 4918 static void 4919 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4920 { 4921 const struct ether_header *eh; 4922 uint16_t etype; 4923 int hoff; 4924 4925 hoff = sizeof(*eh); 4926 /* Checked at the beginning of this function. */ 4927 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4928 4929 eh = mtod(m_new, const struct ether_header *); 4930 etype = ntohs(eh->ether_type); 4931 if (etype == ETHERTYPE_VLAN) { 4932 const struct ether_vlan_header *evl; 4933 4934 hoff = sizeof(*evl); 4935 if (m_new->m_len < hoff) 4936 return; 4937 evl = mtod(m_new, const struct ether_vlan_header *); 4938 etype = ntohs(evl->evl_proto); 4939 } 4940 *l3proto = etype; 4941 4942 if (etype == ETHERTYPE_IP) 4943 *l4proto = hn_check_iplen(m_new, hoff); 4944 else 4945 *l4proto = IPPROTO_DONE; 4946 } 4947 4948 static int 4949 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4950 { 4951 struct sysctl_oid_list *child; 4952 struct sysctl_ctx_list *ctx; 4953 device_t dev = sc->hn_dev; 4954 #if defined(INET) || defined(INET6) 4955 #if __FreeBSD_version >= 1100095 4956 int lroent_cnt; 4957 #endif 4958 #endif 4959 int i; 4960 4961 /* 4962 * Create RXBUF for reception. 4963 * 4964 * NOTE: 4965 * - It is shared by all channels. 4966 * - A large enough buffer is allocated, certain version of NVSes 4967 * may further limit the usable space. 
4968 */ 4969 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4970 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4971 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4972 if (sc->hn_rxbuf == NULL) { 4973 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4974 return (ENOMEM); 4975 } 4976 4977 sc->hn_rx_ring_cnt = ring_cnt; 4978 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4979 4980 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4981 M_DEVBUF, M_WAITOK | M_ZERO); 4982 4983 #if defined(INET) || defined(INET6) 4984 #if __FreeBSD_version >= 1100095 4985 lroent_cnt = hn_lro_entry_count; 4986 if (lroent_cnt < TCP_LRO_ENTRIES) 4987 lroent_cnt = TCP_LRO_ENTRIES; 4988 if (bootverbose) 4989 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4990 #endif 4991 #endif /* INET || INET6 */ 4992 4993 ctx = device_get_sysctl_ctx(dev); 4994 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4995 4996 /* Create dev.hn.UNIT.rx sysctl tree */ 4997 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4998 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4999 5000 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5001 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5002 5003 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5004 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5005 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5006 if (rxr->hn_br == NULL) { 5007 device_printf(dev, "allocate bufring failed\n"); 5008 return (ENOMEM); 5009 } 5010 5011 if (hn_trust_hosttcp) 5012 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5013 if (hn_trust_hostudp) 5014 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5015 if (hn_trust_hostip) 5016 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5017 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5018 rxr->hn_ifp = sc->hn_ifp; 5019 if (i < sc->hn_tx_ring_cnt) 5020 rxr->hn_txr = &sc->hn_tx_ring[i]; 5021 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5022 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5023 rxr->hn_rx_idx = i; 5024 rxr->hn_rxbuf = sc->hn_rxbuf; 5025 5026 /* 5027 * Initialize LRO. 
5028 */ 5029 #if defined(INET) || defined(INET6) 5030 #if __FreeBSD_version >= 1100095 5031 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5032 hn_lro_mbufq_depth); 5033 #else 5034 tcp_lro_init(&rxr->hn_lro); 5035 rxr->hn_lro.ifp = sc->hn_ifp; 5036 #endif 5037 #if __FreeBSD_version >= 1100099 5038 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5039 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5040 #endif 5041 #endif /* INET || INET6 */ 5042 5043 if (sc->hn_rx_sysctl_tree != NULL) { 5044 char name[16]; 5045 5046 /* 5047 * Create per RX ring sysctl tree: 5048 * dev.hn.UNIT.rx.RINGID 5049 */ 5050 snprintf(name, sizeof(name), "%d", i); 5051 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5052 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5053 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5054 5055 if (rxr->hn_rx_sysctl_tree != NULL) { 5056 SYSCTL_ADD_ULONG(ctx, 5057 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5058 OID_AUTO, "packets", CTLFLAG_RW, 5059 &rxr->hn_pkts, "# of packets received"); 5060 SYSCTL_ADD_ULONG(ctx, 5061 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5062 OID_AUTO, "rss_pkts", CTLFLAG_RW, 5063 &rxr->hn_rss_pkts, 5064 "# of packets w/ RSS info received"); 5065 SYSCTL_ADD_ULONG(ctx, 5066 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5067 OID_AUTO, "rsc_pkts", CTLFLAG_RW, 5068 &rxr->hn_rsc_pkts, 5069 "# of RSC packets received"); 5070 SYSCTL_ADD_ULONG(ctx, 5071 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5072 OID_AUTO, "rsc_drop", CTLFLAG_RW, 5073 &rxr->hn_rsc_drop, 5074 "# of RSC fragments dropped"); 5075 SYSCTL_ADD_INT(ctx, 5076 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5077 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5078 &rxr->hn_pktbuf_len, 0, 5079 "Temporary channel packet buffer length"); 5080 } 5081 } 5082 } 5083 5084 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5085 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5086 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5087 #if __FreeBSD_version < 1100095 5088 hn_rx_stat_int_sysctl, 5089 #else 5090 hn_rx_stat_u64_sysctl, 5091 #endif 5092 "LU", "LRO queued"); 5093 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5094 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5095 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5096 #if __FreeBSD_version < 1100095 5097 hn_rx_stat_int_sysctl, 5098 #else 5099 hn_rx_stat_u64_sysctl, 5100 #endif 5101 "LU", "LRO flushed"); 5102 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5103 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5104 __offsetof(struct hn_rx_ring, hn_lro_tried), 5105 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5106 #if __FreeBSD_version >= 1100099 5107 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5108 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5109 hn_lro_lenlim_sysctl, "IU", 5110 "Max # of data bytes to be aggregated by LRO"); 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5112 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5113 hn_lro_ackcnt_sysctl, "I", 5114 "Max # of ACKs to be aggregated by LRO"); 5115 #endif 5116 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5117 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5118 hn_trust_hcsum_sysctl, "I", 5119 "Trust tcp segement verification on host side, " 5120 "when csum info is missing"); 5121 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5122 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5123 hn_trust_hcsum_sysctl, "I", 5124 "Trust udp datagram verification on host side, " 5125 "when csum info is missing"); 5126 
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5127 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5128 hn_trust_hcsum_sysctl, "I", 5129 "Trust ip packet verification on host side, " 5130 "when csum info is missing"); 5131 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5132 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5133 __offsetof(struct hn_rx_ring, hn_csum_ip), 5134 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5135 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5136 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5137 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5138 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5139 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5140 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5141 __offsetof(struct hn_rx_ring, hn_csum_udp), 5142 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5143 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5144 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5145 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5146 hn_rx_stat_ulong_sysctl, "LU", 5147 "# of packets that we trust host's csum verification"); 5148 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5149 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5150 __offsetof(struct hn_rx_ring, hn_small_pkts), 5151 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5152 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5153 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5154 __offsetof(struct hn_rx_ring, hn_ack_failed), 5155 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5156 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5157 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5158 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5159 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5160 5161 return (0); 5162 } 5163 5164 static void 5165 hn_destroy_rx_data(struct hn_softc *sc) 5166 { 5167 int i; 5168 5169 if (sc->hn_rxbuf != NULL) { 5170 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5171 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5172 else 5173 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5174 sc->hn_rxbuf = NULL; 5175 } 5176 5177 if (sc->hn_rx_ring_cnt == 0) 5178 return; 5179 5180 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5181 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5182 5183 if (rxr->hn_br == NULL) 5184 continue; 5185 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5186 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5187 } else { 5188 device_printf(sc->hn_dev, 5189 "%dth channel bufring is referenced", i); 5190 } 5191 rxr->hn_br = NULL; 5192 5193 #if defined(INET) || defined(INET6) 5194 tcp_lro_free(&rxr->hn_lro); 5195 #endif 5196 free(rxr->hn_pktbuf, M_DEVBUF); 5197 } 5198 free(sc->hn_rx_ring, M_DEVBUF); 5199 sc->hn_rx_ring = NULL; 5200 5201 sc->hn_rx_ring_cnt = 0; 5202 sc->hn_rx_ring_inuse = 0; 5203 } 5204 5205 static int 5206 hn_tx_ring_create(struct hn_softc *sc, int id) 5207 { 5208 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5209 device_t dev = sc->hn_dev; 5210 bus_dma_tag_t parent_dtag; 5211 int error, i; 5212 5213 txr->hn_sc = sc; 5214 txr->hn_tx_idx = id; 5215 5216 #ifndef HN_USE_TXDESC_BUFRING 5217 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5218 #endif 5219 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5220 5221 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5222 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5223 M_DEVBUF, M_WAITOK | M_ZERO); 5224 #ifndef HN_USE_TXDESC_BUFRING 5225 
SLIST_INIT(&txr->hn_txlist); 5226 #else 5227 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5228 M_WAITOK, &txr->hn_tx_lock); 5229 #endif 5230 5231 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5232 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5233 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5234 } else { 5235 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5236 } 5237 5238 #ifdef HN_IFSTART_SUPPORT 5239 if (hn_use_if_start) { 5240 txr->hn_txeof = hn_start_txeof; 5241 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5242 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5243 } else 5244 #endif 5245 { 5246 int br_depth; 5247 5248 txr->hn_txeof = hn_xmit_txeof; 5249 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5250 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5251 5252 br_depth = hn_get_txswq_depth(txr); 5253 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5254 M_WAITOK, &txr->hn_tx_lock); 5255 } 5256 5257 txr->hn_direct_tx_size = hn_direct_tx_size; 5258 5259 /* 5260 * Always schedule transmission instead of trying to do direct 5261 * transmission. This one gives the best performance so far. 5262 */ 5263 txr->hn_sched_tx = 1; 5264 5265 parent_dtag = bus_get_dma_tag(dev); 5266 5267 /* DMA tag for RNDIS packet messages. */ 5268 error = bus_dma_tag_create(parent_dtag, /* parent */ 5269 HN_RNDIS_PKT_ALIGN, /* alignment */ 5270 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5271 BUS_SPACE_MAXADDR, /* lowaddr */ 5272 BUS_SPACE_MAXADDR, /* highaddr */ 5273 NULL, NULL, /* filter, filterarg */ 5274 HN_RNDIS_PKT_LEN, /* maxsize */ 5275 1, /* nsegments */ 5276 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5277 0, /* flags */ 5278 NULL, /* lockfunc */ 5279 NULL, /* lockfuncarg */ 5280 &txr->hn_tx_rndis_dtag); 5281 if (error) { 5282 device_printf(dev, "failed to create rndis dmatag\n"); 5283 return error; 5284 } 5285 5286 /* DMA tag for data. */ 5287 error = bus_dma_tag_create(parent_dtag, /* parent */ 5288 1, /* alignment */ 5289 HN_TX_DATA_BOUNDARY, /* boundary */ 5290 BUS_SPACE_MAXADDR, /* lowaddr */ 5291 BUS_SPACE_MAXADDR, /* highaddr */ 5292 NULL, NULL, /* filter, filterarg */ 5293 HN_TX_DATA_MAXSIZE, /* maxsize */ 5294 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5295 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5296 0, /* flags */ 5297 NULL, /* lockfunc */ 5298 NULL, /* lockfuncarg */ 5299 &txr->hn_tx_data_dtag); 5300 if (error) { 5301 device_printf(dev, "failed to create data dmatag\n"); 5302 return error; 5303 } 5304 5305 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5306 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5307 5308 txd->txr = txr; 5309 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5310 STAILQ_INIT(&txd->agg_list); 5311 5312 /* 5313 * Allocate and load RNDIS packet message. 5314 */ 5315 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5316 (void **)&txd->rndis_pkt, 5317 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5318 &txd->rndis_pkt_dmap); 5319 if (error) { 5320 device_printf(dev, 5321 "failed to allocate rndis_packet_msg, %d\n", i); 5322 return error; 5323 } 5324 5325 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5326 txd->rndis_pkt_dmap, 5327 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5328 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5329 BUS_DMA_NOWAIT); 5330 if (error) { 5331 device_printf(dev, 5332 "failed to load rndis_packet_msg, %d\n", i); 5333 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5334 txd->rndis_pkt, txd->rndis_pkt_dmap); 5335 return error; 5336 } 5337 5338 /* DMA map for TX data. 
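One map is created per descriptor; it carries the mbuf chain of a single outgoing packet.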
*/ 5339 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5340 &txd->data_dmap); 5341 if (error) { 5342 device_printf(dev, 5343 "failed to allocate tx data dmamap\n"); 5344 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5345 txd->rndis_pkt_dmap); 5346 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5347 txd->rndis_pkt, txd->rndis_pkt_dmap); 5348 return error; 5349 } 5350 5351 /* All set, put it to list */ 5352 txd->flags |= HN_TXD_FLAG_ONLIST; 5353 #ifndef HN_USE_TXDESC_BUFRING 5354 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5355 #else 5356 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5357 #endif 5358 } 5359 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5360 5361 if (sc->hn_tx_sysctl_tree != NULL) { 5362 struct sysctl_oid_list *child; 5363 struct sysctl_ctx_list *ctx; 5364 char name[16]; 5365 5366 /* 5367 * Create per TX ring sysctl tree: 5368 * dev.hn.UNIT.tx.RINGID 5369 */ 5370 ctx = device_get_sysctl_ctx(dev); 5371 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5372 5373 snprintf(name, sizeof(name), "%d", id); 5374 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5375 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5376 5377 if (txr->hn_tx_sysctl_tree != NULL) { 5378 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5379 5380 #ifdef HN_DEBUG 5381 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5382 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5383 "# of available TX descs"); 5384 #endif 5385 #ifdef HN_IFSTART_SUPPORT 5386 if (!hn_use_if_start) 5387 #endif 5388 { 5389 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5390 CTLFLAG_RD, &txr->hn_oactive, 0, 5391 "over active"); 5392 } 5393 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5394 CTLFLAG_RW, &txr->hn_pkts, 5395 "# of packets transmitted"); 5396 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5397 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5398 } 5399 } 5400 5401 return 0; 5402 } 5403 5404 static void 5405 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5406 { 5407 struct hn_tx_ring *txr = txd->txr; 5408 5409 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5410 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5411 5412 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5413 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5414 txd->rndis_pkt_dmap); 5415 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5416 } 5417 5418 static void 5419 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5420 { 5421 5422 KASSERT(txd->refs == 0 || txd->refs == 1, 5423 ("invalid txd refs %d", txd->refs)); 5424 5425 /* Aggregated txds will be freed by their aggregating txd. */ 5426 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5427 int freed; 5428 5429 freed = hn_txdesc_put(txr, txd); 5430 KASSERT(freed, ("can't free txdesc")); 5431 } 5432 } 5433 5434 static void 5435 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5436 { 5437 int i; 5438 5439 if (txr->hn_txdesc == NULL) 5440 return; 5441 5442 /* 5443 * NOTE: 5444 * Because the freeing of aggregated txds will be deferred 5445 * to the aggregating txd, two passes are used here: 5446 * - The first pass GCes any pending txds. This GC is necessary, 5447 * since if the channels are revoked, hypervisor will not 5448 * deliver send-done for all pending txds. 5449 * - The second pass frees the busdma stuffs, i.e. after all txds 5450 * were freed. 
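* The busdma tags themselves can only be destroyed after every map created from them has been released.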
5451 */ 5452 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5453 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5454 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5455 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5456 5457 if (txr->hn_tx_data_dtag != NULL) 5458 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5459 if (txr->hn_tx_rndis_dtag != NULL) 5460 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5461 5462 #ifdef HN_USE_TXDESC_BUFRING 5463 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5464 #endif 5465 5466 free(txr->hn_txdesc, M_DEVBUF); 5467 txr->hn_txdesc = NULL; 5468 5469 if (txr->hn_mbuf_br != NULL) 5470 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5471 5472 #ifndef HN_USE_TXDESC_BUFRING 5473 mtx_destroy(&txr->hn_txlist_spin); 5474 #endif 5475 mtx_destroy(&txr->hn_tx_lock); 5476 } 5477 5478 static int 5479 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5480 { 5481 struct sysctl_oid_list *child; 5482 struct sysctl_ctx_list *ctx; 5483 int i; 5484 5485 /* 5486 * Create TXBUF for chimney sending. 5487 * 5488 * NOTE: It is shared by all channels. 5489 */ 5490 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5491 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5492 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5493 if (sc->hn_chim == NULL) { 5494 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5495 return (ENOMEM); 5496 } 5497 5498 sc->hn_tx_ring_cnt = ring_cnt; 5499 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5500 5501 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5502 M_DEVBUF, M_WAITOK | M_ZERO); 5503 5504 ctx = device_get_sysctl_ctx(sc->hn_dev); 5505 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5506 5507 /* Create dev.hn.UNIT.tx sysctl tree */ 5508 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5509 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5510 5511 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5512 int error; 5513 5514 error = hn_tx_ring_create(sc, i); 5515 if (error) 5516 return error; 5517 } 5518 5519 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5520 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5521 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5522 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5523 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5524 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5525 __offsetof(struct hn_tx_ring, hn_send_failed), 5526 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5527 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5528 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5529 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5530 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5531 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5532 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5533 __offsetof(struct hn_tx_ring, hn_flush_failed), 5534 hn_tx_stat_ulong_sysctl, "LU", 5535 "# of packet transmission aggregation flush failure"); 5536 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5537 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5538 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5539 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5541 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5542 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5543 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5544 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5545 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5546 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5547 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5548 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5549 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5550 "# of total TX descs"); 5551 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5552 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5553 "Chimney send packet size upper boundary"); 5554 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5555 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5556 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5557 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5558 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5559 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5560 hn_tx_conf_int_sysctl, "I", 5561 "Size of the packet for direct transmission"); 5562 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5563 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5564 __offsetof(struct hn_tx_ring, hn_sched_tx), 5565 hn_tx_conf_int_sysctl, "I", 5566 "Always schedule transmission " 5567 "instead of doing direct transmission"); 5568 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5569 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5570 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5571 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5572 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5573 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5574 "Applied packet transmission aggregation size"); 5575 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5576 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5577 hn_txagg_pktmax_sysctl, "I", 5578 "Applied packet transmission aggregation packets"); 5579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5580 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5581 hn_txagg_align_sysctl, "I", 5582 "Applied packet transmission aggregation alignment"); 5583 5584 return 0; 5585 } 5586 5587 static void 5588 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5589 { 5590 int i; 5591 5592 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5593 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5594 } 5595 5596 static void 5597 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5598 { 5599 struct ifnet *ifp = sc->hn_ifp; 5600 u_int hw_tsomax; 5601 int tso_minlen; 5602 5603 HN_LOCK_ASSERT(sc); 5604 5605 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5606 return; 5607 5608 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5609 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5610 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5611 5612 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5613 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5614 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5615 5616 if (tso_maxlen < tso_minlen) 5617 tso_maxlen = tso_minlen; 5618 else if (tso_maxlen > IP_MAXPACKET) 5619 tso_maxlen = IP_MAXPACKET; 5620 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5621 tso_maxlen = sc->hn_ndis_tso_szmax; 5622 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5623 5624 if (hn_xpnt_vf_isready(sc)) { 5625 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5626 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5627 } 5628 ifp->if_hw_tsomax = hw_tsomax; 5629 if (bootverbose) 5630 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5631 } 5632 5633 static void 5634 hn_fixup_tx_data(struct hn_softc *sc) 5635 { 5636 uint64_t csum_assist; 5637 int i; 5638 5639 hn_set_chim_size(sc, sc->hn_chim_szmax); 5640 if (hn_tx_chimney_size > 0 && 5641 hn_tx_chimney_size < sc->hn_chim_szmax) 5642 hn_set_chim_size(sc, 
hn_tx_chimney_size); 5643 5644 csum_assist = 0; 5645 if (sc->hn_caps & HN_CAP_IPCS) 5646 csum_assist |= CSUM_IP; 5647 if (sc->hn_caps & HN_CAP_TCP4CS) 5648 csum_assist |= CSUM_IP_TCP; 5649 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5650 csum_assist |= CSUM_IP_UDP; 5651 if (sc->hn_caps & HN_CAP_TCP6CS) 5652 csum_assist |= CSUM_IP6_TCP; 5653 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5654 csum_assist |= CSUM_IP6_UDP; 5655 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5656 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5657 5658 if (sc->hn_caps & HN_CAP_HASHVAL) { 5659 /* 5660 * Support HASHVAL pktinfo on TX path. 5661 */ 5662 if (bootverbose) 5663 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5664 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5665 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5666 } 5667 } 5668 5669 static void 5670 hn_fixup_rx_data(struct hn_softc *sc) 5671 { 5672 5673 if (sc->hn_caps & HN_CAP_UDPHASH) { 5674 int i; 5675 5676 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5677 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5678 } 5679 } 5680 5681 static void 5682 hn_destroy_tx_data(struct hn_softc *sc) 5683 { 5684 int i; 5685 5686 if (sc->hn_chim != NULL) { 5687 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5688 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5689 } else { 5690 device_printf(sc->hn_dev, 5691 "chimney sending buffer is referenced"); 5692 } 5693 sc->hn_chim = NULL; 5694 } 5695 5696 if (sc->hn_tx_ring_cnt == 0) 5697 return; 5698 5699 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5700 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5701 5702 free(sc->hn_tx_ring, M_DEVBUF); 5703 sc->hn_tx_ring = NULL; 5704 5705 sc->hn_tx_ring_cnt = 0; 5706 sc->hn_tx_ring_inuse = 0; 5707 } 5708 5709 #ifdef HN_IFSTART_SUPPORT 5710 5711 static void 5712 hn_start_taskfunc(void *xtxr, int pending __unused) 5713 { 5714 struct hn_tx_ring *txr = xtxr; 5715 5716 mtx_lock(&txr->hn_tx_lock); 5717 hn_start_locked(txr, 0); 5718 mtx_unlock(&txr->hn_tx_lock); 5719 } 5720 5721 static int 5722 hn_start_locked(struct hn_tx_ring *txr, int len) 5723 { 5724 struct hn_softc *sc = txr->hn_sc; 5725 struct ifnet *ifp = sc->hn_ifp; 5726 int sched = 0; 5727 5728 KASSERT(hn_use_if_start, 5729 ("hn_start_locked is called, when if_start is disabled")); 5730 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5731 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5732 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5733 5734 if (__predict_false(txr->hn_suspended)) 5735 return (0); 5736 5737 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5738 IFF_DRV_RUNNING) 5739 return (0); 5740 5741 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5742 struct hn_txdesc *txd; 5743 struct mbuf *m_head; 5744 int error; 5745 5746 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5747 if (m_head == NULL) 5748 break; 5749 5750 if (len > 0 && m_head->m_pkthdr.len > len) { 5751 /* 5752 * This sending could be time consuming; let callers 5753 * dispatch this packet sending (and sending of any 5754 * following up packets) to tx taskqueue. 
5755 */ 5756 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5757 sched = 1; 5758 break; 5759 } 5760 5761 #if defined(INET6) || defined(INET) 5762 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5763 m_head = hn_tso_fixup(m_head); 5764 if (__predict_false(m_head == NULL)) { 5765 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5766 continue; 5767 } 5768 } else if (m_head->m_pkthdr.csum_flags & 5769 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5770 m_head = hn_set_hlen(m_head); 5771 if (__predict_false(m_head == NULL)) { 5772 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5773 continue; 5774 } 5775 } 5776 #endif 5777 5778 txd = hn_txdesc_get(txr); 5779 if (txd == NULL) { 5780 txr->hn_no_txdescs++; 5781 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5782 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5783 break; 5784 } 5785 5786 error = hn_encap(ifp, txr, txd, &m_head); 5787 if (error) { 5788 /* Both txd and m_head are freed */ 5789 KASSERT(txr->hn_agg_txd == NULL, 5790 ("encap failed w/ pending aggregating txdesc")); 5791 continue; 5792 } 5793 5794 if (txr->hn_agg_pktleft == 0) { 5795 if (txr->hn_agg_txd != NULL) { 5796 KASSERT(m_head == NULL, 5797 ("pending mbuf for aggregating txdesc")); 5798 error = hn_flush_txagg(ifp, txr); 5799 if (__predict_false(error)) { 5800 atomic_set_int(&ifp->if_drv_flags, 5801 IFF_DRV_OACTIVE); 5802 break; 5803 } 5804 } else { 5805 KASSERT(m_head != NULL, ("mbuf was freed")); 5806 error = hn_txpkt(ifp, txr, txd); 5807 if (__predict_false(error)) { 5808 /* txd is freed, but m_head is not */ 5809 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5810 atomic_set_int(&ifp->if_drv_flags, 5811 IFF_DRV_OACTIVE); 5812 break; 5813 } 5814 } 5815 } 5816 #ifdef INVARIANTS 5817 else { 5818 KASSERT(txr->hn_agg_txd != NULL, 5819 ("no aggregating txdesc")); 5820 KASSERT(m_head == NULL, 5821 ("pending mbuf for aggregating txdesc")); 5822 } 5823 #endif 5824 } 5825 5826 /* Flush pending aggerated transmission. */ 5827 if (txr->hn_agg_txd != NULL) 5828 hn_flush_txagg(ifp, txr); 5829 return (sched); 5830 } 5831 5832 static void 5833 hn_start(struct ifnet *ifp) 5834 { 5835 struct hn_softc *sc = ifp->if_softc; 5836 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5837 5838 if (txr->hn_sched_tx) 5839 goto do_sched; 5840 5841 if (mtx_trylock(&txr->hn_tx_lock)) { 5842 int sched; 5843 5844 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5845 mtx_unlock(&txr->hn_tx_lock); 5846 if (!sched) 5847 return; 5848 } 5849 do_sched: 5850 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5851 } 5852 5853 static void 5854 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5855 { 5856 struct hn_tx_ring *txr = xtxr; 5857 5858 mtx_lock(&txr->hn_tx_lock); 5859 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5860 hn_start_locked(txr, 0); 5861 mtx_unlock(&txr->hn_tx_lock); 5862 } 5863 5864 static void 5865 hn_start_txeof(struct hn_tx_ring *txr) 5866 { 5867 struct hn_softc *sc = txr->hn_sc; 5868 struct ifnet *ifp = sc->hn_ifp; 5869 5870 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5871 5872 if (txr->hn_sched_tx) 5873 goto do_sched; 5874 5875 if (mtx_trylock(&txr->hn_tx_lock)) { 5876 int sched; 5877 5878 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5879 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5880 mtx_unlock(&txr->hn_tx_lock); 5881 if (sched) { 5882 taskqueue_enqueue(txr->hn_tx_taskq, 5883 &txr->hn_tx_task); 5884 } 5885 } else { 5886 do_sched: 5887 /* 5888 * Release the OACTIVE earlier, with the hope, that 5889 * others could catch up. 
The task will clear the 5890 * flag again with the hn_tx_lock to avoid possible 5891 * races. 5892 */ 5893 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5894 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5895 } 5896 } 5897 5898 #endif /* HN_IFSTART_SUPPORT */ 5899 5900 static int 5901 hn_xmit(struct hn_tx_ring *txr, int len) 5902 { 5903 struct hn_softc *sc = txr->hn_sc; 5904 struct ifnet *ifp = sc->hn_ifp; 5905 struct mbuf *m_head; 5906 int sched = 0; 5907 5908 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5909 #ifdef HN_IFSTART_SUPPORT 5910 KASSERT(hn_use_if_start == 0, 5911 ("hn_xmit is called, when if_start is enabled")); 5912 #endif 5913 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5914 5915 if (__predict_false(txr->hn_suspended)) 5916 return (0); 5917 5918 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5919 return (0); 5920 5921 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5922 struct hn_txdesc *txd; 5923 int error; 5924 5925 if (len > 0 && m_head->m_pkthdr.len > len) { 5926 /* 5927 * This sending could be time consuming; let callers 5928 * dispatch this packet sending (and sending of any 5929 * following up packets) to tx taskqueue. 5930 */ 5931 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5932 sched = 1; 5933 break; 5934 } 5935 5936 txd = hn_txdesc_get(txr); 5937 if (txd == NULL) { 5938 txr->hn_no_txdescs++; 5939 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5940 txr->hn_oactive = 1; 5941 break; 5942 } 5943 5944 error = hn_encap(ifp, txr, txd, &m_head); 5945 if (error) { 5946 /* Both txd and m_head are freed; discard */ 5947 KASSERT(txr->hn_agg_txd == NULL, 5948 ("encap failed w/ pending aggregating txdesc")); 5949 drbr_advance(ifp, txr->hn_mbuf_br); 5950 continue; 5951 } 5952 5953 if (txr->hn_agg_pktleft == 0) { 5954 if (txr->hn_agg_txd != NULL) { 5955 KASSERT(m_head == NULL, 5956 ("pending mbuf for aggregating txdesc")); 5957 error = hn_flush_txagg(ifp, txr); 5958 if (__predict_false(error)) { 5959 txr->hn_oactive = 1; 5960 break; 5961 } 5962 } else { 5963 KASSERT(m_head != NULL, ("mbuf was freed")); 5964 error = hn_txpkt(ifp, txr, txd); 5965 if (__predict_false(error)) { 5966 /* txd is freed, but m_head is not */ 5967 drbr_putback(ifp, txr->hn_mbuf_br, 5968 m_head); 5969 txr->hn_oactive = 1; 5970 break; 5971 } 5972 } 5973 } 5974 #ifdef INVARIANTS 5975 else { 5976 KASSERT(txr->hn_agg_txd != NULL, 5977 ("no aggregating txdesc")); 5978 KASSERT(m_head == NULL, 5979 ("pending mbuf for aggregating txdesc")); 5980 } 5981 #endif 5982 5983 /* Sent */ 5984 drbr_advance(ifp, txr->hn_mbuf_br); 5985 } 5986 5987 /* Flush pending aggerated transmission. */ 5988 if (txr->hn_agg_txd != NULL) 5989 hn_flush_txagg(ifp, txr); 5990 return (sched); 5991 } 5992 5993 static int 5994 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5995 { 5996 struct hn_softc *sc = ifp->if_softc; 5997 struct hn_tx_ring *txr; 5998 int error, idx = 0; 5999 6000 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6001 struct rm_priotracker pt; 6002 6003 rm_rlock(&sc->hn_vf_lock, &pt); 6004 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6005 struct mbuf *m_bpf = NULL; 6006 int obytes, omcast; 6007 6008 obytes = m->m_pkthdr.len; 6009 omcast = (m->m_flags & M_MCAST) != 0; 6010 6011 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6012 if (bpf_peers_present(ifp->if_bpf)) { 6013 m_bpf = m_copypacket(m, M_NOWAIT); 6014 if (m_bpf == NULL) { 6015 /* 6016 * Failed to grab a shallow 6017 * copy; tap now. 
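 * (Normally the copy is tapped only after the VF's if_transmit() succeeds; without a copy, tap the original mbuf here before it is handed to the VF.)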
6018 */ 6019 ETHER_BPF_MTAP(ifp, m); 6020 } 6021 } 6022 } else { 6023 ETHER_BPF_MTAP(ifp, m); 6024 } 6025 6026 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6027 rm_runlock(&sc->hn_vf_lock, &pt); 6028 6029 if (m_bpf != NULL) { 6030 if (!error) 6031 ETHER_BPF_MTAP(ifp, m_bpf); 6032 m_freem(m_bpf); 6033 } 6034 6035 if (error == ENOBUFS) { 6036 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6037 } else if (error) { 6038 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6039 } else { 6040 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6041 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6042 if (omcast) { 6043 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6044 omcast); 6045 } 6046 } 6047 return (error); 6048 } 6049 rm_runlock(&sc->hn_vf_lock, &pt); 6050 } 6051 6052 #if defined(INET6) || defined(INET) 6053 /* 6054 * Perform TSO packet header fixup or get l2/l3 header length now, 6055 * since packet headers should be cache-hot. 6056 */ 6057 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6058 m = hn_tso_fixup(m); 6059 if (__predict_false(m == NULL)) { 6060 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6061 return EIO; 6062 } 6063 } else if (m->m_pkthdr.csum_flags & 6064 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6065 m = hn_set_hlen(m); 6066 if (__predict_false(m == NULL)) { 6067 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6068 return EIO; 6069 } 6070 } 6071 #endif 6072 6073 /* 6074 * Select the TX ring based on flowid 6075 */ 6076 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6077 #ifdef RSS 6078 uint32_t bid; 6079 6080 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6081 &bid) == 0) 6082 idx = bid % sc->hn_tx_ring_inuse; 6083 else 6084 #endif 6085 { 6086 #if defined(INET6) || defined(INET) 6087 int tcpsyn = 0; 6088 6089 if (m->m_pkthdr.len < 128 && 6090 (m->m_pkthdr.csum_flags & 6091 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6092 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6093 m = hn_check_tcpsyn(m, &tcpsyn); 6094 if (__predict_false(m == NULL)) { 6095 if_inc_counter(ifp, 6096 IFCOUNTER_OERRORS, 1); 6097 return (EIO); 6098 } 6099 } 6100 #else 6101 const int tcpsyn = 0; 6102 #endif 6103 if (tcpsyn) 6104 idx = 0; 6105 else 6106 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6107 } 6108 } 6109 txr = &sc->hn_tx_ring[idx]; 6110 6111 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6112 if (error) { 6113 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6114 return error; 6115 } 6116 6117 if (txr->hn_oactive) 6118 return 0; 6119 6120 if (txr->hn_sched_tx) 6121 goto do_sched; 6122 6123 if (mtx_trylock(&txr->hn_tx_lock)) { 6124 int sched; 6125 6126 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6127 mtx_unlock(&txr->hn_tx_lock); 6128 if (!sched) 6129 return 0; 6130 } 6131 do_sched: 6132 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6133 return 0; 6134 } 6135 6136 static void 6137 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6138 { 6139 struct mbuf *m; 6140 6141 mtx_lock(&txr->hn_tx_lock); 6142 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6143 m_freem(m); 6144 mtx_unlock(&txr->hn_tx_lock); 6145 } 6146 6147 static void 6148 hn_xmit_qflush(struct ifnet *ifp) 6149 { 6150 struct hn_softc *sc = ifp->if_softc; 6151 struct rm_priotracker pt; 6152 int i; 6153 6154 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6155 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6156 if_qflush(ifp); 6157 6158 rm_rlock(&sc->hn_vf_lock, &pt); 6159 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6160 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6161 rm_runlock(&sc->hn_vf_lock, &pt); 6162 } 6163 6164 static void 6165 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6166 { 6167 6168 if (txr->hn_sched_tx) 6169 goto do_sched; 6170 6171 if (mtx_trylock(&txr->hn_tx_lock)) { 6172 int sched; 6173 6174 txr->hn_oactive = 0; 6175 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6176 mtx_unlock(&txr->hn_tx_lock); 6177 if (sched) { 6178 taskqueue_enqueue(txr->hn_tx_taskq, 6179 &txr->hn_tx_task); 6180 } 6181 } else { 6182 do_sched: 6183 /* 6184 * Release the oactive earlier, with the hope, that 6185 * others could catch up. The task will clear the 6186 * oactive again with the hn_tx_lock to avoid possible 6187 * races. 6188 */ 6189 txr->hn_oactive = 0; 6190 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6191 } 6192 } 6193 6194 static void 6195 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6196 { 6197 struct hn_tx_ring *txr = xtxr; 6198 6199 mtx_lock(&txr->hn_tx_lock); 6200 hn_xmit(txr, 0); 6201 mtx_unlock(&txr->hn_tx_lock); 6202 } 6203 6204 static void 6205 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6206 { 6207 struct hn_tx_ring *txr = xtxr; 6208 6209 mtx_lock(&txr->hn_tx_lock); 6210 txr->hn_oactive = 0; 6211 hn_xmit(txr, 0); 6212 mtx_unlock(&txr->hn_tx_lock); 6213 } 6214 6215 static int 6216 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6217 { 6218 struct vmbus_chan_br cbr; 6219 struct hn_rx_ring *rxr; 6220 struct hn_tx_ring *txr = NULL; 6221 int idx, error; 6222 6223 idx = vmbus_chan_subidx(chan); 6224 6225 /* 6226 * Link this channel to RX/TX ring. 6227 */ 6228 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6229 ("invalid channel index %d, should > 0 && < %d", 6230 idx, sc->hn_rx_ring_inuse)); 6231 rxr = &sc->hn_rx_ring[idx]; 6232 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6233 ("RX ring %d already attached", idx)); 6234 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6235 rxr->hn_chan = chan; 6236 6237 if (bootverbose) { 6238 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6239 idx, vmbus_chan_id(chan)); 6240 } 6241 6242 if (idx < sc->hn_tx_ring_inuse) { 6243 txr = &sc->hn_tx_ring[idx]; 6244 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6245 ("TX ring %d already attached", idx)); 6246 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6247 6248 txr->hn_chan = chan; 6249 if (bootverbose) { 6250 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6251 idx, vmbus_chan_id(chan)); 6252 } 6253 } 6254 6255 /* Bind this channel to a proper CPU. */ 6256 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6257 6258 /* 6259 * Open this channel 6260 */ 6261 cbr.cbr = rxr->hn_br; 6262 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6263 cbr.cbr_txsz = HN_TXBR_SIZE; 6264 cbr.cbr_rxsz = HN_RXBR_SIZE; 6265 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6266 if (error) { 6267 if (error == EISCONN) { 6268 if_printf(sc->hn_ifp, "bufring is connected after " 6269 "chan%u open failure\n", vmbus_chan_id(chan)); 6270 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6271 } else { 6272 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6273 vmbus_chan_id(chan), error); 6274 } 6275 } 6276 return (error); 6277 } 6278 6279 static void 6280 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6281 { 6282 struct hn_rx_ring *rxr; 6283 int idx, error; 6284 6285 idx = vmbus_chan_subidx(chan); 6286 6287 /* 6288 * Link this channel to RX/TX ring. 
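 * (For detach this works in reverse: locate the RX/TX ring that was linked to this channel by hn_chan_attach() and clear its ATTACHED flag.)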
6289 */ 6290 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6291 ("invalid channel index %d, should > 0 && < %d", 6292 idx, sc->hn_rx_ring_inuse)); 6293 rxr = &sc->hn_rx_ring[idx]; 6294 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6295 ("RX ring %d is not attached", idx)); 6296 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6297 6298 if (idx < sc->hn_tx_ring_inuse) { 6299 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6300 6301 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6302 ("TX ring %d is not attached", idx)); 6303 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6304 } 6305 6306 /* 6307 * Close this channel. 6308 * 6309 * NOTE: 6310 * Channel closing does _not_ destroy the target channel. 6311 */ 6312 error = vmbus_chan_close_direct(chan); 6313 if (error == EISCONN) { 6314 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6315 "after being closed\n", vmbus_chan_id(chan)); 6316 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6317 } else if (error) { 6318 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6319 vmbus_chan_id(chan), error); 6320 } 6321 } 6322 6323 static int 6324 hn_attach_subchans(struct hn_softc *sc) 6325 { 6326 struct vmbus_channel **subchans; 6327 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6328 int i, error = 0; 6329 6330 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6331 6332 /* Attach the sub-channels. */ 6333 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6334 for (i = 0; i < subchan_cnt; ++i) { 6335 int error1; 6336 6337 error1 = hn_chan_attach(sc, subchans[i]); 6338 if (error1) { 6339 error = error1; 6340 /* Move on; all channels will be detached later. */ 6341 } 6342 } 6343 vmbus_subchan_rel(subchans, subchan_cnt); 6344 6345 if (error) { 6346 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6347 } else { 6348 if (bootverbose) { 6349 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6350 subchan_cnt); 6351 } 6352 } 6353 return (error); 6354 } 6355 6356 static void 6357 hn_detach_allchans(struct hn_softc *sc) 6358 { 6359 struct vmbus_channel **subchans; 6360 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6361 int i; 6362 6363 if (subchan_cnt == 0) 6364 goto back; 6365 6366 /* Detach the sub-channels. */ 6367 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6368 for (i = 0; i < subchan_cnt; ++i) 6369 hn_chan_detach(sc, subchans[i]); 6370 vmbus_subchan_rel(subchans, subchan_cnt); 6371 6372 back: 6373 /* 6374 * Detach the primary channel, _after_ all sub-channels 6375 * are detached. 6376 */ 6377 hn_chan_detach(sc, sc->hn_prichan); 6378 6379 /* Wait for sub-channels to be destroyed, if any. */ 6380 vmbus_subchan_drain(sc->hn_prichan); 6381 6382 #ifdef INVARIANTS 6383 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6384 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6385 HN_RX_FLAG_ATTACHED) == 0, 6386 ("%dth RX ring is still attached", i)); 6387 } 6388 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6389 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6390 HN_TX_FLAG_ATTACHED) == 0, 6391 ("%dth TX ring is still attached", i)); 6392 } 6393 #endif 6394 } 6395 6396 static int 6397 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6398 { 6399 struct vmbus_channel **subchans; 6400 int nchan, rxr_cnt, error; 6401 6402 nchan = *nsubch + 1; 6403 if (nchan == 1) { 6404 /* 6405 * Multiple RX/TX rings are not requested. 6406 */ 6407 *nsubch = 0; 6408 return (0); 6409 } 6410 6411 /* 6412 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6413 * table entries.
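 * (If the query fails, vRSS is simply unavailable; the error is treated as benign and the device falls back to the primary channel only.)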
6414 */ 6415 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6416 if (error) { 6417 /* No RSS; this is benign. */ 6418 *nsubch = 0; 6419 return (0); 6420 } 6421 if (bootverbose) { 6422 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6423 rxr_cnt, nchan); 6424 } 6425 6426 if (nchan > rxr_cnt) 6427 nchan = rxr_cnt; 6428 if (nchan == 1) { 6429 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6430 *nsubch = 0; 6431 return (0); 6432 } 6433 6434 /* 6435 * Allocate sub-channels from NVS. 6436 */ 6437 *nsubch = nchan - 1; 6438 error = hn_nvs_alloc_subchans(sc, nsubch); 6439 if (error || *nsubch == 0) { 6440 /* Failed to allocate sub-channels. */ 6441 *nsubch = 0; 6442 return (0); 6443 } 6444 6445 /* 6446 * Wait for all sub-channels to become ready before moving on. 6447 */ 6448 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6449 vmbus_subchan_rel(subchans, *nsubch); 6450 return (0); 6451 } 6452 6453 static bool 6454 hn_synth_attachable(const struct hn_softc *sc) 6455 { 6456 int i; 6457 6458 if (sc->hn_flags & HN_FLAG_ERRORS) 6459 return (false); 6460 6461 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6462 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6463 6464 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6465 return (false); 6466 } 6467 return (true); 6468 } 6469 6470 /* 6471 * Make sure that the RX filter is zero after the successful 6472 * RNDIS initialization. 6473 * 6474 * NOTE: 6475 * Under certain conditions on certain versions of Hyper-V, 6476 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6477 * after the successful RNDIS initialization, which breaks 6478 * the assumption of any following code (well, it breaks the 6479 * RNDIS API contract actually). Clear the RNDIS rxfilter 6480 * explicitly, drain packets sneaking through, and drain the 6481 * interrupt taskqueues scheduled due to the stealth packets. 6482 */ 6483 static void 6484 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6485 { 6486 6487 hn_disable_rx(sc); 6488 hn_drain_rxtx(sc, nchan); 6489 } 6490 6491 static int 6492 hn_synth_attach(struct hn_softc *sc, int mtu) 6493 { 6494 #define ATTACHED_NVS 0x0002 6495 #define ATTACHED_RNDIS 0x0004 6496 6497 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6498 int error, nsubch, nchan = 1, i, rndis_inited; 6499 uint32_t old_caps, attached = 0; 6500 6501 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6502 ("synthetic parts were attached")); 6503 6504 if (!hn_synth_attachable(sc)) 6505 return (ENXIO); 6506 6507 /* Save capabilities for later verification. */ 6508 old_caps = sc->hn_caps; 6509 sc->hn_caps = 0; 6510 6511 /* Clear RSS stuffs. */ 6512 sc->hn_rss_ind_size = 0; 6513 sc->hn_rss_hash = 0; 6514 sc->hn_rss_hcap = 0; 6515 6516 /* 6517 * Attach the primary channel _before_ attaching NVS and RNDIS. 6518 */ 6519 error = hn_chan_attach(sc, sc->hn_prichan); 6520 if (error) 6521 goto failed; 6522 6523 /* 6524 * Attach NVS. 6525 */ 6526 error = hn_nvs_attach(sc, mtu); 6527 if (error) 6528 goto failed; 6529 attached |= ATTACHED_NVS; 6530 6531 /* 6532 * Attach RNDIS _after_ NVS is attached. 6533 */ 6534 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6535 if (rndis_inited) 6536 attached |= ATTACHED_RNDIS; 6537 if (error) 6538 goto failed; 6539 6540 /* 6541 * Make sure capabilities are not changed. 
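 * hn_caps was cleared above and rebuilt during RNDIS attach; on a re-attach it must match the capabilities saved in old_caps, since the ifnet capabilities configured at initial attach were derived from them.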
6542 */ 6543 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6544 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6545 old_caps, sc->hn_caps); 6546 error = ENXIO; 6547 goto failed; 6548 } 6549 6550 /* 6551 * Allocate sub-channels for multi-TX/RX rings. 6552 * 6553 * NOTE: 6554 * The # of RX rings that can be used is equivalent to the # of 6555 * channels to be requested. 6556 */ 6557 nsubch = sc->hn_rx_ring_cnt - 1; 6558 error = hn_synth_alloc_subchans(sc, &nsubch); 6559 if (error) 6560 goto failed; 6561 /* NOTE: _Full_ synthetic parts detach is required now. */ 6562 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6563 6564 /* 6565 * Set the # of TX/RX rings that could be used according to 6566 * the # of channels that NVS offered. 6567 */ 6568 nchan = nsubch + 1; 6569 hn_set_ring_inuse(sc, nchan); 6570 if (nchan == 1) { 6571 /* Only the primary channel can be used; done */ 6572 goto back; 6573 } 6574 6575 /* 6576 * Attach the sub-channels. 6577 * 6578 * NOTE: hn_set_ring_inuse() _must_ have been called. 6579 */ 6580 error = hn_attach_subchans(sc); 6581 if (error) 6582 goto failed; 6583 6584 /* 6585 * Configure RSS key and indirect table _after_ all sub-channels 6586 * are attached. 6587 */ 6588 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6589 /* 6590 * RSS key is not set yet; set it to the default RSS key. 6591 */ 6592 if (bootverbose) 6593 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6594 #ifdef RSS 6595 rss_getkey(rss->rss_key); 6596 #else 6597 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6598 #endif 6599 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6600 } 6601 6602 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6603 /* 6604 * RSS indirect table is not set yet; set it up in round- 6605 * robin fashion. 6606 */ 6607 if (bootverbose) { 6608 if_printf(sc->hn_ifp, "setup default RSS indirect " 6609 "table\n"); 6610 } 6611 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6612 uint32_t subidx; 6613 6614 #ifdef RSS 6615 subidx = rss_get_indirection_to_bucket(i); 6616 #else 6617 subidx = i; 6618 #endif 6619 rss->rss_ind[i] = subidx % nchan; 6620 } 6621 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6622 } else { 6623 /* 6624 * # of usable channels may be changed, so we have to 6625 * make sure that all entries in RSS indirect table 6626 * are valid. 6627 * 6628 * NOTE: hn_set_ring_inuse() _must_ have been called. 6629 */ 6630 hn_rss_ind_fixup(sc); 6631 } 6632 6633 sc->hn_rss_hash = sc->hn_rss_hcap; 6634 if ((sc->hn_flags & HN_FLAG_RXVF) || 6635 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6636 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6637 hn_vf_rss_fixup(sc, false); 6638 } 6639 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6640 if (error) 6641 goto failed; 6642 back: 6643 /* 6644 * Fixup transmission aggregation setup. 6645 */ 6646 hn_set_txagg(sc); 6647 hn_rndis_init_fixat(sc, nchan); 6648 return (0); 6649 6650 failed: 6651 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6652 hn_rndis_init_fixat(sc, nchan); 6653 hn_synth_detach(sc); 6654 } else { 6655 if (attached & ATTACHED_RNDIS) { 6656 hn_rndis_init_fixat(sc, nchan); 6657 hn_rndis_detach(sc); 6658 } 6659 if (attached & ATTACHED_NVS) 6660 hn_nvs_detach(sc); 6661 hn_chan_detach(sc, sc->hn_prichan); 6662 /* Restore old capabilities. */ 6663 sc->hn_caps = old_caps; 6664 } 6665 return (error); 6666 6667 #undef ATTACHED_RNDIS 6668 #undef ATTACHED_NVS 6669 } 6670 6671 /* 6672 * NOTE: 6673 * The interface must have been suspended through hn_suspend(), before 6674 * this function gets called.
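 * In particular, all TX rings are marked suspended and the RX filter has been cleared, so no traffic is in flight on the channels being torn down.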
6675 */ 6676 static void 6677 hn_synth_detach(struct hn_softc *sc) 6678 { 6679 6680 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6681 ("synthetic parts were not attached")); 6682 6683 /* Detach the RNDIS first. */ 6684 hn_rndis_detach(sc); 6685 6686 /* Detach NVS. */ 6687 hn_nvs_detach(sc); 6688 6689 /* Detach all of the channels. */ 6690 hn_detach_allchans(sc); 6691 6692 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6693 /* 6694 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6695 */ 6696 int error; 6697 6698 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6699 sc->hn_rxbuf_gpadl); 6700 if (error) { 6701 if_printf(sc->hn_ifp, 6702 "rxbuf gpadl disconn failed: %d\n", error); 6703 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6704 } 6705 sc->hn_rxbuf_gpadl = 0; 6706 } 6707 6708 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6709 /* 6710 * Host is post-Win2016, disconnect chimney sending buffer from 6711 * primary channel here. 6712 */ 6713 int error; 6714 6715 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6716 sc->hn_chim_gpadl); 6717 if (error) { 6718 if_printf(sc->hn_ifp, 6719 "chim gpadl disconn failed: %d\n", error); 6720 sc->hn_flags |= HN_FLAG_CHIM_REF; 6721 } 6722 sc->hn_chim_gpadl = 0; 6723 } 6724 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6725 } 6726 6727 static void 6728 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6729 { 6730 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6731 ("invalid ring count %d", ring_cnt)); 6732 6733 if (sc->hn_tx_ring_cnt > ring_cnt) 6734 sc->hn_tx_ring_inuse = ring_cnt; 6735 else 6736 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6737 sc->hn_rx_ring_inuse = ring_cnt; 6738 6739 #ifdef RSS 6740 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6741 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6742 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6743 rss_getnumbuckets()); 6744 } 6745 #endif 6746 6747 if (bootverbose) { 6748 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6749 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6750 } 6751 } 6752 6753 static void 6754 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6755 { 6756 6757 /* 6758 * NOTE: 6759 * The TX bufring will not be drained by the hypervisor, 6760 * if the primary channel is revoked. 6761 */ 6762 while (!vmbus_chan_rx_empty(chan) || 6763 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6764 !vmbus_chan_tx_empty(chan))) 6765 pause("waitch", 1); 6766 vmbus_chan_intr_drain(chan); 6767 } 6768 6769 static void 6770 hn_disable_rx(struct hn_softc *sc) 6771 { 6772 6773 /* 6774 * Disable RX by clearing RX filter forcefully. 6775 */ 6776 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6777 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6778 6779 /* 6780 * Give RNDIS enough time to flush all pending data packets. 6781 */ 6782 pause("waitrx", (200 * hz) / 1000); 6783 } 6784 6785 /* 6786 * NOTE: 6787 * RX/TX _must_ have been suspended/disabled, before this function 6788 * is called. 6789 */ 6790 static void 6791 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6792 { 6793 struct vmbus_channel **subch = NULL; 6794 int nsubch; 6795 6796 /* 6797 * Drain RX/TX bufrings and interrupts. 
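 * Sub-channels are drained first and the primary channel last, matching the teardown order used by hn_detach_allchans().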
6798 */ 6799 nsubch = nchan - 1; 6800 if (nsubch > 0) 6801 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6802 6803 if (subch != NULL) { 6804 int i; 6805 6806 for (i = 0; i < nsubch; ++i) 6807 hn_chan_drain(sc, subch[i]); 6808 } 6809 hn_chan_drain(sc, sc->hn_prichan); 6810 6811 if (subch != NULL) 6812 vmbus_subchan_rel(subch, nsubch); 6813 } 6814 6815 static void 6816 hn_suspend_data(struct hn_softc *sc) 6817 { 6818 struct hn_tx_ring *txr; 6819 int i; 6820 6821 HN_LOCK_ASSERT(sc); 6822 6823 /* 6824 * Suspend TX. 6825 */ 6826 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6827 txr = &sc->hn_tx_ring[i]; 6828 6829 mtx_lock(&txr->hn_tx_lock); 6830 txr->hn_suspended = 1; 6831 mtx_unlock(&txr->hn_tx_lock); 6832 /* No one is able to send more packets now. */ 6833 6834 /* 6835 * Wait for all pending sends to finish. 6836 * 6837 * NOTE: 6838 * We will _not_ receive all pending send-done, if the 6839 * primary channel is revoked. 6840 */ 6841 while (hn_tx_ring_pending(txr) && 6842 !vmbus_chan_is_revoked(sc->hn_prichan)) 6843 pause("hnwtx", 1 /* 1 tick */); 6844 } 6845 6846 /* 6847 * Disable RX. 6848 */ 6849 hn_disable_rx(sc); 6850 6851 /* 6852 * Drain RX/TX. 6853 */ 6854 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6855 6856 /* 6857 * Drain any pending TX tasks. 6858 * 6859 * NOTE: 6860 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6861 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6862 */ 6863 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6864 txr = &sc->hn_tx_ring[i]; 6865 6866 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6867 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6868 } 6869 } 6870 6871 static void 6872 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6873 { 6874 6875 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6876 } 6877 6878 static void 6879 hn_suspend_mgmt(struct hn_softc *sc) 6880 { 6881 struct task task; 6882 6883 HN_LOCK_ASSERT(sc); 6884 6885 /* 6886 * Make sure that hn_mgmt_taskq0 can no longer be accessed 6887 * through hn_mgmt_taskq. 6888 */ 6889 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6890 vmbus_chan_run_task(sc->hn_prichan, &task); 6891 6892 /* 6893 * Make sure that all pending management tasks are completed. 6894 */ 6895 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6896 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6897 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6898 } 6899 6900 static void 6901 hn_suspend(struct hn_softc *sc) 6902 { 6903 6904 /* Disable polling. */ 6905 hn_polling(sc, 0); 6906 6907 /* 6908 * If the non-transparent mode VF is activated, the synthetic 6909 * device is receiving packets, so the data path of the 6910 * synthetic device must be suspended. 6911 */ 6912 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6913 (sc->hn_flags & HN_FLAG_RXVF)) 6914 hn_suspend_data(sc); 6915 hn_suspend_mgmt(sc); 6916 } 6917 6918 static void 6919 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6920 { 6921 int i; 6922 6923 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6924 ("invalid TX ring count %d", tx_ring_cnt)); 6925 6926 for (i = 0; i < tx_ring_cnt; ++i) { 6927 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6928 6929 mtx_lock(&txr->hn_tx_lock); 6930 txr->hn_suspended = 0; 6931 mtx_unlock(&txr->hn_tx_lock); 6932 } 6933 } 6934 6935 static void 6936 hn_resume_data(struct hn_softc *sc) 6937 { 6938 int i; 6939 6940 HN_LOCK_ASSERT(sc); 6941 6942 /* 6943 * Re-enable RX.
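 * hn_rxfilter_config() reprograms the RNDIS RX filter that hn_disable_rx() cleared during suspend.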
6944 */ 6945 hn_rxfilter_config(sc); 6946 6947 /* 6948 * Make sure to clear suspend status on "all" TX rings, 6949 * since hn_tx_ring_inuse can be changed after 6950 * hn_suspend_data(). 6951 */ 6952 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6953 6954 #ifdef HN_IFSTART_SUPPORT 6955 if (!hn_use_if_start) 6956 #endif 6957 { 6958 /* 6959 * Flush unused drbrs, since hn_tx_ring_inuse may be 6960 * reduced. 6961 */ 6962 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6963 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6964 } 6965 6966 /* 6967 * Kick start TX. 6968 */ 6969 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6970 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6971 6972 /* 6973 * Use txeof task, so that any pending oactive can be 6974 * cleared properly. 6975 */ 6976 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6977 } 6978 } 6979 6980 static void 6981 hn_resume_mgmt(struct hn_softc *sc) 6982 { 6983 6984 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6985 6986 /* 6987 * Kick off network change detection, if it was pending. 6988 * If no network change was pending, start link status 6989 * checks, which are more lightweight than network change 6990 * detection. 6991 */ 6992 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6993 hn_change_network(sc); 6994 else 6995 hn_update_link_status(sc); 6996 } 6997 6998 static void 6999 hn_resume(struct hn_softc *sc) 7000 { 7001 7002 /* 7003 * If the non-transparent mode VF is activated, the synthetic 7004 * device has to receive packets, so the data path of the 7005 * synthetic device must be resumed. 7006 */ 7007 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7008 (sc->hn_flags & HN_FLAG_RXVF)) 7009 hn_resume_data(sc); 7010 7011 /* 7012 * Don't resume link status change if VF is attached/activated. 7013 * - In the non-transparent VF mode, the synthetic device keeps 7014 * its link marked down until the VF is deactivated, i.e. until the VF is down. 7015 * - In transparent VF mode, VF's media status is used until 7016 * the VF is detached. 7017 */ 7018 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7019 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7020 hn_resume_mgmt(sc); 7021 7022 /* 7023 * Re-enable polling if this interface is running and 7024 * the polling is requested. 7025 */ 7026 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7027 hn_polling(sc, sc->hn_pollhz); 7028 } 7029 7030 static void 7031 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7032 { 7033 const struct rndis_status_msg *msg; 7034 int ofs; 7035 7036 if (dlen < sizeof(*msg)) { 7037 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7038 return; 7039 } 7040 msg = data; 7041 7042 switch (msg->rm_status) { 7043 case RNDIS_STATUS_MEDIA_CONNECT: 7044 case RNDIS_STATUS_MEDIA_DISCONNECT: 7045 hn_update_link_status(sc); 7046 break; 7047 7048 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7049 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7050 /* Not really useful; ignore.
*/ 7051 break; 7052 7053 case RNDIS_STATUS_NETWORK_CHANGE: 7054 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7055 if (dlen < ofs + msg->rm_stbuflen || 7056 msg->rm_stbuflen < sizeof(uint32_t)) { 7057 if_printf(sc->hn_ifp, "network changed\n"); 7058 } else { 7059 uint32_t change; 7060 7061 memcpy(&change, ((const uint8_t *)msg) + ofs, 7062 sizeof(change)); 7063 if_printf(sc->hn_ifp, "network changed, change %u\n", 7064 change); 7065 } 7066 hn_change_network(sc); 7067 break; 7068 7069 default: 7070 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7071 msg->rm_status); 7072 break; 7073 } 7074 } 7075 7076 static int 7077 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7078 { 7079 const struct rndis_pktinfo *pi = info_data; 7080 uint32_t mask = 0; 7081 7082 while (info_dlen != 0) { 7083 const void *data; 7084 uint32_t dlen; 7085 7086 if (__predict_false(info_dlen < sizeof(*pi))) 7087 return (EINVAL); 7088 if (__predict_false(info_dlen < pi->rm_size)) 7089 return (EINVAL); 7090 info_dlen -= pi->rm_size; 7091 7092 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7093 return (EINVAL); 7094 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7095 return (EINVAL); 7096 dlen = pi->rm_size - pi->rm_pktinfooffset; 7097 data = pi->rm_data; 7098 7099 if (pi->rm_internal == 1) { 7100 switch (pi->rm_type) { 7101 case NDIS_PKTINFO_IT_PKTINFO_ID: 7102 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7103 return (EINVAL); 7104 info->pktinfo_id = 7105 (const struct packet_info_id *)data; 7106 mask |= HN_RXINFO_PKTINFO_ID; 7107 break; 7108 7109 default: 7110 goto next; 7111 } 7112 } else { 7113 switch (pi->rm_type) { 7114 case NDIS_PKTINFO_TYPE_VLAN: 7115 if (__predict_false(dlen 7116 < NDIS_VLAN_INFO_SIZE)) 7117 return (EINVAL); 7118 info->vlan_info = (const uint32_t *)data; 7119 mask |= HN_RXINFO_VLAN; 7120 break; 7121 7122 case NDIS_PKTINFO_TYPE_CSUM: 7123 if (__predict_false(dlen 7124 < NDIS_RXCSUM_INFO_SIZE)) 7125 return (EINVAL); 7126 info->csum_info = (const uint32_t *)data; 7127 mask |= HN_RXINFO_CSUM; 7128 break; 7129 7130 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7131 if (__predict_false(dlen 7132 < HN_NDIS_HASH_VALUE_SIZE)) 7133 return (EINVAL); 7134 info->hash_value = (const uint32_t *)data; 7135 mask |= HN_RXINFO_HASHVAL; 7136 break; 7137 7138 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7139 if (__predict_false(dlen 7140 < HN_NDIS_HASH_INFO_SIZE)) 7141 return (EINVAL); 7142 info->hash_info = (const uint32_t *)data; 7143 mask |= HN_RXINFO_HASHINF; 7144 break; 7145 7146 default: 7147 goto next; 7148 } 7149 } 7150 7151 if (mask == HN_RXINFO_ALL) { 7152 /* All found; done */ 7153 break; 7154 } 7155 next: 7156 pi = (const struct rndis_pktinfo *) 7157 ((const uint8_t *)pi + pi->rm_size); 7158 } 7159 7160 /* 7161 * Final fixup. 7162 * - If there is no hash value, invalidate the hash info. 
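 *   A hash-info pktinfo is only meaningful together with a hash value; without the value it cannot be used for the RX hash, so it is dropped here.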
7163 */ 7164 if ((mask & HN_RXINFO_HASHVAL) == 0) 7165 info->hash_info = NULL; 7166 return (0); 7167 } 7168 7169 static __inline bool 7170 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7171 { 7172 7173 if (off < check_off) { 7174 if (__predict_true(off + len <= check_off)) 7175 return (false); 7176 } else if (off > check_off) { 7177 if (__predict_true(check_off + check_len <= off)) 7178 return (false); 7179 } 7180 return (true); 7181 } 7182 7183 static __inline void 7184 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7185 uint32_t len, struct hn_rxinfo *info) 7186 { 7187 uint32_t cnt = rxr->rsc.cnt; 7188 7189 if (cnt) { 7190 rxr->rsc.pktlen += len; 7191 } else { 7192 rxr->rsc.vlan_info = info->vlan_info; 7193 rxr->rsc.csum_info = info->csum_info; 7194 rxr->rsc.hash_info = info->hash_info; 7195 rxr->rsc.hash_value = info->hash_value; 7196 rxr->rsc.pktlen = len; 7197 } 7198 7199 rxr->rsc.frag_data[cnt] = data; 7200 rxr->rsc.frag_len[cnt] = len; 7201 rxr->rsc.cnt++; 7202 } 7203 7204 static void 7205 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7206 { 7207 const struct rndis_packet_msg *pkt; 7208 struct hn_rxinfo info; 7209 int data_off, pktinfo_off, data_len, pktinfo_len; 7210 bool rsc_more = false; 7211 7212 /* 7213 * Check length. 7214 */ 7215 if (__predict_false(dlen < sizeof(*pkt))) { 7216 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7217 return; 7218 } 7219 pkt = data; 7220 7221 if (__predict_false(dlen < pkt->rm_len)) { 7222 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7223 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7224 return; 7225 } 7226 if (__predict_false(pkt->rm_len < 7227 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7228 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7229 "msglen %u, data %u, oob %u, pktinfo %u\n", 7230 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7231 pkt->rm_pktinfolen); 7232 return; 7233 } 7234 if (__predict_false(pkt->rm_datalen == 0)) { 7235 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7236 return; 7237 } 7238 7239 /* 7240 * Check offsets. 7241 */ 7242 #define IS_OFFSET_INVALID(ofs) \ 7243 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7244 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7245 7246 /* XXX Hyper-V does not meet data offset alignment requirement */ 7247 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7248 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7249 "data offset %u\n", pkt->rm_dataoffset); 7250 return; 7251 } 7252 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7253 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7254 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7255 "oob offset %u\n", pkt->rm_oobdataoffset); 7256 return; 7257 } 7258 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7259 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7260 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7261 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7262 return; 7263 } 7264 7265 #undef IS_OFFSET_INVALID 7266 7267 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7268 data_len = pkt->rm_datalen; 7269 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7270 pktinfo_len = pkt->rm_pktinfolen; 7271 7272 /* 7273 * Check OOB coverage.
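 * OOB data is not expected on the RX data path; it is only validated here so that a malformed message cannot claim an OOB range that overlaps the packet data or pktinfo.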
7274 */ 7275 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7276 int oob_off, oob_len; 7277 7278 if_printf(rxr->hn_ifp, "got oobdata\n"); 7279 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7280 oob_len = pkt->rm_oobdatalen; 7281 7282 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7283 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7284 "oob overflow, msglen %u, oob abs %d len %d\n", 7285 pkt->rm_len, oob_off, oob_len); 7286 return; 7287 } 7288 7289 /* 7290 * Check against data. 7291 */ 7292 if (hn_rndis_check_overlap(oob_off, oob_len, 7293 data_off, data_len)) { 7294 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7295 "oob overlaps data, oob abs %d len %d, " 7296 "data abs %d len %d\n", 7297 oob_off, oob_len, data_off, data_len); 7298 return; 7299 } 7300 7301 /* 7302 * Check against pktinfo. 7303 */ 7304 if (pktinfo_len != 0 && 7305 hn_rndis_check_overlap(oob_off, oob_len, 7306 pktinfo_off, pktinfo_len)) { 7307 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7308 "oob overlaps pktinfo, oob abs %d len %d, " 7309 "pktinfo abs %d len %d\n", 7310 oob_off, oob_len, pktinfo_off, pktinfo_len); 7311 return; 7312 } 7313 } 7314 7315 /* 7316 * Check per-packet-info coverage and find useful per-packet-info. 7317 */ 7318 info.vlan_info = NULL; 7319 info.csum_info = NULL; 7320 info.hash_info = NULL; 7321 info.pktinfo_id = NULL; 7322 7323 if (__predict_true(pktinfo_len != 0)) { 7324 bool overlap; 7325 int error; 7326 7327 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7328 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7329 "pktinfo overflow, msglen %u, " 7330 "pktinfo abs %d len %d\n", 7331 pkt->rm_len, pktinfo_off, pktinfo_len); 7332 return; 7333 } 7334 7335 /* 7336 * Check packet info coverage. 7337 */ 7338 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7339 data_off, data_len); 7340 if (__predict_false(overlap)) { 7341 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7342 "pktinfo overlap data, pktinfo abs %d len %d, " 7343 "data abs %d len %d\n", 7344 pktinfo_off, pktinfo_len, data_off, data_len); 7345 return; 7346 } 7347 7348 /* 7349 * Find useful per-packet-info. 
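 * hn_rndis_rxinfo() walks the pktinfo records and fills 'info' with pointers to the VLAN, RX checksum, hash value/info and RSC packet-info id data.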
7350 */ 7351 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7352 pktinfo_len, &info); 7353 if (__predict_false(error)) { 7354 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7355 "pktinfo\n"); 7356 return; 7357 } 7358 } 7359 7360 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7361 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7362 "data overflow, msglen %u, data abs %d len %d\n", 7363 pkt->rm_len, data_off, data_len); 7364 return; 7365 } 7366 7367 /* Identify RSC fragments, drop invalid packets */ 7368 if ((info.pktinfo_id != NULL) && 7369 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7370 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7371 rxr->rsc.cnt = 0; 7372 rxr->hn_rsc_pkts++; 7373 } else if (rxr->rsc.cnt == 0) 7374 goto drop; 7375 7376 rsc_more = true; 7377 7378 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7379 rsc_more = false; 7380 7381 if (rsc_more && rxr->rsc.is_last) 7382 goto drop; 7383 } else { 7384 rxr->rsc.cnt = 0; 7385 } 7386 7387 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7388 goto drop; 7389 7390 /* Store data in per rx ring structure */ 7391 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7392 data_len, &info); 7393 7394 if (rsc_more) 7395 return; 7396 7397 hn_rxpkt(rxr); 7398 rxr->rsc.cnt = 0; 7399 return; 7400 drop: 7401 rxr->hn_rsc_drop++; 7402 return; 7403 } 7404 7405 static __inline void 7406 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7407 { 7408 const struct rndis_msghdr *hdr; 7409 7410 if (__predict_false(dlen < sizeof(*hdr))) { 7411 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7412 return; 7413 } 7414 hdr = data; 7415 7416 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7417 /* Hot data path. */ 7418 hn_rndis_rx_data(rxr, data, dlen); 7419 /* Done! */ 7420 return; 7421 } 7422 7423 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7424 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7425 else 7426 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7427 } 7428 7429 static void 7430 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7431 { 7432 const struct hn_nvs_hdr *hdr; 7433 7434 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7435 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7436 return; 7437 } 7438 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7439 7440 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7441 /* Useless; ignore */ 7442 return; 7443 } 7444 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7445 } 7446 7447 static void 7448 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7449 const struct vmbus_chanpkt_hdr *pkt) 7450 { 7451 struct hn_nvs_sendctx *sndc; 7452 7453 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7454 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7455 VMBUS_CHANPKT_DATALEN(pkt)); 7456 /* 7457 * NOTE: 7458 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7459 * its callback. 
7460 */ 7461 } 7462 7463 static void 7464 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7465 const struct vmbus_chanpkt_hdr *pkthdr) 7466 { 7467 struct epoch_tracker et; 7468 const struct vmbus_chanpkt_rxbuf *pkt; 7469 const struct hn_nvs_hdr *nvs_hdr; 7470 int count, i, hlen; 7471 7472 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7473 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7474 return; 7475 } 7476 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7477 7478 /* Make sure that this is a RNDIS message. */ 7479 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7480 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7481 nvs_hdr->nvs_type); 7482 return; 7483 } 7484 7485 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7486 if (__predict_false(hlen < sizeof(*pkt))) { 7487 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7488 return; 7489 } 7490 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7491 7492 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7493 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7494 pkt->cp_rxbuf_id); 7495 return; 7496 } 7497 7498 count = pkt->cp_rxbuf_cnt; 7499 if (__predict_false(hlen < 7500 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7501 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7502 return; 7503 } 7504 7505 NET_EPOCH_ENTER(et); 7506 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7507 for (i = 0; i < count; ++i) { 7508 int ofs, len; 7509 7510 ofs = pkt->cp_rxbuf[i].rb_ofs; 7511 len = pkt->cp_rxbuf[i].rb_len; 7512 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7513 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7514 "ofs %d, len %d\n", i, ofs, len); 7515 continue; 7516 } 7517 7518 rxr->rsc.is_last = (i == (count - 1)); 7519 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7520 } 7521 NET_EPOCH_EXIT(et); 7522 7523 /* 7524 * Ack the consumed RXBUF associated w/ this channel packet, 7525 * so that this RXBUF can be recycled by the hypervisor. 7526 */ 7527 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7528 } 7529 7530 static void 7531 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7532 uint64_t tid) 7533 { 7534 struct hn_nvs_rndis_ack ack; 7535 int retries, error; 7536 7537 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7538 ack.nvs_status = HN_NVS_STATUS_OK; 7539 7540 retries = 0; 7541 again: 7542 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7543 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7544 if (__predict_false(error == EAGAIN)) { 7545 /* 7546 * NOTE: 7547 * This should _not_ happen in real world, since the 7548 * consumption of the TX bufring from the TX path is 7549 * controlled. 7550 */ 7551 if (rxr->hn_ack_failed == 0) 7552 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7553 rxr->hn_ack_failed++; 7554 retries++; 7555 if (retries < 10) { 7556 DELAY(100); 7557 goto again; 7558 } 7559 /* RXBUF leaks! */ 7560 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7561 } 7562 } 7563 7564 static void 7565 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7566 { 7567 struct hn_rx_ring *rxr = xrxr; 7568 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7569 7570 for (;;) { 7571 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7572 int error, pktlen; 7573 7574 pktlen = rxr->hn_pktbuf_len; 7575 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7576 if (__predict_false(error == ENOBUFS)) { 7577 void *nbuf; 7578 int nlen; 7579 7580 /* 7581 * Expand channel packet buffer. 
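 * The buffer is doubled until it can hold the pending packet, whose required size was returned in 'pktlen' when vmbus_chan_recv_pkt() failed with ENOBUFS.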
7582 * 7583 * XXX 7584 * Use M_WAITOK here, since allocation failure 7585 * is fatal. 7586 */ 7587 nlen = rxr->hn_pktbuf_len * 2; 7588 while (nlen < pktlen) 7589 nlen *= 2; 7590 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7591 7592 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7593 rxr->hn_pktbuf_len, nlen); 7594 7595 free(rxr->hn_pktbuf, M_DEVBUF); 7596 rxr->hn_pktbuf = nbuf; 7597 rxr->hn_pktbuf_len = nlen; 7598 /* Retry! */ 7599 continue; 7600 } else if (__predict_false(error == EAGAIN)) { 7601 /* No more channel packets; done! */ 7602 break; 7603 } 7604 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7605 7606 switch (pkt->cph_type) { 7607 case VMBUS_CHANPKT_TYPE_COMP: 7608 hn_nvs_handle_comp(sc, chan, pkt); 7609 break; 7610 7611 case VMBUS_CHANPKT_TYPE_RXBUF: 7612 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7613 break; 7614 7615 case VMBUS_CHANPKT_TYPE_INBAND: 7616 hn_nvs_handle_notify(sc, pkt); 7617 break; 7618 7619 default: 7620 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7621 pkt->cph_type); 7622 break; 7623 } 7624 } 7625 hn_chan_rollup(rxr, rxr->hn_txr); 7626 } 7627 7628 static void 7629 hn_sysinit(void *arg __unused) 7630 { 7631 int i; 7632 7633 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7634 7635 #ifdef HN_IFSTART_SUPPORT 7636 /* 7637 * Don't use ifnet.if_start if transparent VF mode is requested; 7638 * mainly due to the IFF_DRV_OACTIVE flag. 7639 */ 7640 if (hn_xpnt_vf && hn_use_if_start) { 7641 hn_use_if_start = 0; 7642 printf("hn: transparent VF mode, if_transmit will be used, " 7643 "instead of if_start\n"); 7644 } 7645 #endif 7646 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7647 printf("hn: invalid transparent VF attach routing " 7648 "wait timeout %d, reset to %d\n", 7649 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7650 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7651 } 7652 7653 /* 7654 * Initialize VF map. 7655 */ 7656 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7657 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7658 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7659 M_WAITOK | M_ZERO); 7660 7661 /* 7662 * Fix the # of TX taskqueues. 7663 */ 7664 if (hn_tx_taskq_cnt <= 0) 7665 hn_tx_taskq_cnt = 1; 7666 else if (hn_tx_taskq_cnt > mp_ncpus) 7667 hn_tx_taskq_cnt = mp_ncpus; 7668 7669 /* 7670 * Fix the TX taskqueue mode.
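 * Only the three modes handled below are valid; any other value falls back to HN_TX_TASKQ_M_INDEP.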
7671 */ 7672 switch (hn_tx_taskq_mode) { 7673 case HN_TX_TASKQ_M_INDEP: 7674 case HN_TX_TASKQ_M_GLOBAL: 7675 case HN_TX_TASKQ_M_EVTTQ: 7676 break; 7677 default: 7678 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7679 break; 7680 } 7681 7682 if (vm_guest != VM_GUEST_HV) 7683 return; 7684 7685 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7686 return; 7687 7688 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7689 M_DEVBUF, M_WAITOK); 7690 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7691 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7692 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7693 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7694 "hn tx%d", i); 7695 } 7696 } 7697 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7698 7699 static void 7700 hn_sysuninit(void *arg __unused) 7701 { 7702 7703 if (hn_tx_taskque != NULL) { 7704 int i; 7705 7706 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7707 taskqueue_free(hn_tx_taskque[i]); 7708 free(hn_tx_taskque, M_DEVBUF); 7709 } 7710 7711 if (hn_vfmap != NULL) 7712 free(hn_vfmap, M_DEVBUF); 7713 rm_destroy(&hn_vfmap_lock); 7714 7715 counter_u64_free(hn_udpcs_fixup); 7716 } 7717 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7718