1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. 
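 * (For example, with the default 1500 byte MTU, HN_LRO_LENLIM_MIN() below
 * works out to 3000 bytes; hn_mtu_change_fixup() later uses it as the floor
 * for the per-ring LRO length limit after an MTU change.)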
*/ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 struct ifnet *vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(struct ifnet *); 276 #endif 277 static int hn_transmit(struct ifnet *, struct mbuf *); 278 static void hn_xmit_qflush(struct ifnet *); 279 static int hn_ifmedia_upd(struct 
ifnet *); 280 static void hn_ifmedia_sts(struct ifnet *, 281 struct ifmediareq *); 282 283 static void hn_ifnet_event(void *, struct ifnet *, int); 284 static void hn_ifaddr_event(void *, struct ifnet *); 285 static void hn_ifnet_attevent(void *, struct ifnet *); 286 static void hn_ifnet_detevent(void *, struct ifnet *); 287 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const struct ifnet *); 291 static void hn_rxvf_change(struct hn_softc *, 292 struct ifnet *, bool); 293 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 329 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 331 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 340 #ifndef RSS 341 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 343 #endif 344 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 346 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 355 
static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 358 359 static void hn_stop(struct hn_softc *, bool); 360 static void hn_init_locked(struct hn_softc *); 361 static int hn_chan_attach(struct hn_softc *, 362 struct vmbus_channel *); 363 static void hn_chan_detach(struct hn_softc *, 364 struct vmbus_channel *); 365 static int hn_attach_subchans(struct hn_softc *); 366 static void hn_detach_allchans(struct hn_softc *); 367 static void hn_chan_rollup(struct hn_rx_ring *, 368 struct hn_tx_ring *); 369 static void hn_set_ring_inuse(struct hn_softc *, int); 370 static int hn_synth_attach(struct hn_softc *, int); 371 static void hn_synth_detach(struct hn_softc *); 372 static int hn_synth_alloc_subchans(struct hn_softc *, 373 int *); 374 static bool hn_synth_attachable(const struct hn_softc *); 375 static void hn_suspend(struct hn_softc *); 376 static void hn_suspend_data(struct hn_softc *); 377 static void hn_suspend_mgmt(struct hn_softc *); 378 static void hn_resume(struct hn_softc *); 379 static void hn_resume_data(struct hn_softc *); 380 static void hn_resume_mgmt(struct hn_softc *); 381 static void hn_suspend_mgmt_taskfunc(void *, int); 382 static void hn_chan_drain(struct hn_softc *, 383 struct vmbus_channel *); 384 static void hn_disable_rx(struct hn_softc *); 385 static void hn_drain_rxtx(struct hn_softc *, int); 386 static void hn_polling(struct hn_softc *, u_int); 387 static void hn_chan_polling(struct vmbus_channel *, u_int); 388 static void hn_mtu_change_fixup(struct hn_softc *); 389 390 static void hn_update_link_status(struct hn_softc *); 391 static void hn_change_network(struct hn_softc *); 392 static void hn_link_taskfunc(void *, int); 393 static void hn_netchg_init_taskfunc(void *, int); 394 static void hn_netchg_status_taskfunc(void *, int); 395 static void hn_link_status(struct hn_softc *); 396 397 static int hn_create_rx_data(struct hn_softc *, int); 398 static void hn_destroy_rx_data(struct hn_softc *); 399 static int hn_check_iplen(const struct mbuf *, int); 400 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 401 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 402 static int hn_rxfilter_config(struct hn_softc *); 403 static int hn_rss_reconfig(struct hn_softc *); 404 static void hn_rss_ind_fixup(struct hn_softc *); 405 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 406 static int hn_rxpkt(struct hn_rx_ring *); 407 static uint32_t hn_rss_type_fromndis(uint32_t); 408 static uint32_t hn_rss_type_tondis(uint32_t); 409 410 static int hn_tx_ring_create(struct hn_softc *, int); 411 static void hn_tx_ring_destroy(struct hn_tx_ring *); 412 static int hn_create_tx_data(struct hn_softc *, int); 413 static void hn_fixup_tx_data(struct hn_softc *); 414 static void hn_fixup_rx_data(struct hn_softc *); 415 static void hn_destroy_tx_data(struct hn_softc *); 416 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 417 static void hn_txdesc_gc(struct hn_tx_ring *, 418 struct hn_txdesc *); 419 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 420 struct hn_txdesc *, struct mbuf **); 421 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 422 struct hn_txdesc *); 423 static void hn_set_chim_size(struct hn_softc *, int); 424 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 425 static bool hn_tx_ring_pending(struct hn_tx_ring *); 426 static void hn_tx_ring_qflush(struct hn_tx_ring *); 427 static void 
hn_resume_tx(struct hn_softc *, int); 428 static void hn_set_txagg(struct hn_softc *); 429 static void *hn_try_txagg(struct ifnet *, 430 struct hn_tx_ring *, struct hn_txdesc *, 431 int); 432 static int hn_get_txswq_depth(const struct hn_tx_ring *); 433 static void hn_txpkt_done(struct hn_nvs_sendctx *, 434 struct hn_softc *, struct vmbus_channel *, 435 const void *, int); 436 static int hn_txpkt_sglist(struct hn_tx_ring *, 437 struct hn_txdesc *); 438 static int hn_txpkt_chim(struct hn_tx_ring *, 439 struct hn_txdesc *); 440 static int hn_xmit(struct hn_tx_ring *, int); 441 static void hn_xmit_taskfunc(void *, int); 442 static void hn_xmit_txeof(struct hn_tx_ring *); 443 static void hn_xmit_txeof_taskfunc(void *, int); 444 #ifdef HN_IFSTART_SUPPORT 445 static int hn_start_locked(struct hn_tx_ring *, int); 446 static void hn_start_taskfunc(void *, int); 447 static void hn_start_txeof(struct hn_tx_ring *); 448 static void hn_start_txeof_taskfunc(void *, int); 449 #endif 450 451 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 452 453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 454 "Hyper-V network interface"); 455 456 /* Trust tcp segment verification on host side. */ 457 static int hn_trust_hosttcp = 1; 458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 459 &hn_trust_hosttcp, 0, 460 "Trust tcp segment verification on host side, " 461 "when csum info is missing (global setting)"); 462 463 /* Trust udp datagrams verification on host side. */ 464 static int hn_trust_hostudp = 1; 465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 466 &hn_trust_hostudp, 0, 467 "Trust udp datagram verification on host side, " 468 "when csum info is missing (global setting)"); 469 470 /* Trust ip packets verification on host side. */ 471 static int hn_trust_hostip = 1; 472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 473 &hn_trust_hostip, 0, 474 "Trust ip packet verification on host side, " 475 "when csum info is missing (global setting)"); 476 477 /* 478 * Offload UDP/IPv4 checksum. 479 */ 480 static int hn_enable_udp4cs = 1; 481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 482 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 483 484 /* 485 * Offload UDP/IPv6 checksum. 486 */ 487 static int hn_enable_udp6cs = 1; 488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 489 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 490 491 /* Stats. */ 492 static counter_u64_t hn_udpcs_fixup; 493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 494 &hn_udpcs_fixup, "# of UDP checksum fixup"); 495 496 /* 497 * See hn_set_hlen(). 498 * 499 * This value is for Azure. For Hyper-V, set this above 500 * 65536 to disable UDP datagram checksum fixup. 
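 * (The fixup itself lives in hn_set_hlen(): UDP datagrams longer than this
 * threshold, sent without IP_DF, get their checksum computed in software via
 * in_cksum_skip() and CSUM_IP_UDP cleared, instead of relying on the host
 * offload.)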
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock	hn_vfmap_lock;
static int		hn_vfmap_size;
static struct ifnet	**hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, the m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
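
/*
 * As is conventional for TSO, hn_tso_fixup() above seeds the TCP checksum
 * field with the pseudo-header sum only (addresses + protocol, no length)
 * and zeroes the IP length/checksum; the remaining per-segment fields are
 * recomputed by the host side when it slices the burst.  For the plain
 * IPv4 case the seeded value is just:
 *
 *	th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 *	    htons(IPPROTO_TCP));
 *
 * i.e. the one's-complement sum of the addresses and protocol, with the
 * TCP length deliberately left out.
 */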

/*
 * NOTE: If this function fails, the m_head will be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, the m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
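	 *
	 * (NDIS_PACKET_TYPE_PROMISCUOUS asks the host to deliver every frame
	 * on the port; otherwise the filter below is built as a bitwise OR of
	 * packet types, e.g. DIRECTED | BROADCAST | ALL_MULTICAST for a
	 * typical up interface with multicast group memberships.)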
935 */ 936 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 937 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 938 } else { 939 filter = NDIS_PACKET_TYPE_DIRECTED; 940 if (ifp->if_flags & IFF_BROADCAST) 941 filter |= NDIS_PACKET_TYPE_BROADCAST; 942 /* TODO: support multicast list */ 943 if ((ifp->if_flags & IFF_ALLMULTI) || 944 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 945 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 946 } 947 return (hn_set_rxfilter(sc, filter)); 948 } 949 950 static void 951 hn_set_txagg(struct hn_softc *sc) 952 { 953 uint32_t size, pkts; 954 int i; 955 956 /* 957 * Setup aggregation size. 958 */ 959 if (sc->hn_agg_size < 0) 960 size = UINT32_MAX; 961 else 962 size = sc->hn_agg_size; 963 964 if (sc->hn_rndis_agg_size < size) 965 size = sc->hn_rndis_agg_size; 966 967 /* NOTE: We only aggregate packets using chimney sending buffers. */ 968 if (size > (uint32_t)sc->hn_chim_szmax) 969 size = sc->hn_chim_szmax; 970 971 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 972 /* Disable */ 973 size = 0; 974 pkts = 0; 975 goto done; 976 } 977 978 /* NOTE: Type of the per TX ring setting is 'int'. */ 979 if (size > INT_MAX) 980 size = INT_MAX; 981 982 /* 983 * Setup aggregation packet count. 984 */ 985 if (sc->hn_agg_pkts < 0) 986 pkts = UINT32_MAX; 987 else 988 pkts = sc->hn_agg_pkts; 989 990 if (sc->hn_rndis_agg_pkts < pkts) 991 pkts = sc->hn_rndis_agg_pkts; 992 993 if (pkts <= 1) { 994 /* Disable */ 995 size = 0; 996 pkts = 0; 997 goto done; 998 } 999 1000 /* NOTE: Type of the per TX ring setting is 'short'. */ 1001 if (pkts > SHRT_MAX) 1002 pkts = SHRT_MAX; 1003 1004 done: 1005 /* NOTE: Type of the per TX ring setting is 'short'. */ 1006 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1007 /* Disable */ 1008 size = 0; 1009 pkts = 0; 1010 } 1011 1012 if (bootverbose) { 1013 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1014 size, pkts, sc->hn_rndis_agg_align); 1015 } 1016 1017 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1018 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1019 1020 mtx_lock(&txr->hn_tx_lock); 1021 txr->hn_agg_szmax = size; 1022 txr->hn_agg_pktmax = pkts; 1023 txr->hn_agg_align = sc->hn_rndis_agg_align; 1024 mtx_unlock(&txr->hn_tx_lock); 1025 } 1026 } 1027 1028 static int 1029 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1030 { 1031 1032 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1033 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1034 return txr->hn_txdesc_cnt; 1035 return hn_tx_swq_depth; 1036 } 1037 1038 static int 1039 hn_rss_reconfig(struct hn_softc *sc) 1040 { 1041 int error; 1042 1043 HN_LOCK_ASSERT(sc); 1044 1045 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1046 return (ENXIO); 1047 1048 /* 1049 * Disable RSS first. 1050 * 1051 * NOTE: 1052 * Direct reconfiguration by setting the UNCHG flags does 1053 * _not_ work properly. 1054 */ 1055 if (bootverbose) 1056 if_printf(sc->hn_ifp, "disable RSS\n"); 1057 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1058 if (error) { 1059 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1060 return (error); 1061 } 1062 1063 /* 1064 * Reenable the RSS w/ the updated RSS key or indirect 1065 * table. 
1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1072 return (error); 1073 } 1074 return (0); 1075 } 1076 1077 static void 1078 hn_rss_ind_fixup(struct hn_softc *sc) 1079 { 1080 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1081 int i, nchan; 1082 1083 nchan = sc->hn_rx_ring_inuse; 1084 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1085 1086 /* 1087 * Check indirect table to make sure that all channels in it 1088 * can be used. 1089 */ 1090 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1091 if (rss->rss_ind[i] >= nchan) { 1092 if_printf(sc->hn_ifp, 1093 "RSS indirect table %d fixup: %u -> %d\n", 1094 i, rss->rss_ind[i], nchan - 1); 1095 rss->rss_ind[i] = nchan - 1; 1096 } 1097 } 1098 } 1099 1100 static int 1101 hn_ifmedia_upd(struct ifnet *ifp __unused) 1102 { 1103 1104 return EOPNOTSUPP; 1105 } 1106 1107 static void 1108 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1109 { 1110 struct hn_softc *sc = ifp->if_softc; 1111 1112 ifmr->ifm_status = IFM_AVALID; 1113 ifmr->ifm_active = IFM_ETHER; 1114 1115 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1116 ifmr->ifm_active |= IFM_NONE; 1117 return; 1118 } 1119 ifmr->ifm_status |= IFM_ACTIVE; 1120 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1121 } 1122 1123 static void 1124 hn_rxvf_set_task(void *xarg, int pending __unused) 1125 { 1126 struct hn_rxvf_setarg *arg = xarg; 1127 1128 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1129 } 1130 1131 static void 1132 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1133 { 1134 struct hn_rx_ring *rxr; 1135 struct hn_rxvf_setarg arg; 1136 struct task task; 1137 int i; 1138 1139 HN_LOCK_ASSERT(sc); 1140 1141 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1142 1143 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1144 rxr = &sc->hn_rx_ring[i]; 1145 1146 if (i < sc->hn_rx_ring_inuse) { 1147 arg.rxr = rxr; 1148 arg.vf_ifp = vf_ifp; 1149 vmbus_chan_run_task(rxr->hn_chan, &task); 1150 } else { 1151 rxr->hn_rxvf_ifp = vf_ifp; 1152 } 1153 } 1154 } 1155 1156 static bool 1157 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1158 { 1159 const struct ifnet *hn_ifp; 1160 1161 hn_ifp = sc->hn_ifp; 1162 1163 if (ifp == hn_ifp) 1164 return (false); 1165 1166 if (ifp->if_alloctype != IFT_ETHER) 1167 return (false); 1168 1169 /* Ignore lagg/vlan interfaces */ 1170 if (strcmp(ifp->if_dname, "lagg") == 0 || 1171 strcmp(ifp->if_dname, "vlan") == 0) 1172 return (false); 1173 1174 /* 1175 * During detach events ifp->if_addr might be NULL. 
1176 * Make sure the bcmp() below doesn't panic on that: 1177 */ 1178 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1179 return (false); 1180 1181 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1182 return (false); 1183 1184 return (true); 1185 } 1186 1187 static void 1188 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1189 { 1190 struct ifnet *hn_ifp; 1191 1192 HN_LOCK(sc); 1193 1194 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1195 goto out; 1196 1197 if (!hn_ismyvf(sc, ifp)) 1198 goto out; 1199 hn_ifp = sc->hn_ifp; 1200 1201 if (rxvf) { 1202 if (sc->hn_flags & HN_FLAG_RXVF) 1203 goto out; 1204 1205 sc->hn_flags |= HN_FLAG_RXVF; 1206 hn_rxfilter_config(sc); 1207 } else { 1208 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1209 goto out; 1210 1211 sc->hn_flags &= ~HN_FLAG_RXVF; 1212 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1213 hn_rxfilter_config(sc); 1214 else 1215 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1216 } 1217 1218 hn_nvs_set_datapath(sc, 1219 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1220 1221 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1222 1223 if (rxvf) { 1224 hn_vf_rss_fixup(sc, true); 1225 hn_suspend_mgmt(sc); 1226 sc->hn_link_flags &= 1227 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1228 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1229 } else { 1230 hn_vf_rss_restore(sc); 1231 hn_resume_mgmt(sc); 1232 } 1233 1234 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1235 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1236 1237 if (bootverbose) { 1238 if_printf(hn_ifp, "datapath is switched %s %s\n", 1239 rxvf ? "to" : "from", ifp->if_xname); 1240 } 1241 out: 1242 HN_UNLOCK(sc); 1243 } 1244 1245 static void 1246 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1247 { 1248 1249 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1250 return; 1251 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1252 } 1253 1254 static void 1255 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1256 { 1257 1258 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1259 } 1260 1261 static int 1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1263 { 1264 struct ifnet *ifp, *vf_ifp; 1265 uint64_t tmp; 1266 int error; 1267 1268 HN_LOCK_ASSERT(sc); 1269 ifp = sc->hn_ifp; 1270 vf_ifp = sc->hn_vf_ifp; 1271 1272 /* 1273 * Fix up requested capabilities w/ supported capabilities, 1274 * since the supported capabilities could have been changed. 1275 */ 1276 ifr->ifr_reqcap &= ifp->if_capabilities; 1277 /* Pass SIOCSIFCAP to VF. */ 1278 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1279 1280 /* 1281 * NOTE: 1282 * The error will be propagated to the callers, however, it 1283 * is _not_ useful here. 1284 */ 1285 1286 /* 1287 * Merge VF's enabled capabilities. 
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
}

static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}

static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1603 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1604 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1605 } 1606 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1607 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1608 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1609 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1610 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1611 } 1612 if ((my_types & RSS_TYPE_UDP_IPV6) && 1613 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1614 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1615 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1616 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1617 } 1618 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1619 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1620 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1621 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1622 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1623 } 1624 1625 /* 1626 * Indirect table does not matter. 1627 */ 1628 1629 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1630 hn_rss_type_tondis(my_types); 1631 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1632 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1633 1634 if (reconf) { 1635 error = hn_rss_reconfig(sc); 1636 if (error) { 1637 /* XXX roll-back? */ 1638 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1639 /* XXX keep going. */ 1640 } 1641 } 1642 done: 1643 /* Hash deliverability for mbufs. */ 1644 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1645 } 1646 1647 static void 1648 hn_vf_rss_restore(struct hn_softc *sc) 1649 { 1650 1651 HN_LOCK_ASSERT(sc); 1652 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1653 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1654 1655 if (sc->hn_rx_ring_inuse == 1) 1656 goto done; 1657 1658 /* 1659 * Restore hash types. Key does _not_ matter. 1660 */ 1661 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1662 int error; 1663 1664 sc->hn_rss_hash = sc->hn_rss_hcap; 1665 error = hn_rss_reconfig(sc); 1666 if (error) { 1667 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1668 error); 1669 /* XXX keep going. */ 1670 } 1671 } 1672 done: 1673 /* Hash deliverability for mbufs. */ 1674 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1675 } 1676 1677 static void 1678 hn_xpnt_vf_setready(struct hn_softc *sc) 1679 { 1680 struct ifnet *ifp, *vf_ifp; 1681 struct ifreq ifr; 1682 1683 HN_LOCK_ASSERT(sc); 1684 ifp = sc->hn_ifp; 1685 vf_ifp = sc->hn_vf_ifp; 1686 1687 /* 1688 * Mark the VF ready. 1689 */ 1690 sc->hn_vf_rdytick = 0; 1691 1692 /* 1693 * Save information for restoration. 1694 */ 1695 sc->hn_saved_caps = ifp->if_capabilities; 1696 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1697 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1698 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1699 1700 /* 1701 * Intersect supported/enabled capabilities. 1702 * 1703 * NOTE: 1704 * if_hwassist is not changed here. 1705 */ 1706 ifp->if_capabilities &= vf_ifp->if_capabilities; 1707 ifp->if_capenable &= ifp->if_capabilities; 1708 1709 /* 1710 * Fix TSO settings. 1711 */ 1712 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1713 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1714 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1715 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1716 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1717 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1718 1719 /* 1720 * Change VF's enabled capabilities. 
1721 */ 1722 memset(&ifr, 0, sizeof(ifr)); 1723 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1724 ifr.ifr_reqcap = ifp->if_capenable; 1725 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1726 1727 if (ifp->if_mtu != ETHERMTU) { 1728 int error; 1729 1730 /* 1731 * Change VF's MTU. 1732 */ 1733 memset(&ifr, 0, sizeof(ifr)); 1734 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1735 ifr.ifr_mtu = ifp->if_mtu; 1736 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1737 if (error) { 1738 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1739 vf_ifp->if_xname, ifp->if_mtu); 1740 if (ifp->if_mtu > ETHERMTU) { 1741 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1742 1743 /* 1744 * XXX 1745 * No need to adjust the synthetic parts' MTU; 1746 * failure of the adjustment will cause us 1747 * infinite headache. 1748 */ 1749 ifp->if_mtu = ETHERMTU; 1750 hn_mtu_change_fixup(sc); 1751 } 1752 } 1753 } 1754 } 1755 1756 static bool 1757 hn_xpnt_vf_isready(struct hn_softc *sc) 1758 { 1759 1760 HN_LOCK_ASSERT(sc); 1761 1762 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1763 return (false); 1764 1765 if (sc->hn_vf_rdytick == 0) 1766 return (true); 1767 1768 if (sc->hn_vf_rdytick > ticks) 1769 return (false); 1770 1771 /* Mark VF as ready. */ 1772 hn_xpnt_vf_setready(sc); 1773 return (true); 1774 } 1775 1776 static void 1777 hn_xpnt_vf_setenable(struct hn_softc *sc) 1778 { 1779 int i; 1780 1781 HN_LOCK_ASSERT(sc); 1782 1783 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1784 rm_wlock(&sc->hn_vf_lock); 1785 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1786 rm_wunlock(&sc->hn_vf_lock); 1787 1788 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1789 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1790 } 1791 1792 static void 1793 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1794 { 1795 int i; 1796 1797 HN_LOCK_ASSERT(sc); 1798 1799 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1800 rm_wlock(&sc->hn_vf_lock); 1801 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1802 if (clear_vf) 1803 sc->hn_vf_ifp = NULL; 1804 rm_wunlock(&sc->hn_vf_lock); 1805 1806 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1807 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1808 } 1809 1810 static void 1811 hn_xpnt_vf_init(struct hn_softc *sc) 1812 { 1813 int error; 1814 1815 HN_LOCK_ASSERT(sc); 1816 1817 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1818 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1819 1820 if (bootverbose) { 1821 if_printf(sc->hn_ifp, "try bringing up %s\n", 1822 sc->hn_vf_ifp->if_xname); 1823 } 1824 1825 /* 1826 * Bring the VF up. 1827 */ 1828 hn_xpnt_vf_saveifflags(sc); 1829 sc->hn_vf_ifp->if_flags |= IFF_UP; 1830 error = hn_xpnt_vf_iocsetflags(sc); 1831 if (error) { 1832 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1833 sc->hn_vf_ifp->if_xname, error); 1834 return; 1835 } 1836 1837 /* 1838 * NOTE: 1839 * Datapath setting must happen _after_ bringing the VF up. 1840 */ 1841 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1842 1843 /* 1844 * NOTE: 1845 * Fixup RSS related bits _after_ the VF is brought up, since 1846 * many VFs generate RSS key during it's initialization. 1847 */ 1848 hn_vf_rss_fixup(sc, true); 1849 1850 /* Mark transparent mode VF as enabled. 
*/ 1851 hn_xpnt_vf_setenable(sc); 1852 } 1853 1854 static void 1855 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1856 { 1857 struct hn_softc *sc = xsc; 1858 1859 HN_LOCK(sc); 1860 1861 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1862 goto done; 1863 if (sc->hn_vf_ifp == NULL) 1864 goto done; 1865 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1866 goto done; 1867 1868 if (sc->hn_vf_rdytick != 0) { 1869 /* Mark VF as ready. */ 1870 hn_xpnt_vf_setready(sc); 1871 } 1872 1873 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1874 /* 1875 * Delayed VF initialization. 1876 */ 1877 if (bootverbose) { 1878 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1879 sc->hn_vf_ifp->if_xname); 1880 } 1881 hn_xpnt_vf_init(sc); 1882 } 1883 done: 1884 HN_UNLOCK(sc); 1885 } 1886 1887 static void 1888 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1889 { 1890 struct hn_softc *sc = xsc; 1891 1892 HN_LOCK(sc); 1893 1894 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1895 goto done; 1896 1897 if (!hn_ismyvf(sc, ifp)) 1898 goto done; 1899 1900 if (sc->hn_vf_ifp != NULL) { 1901 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1902 sc->hn_vf_ifp->if_xname); 1903 goto done; 1904 } 1905 1906 if (hn_xpnt_vf && ifp->if_start != NULL) { 1907 /* 1908 * ifnet.if_start is _not_ supported by transparent 1909 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1910 */ 1911 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1912 "in transparent VF mode.\n", ifp->if_xname); 1913 goto done; 1914 } 1915 1916 rm_wlock(&hn_vfmap_lock); 1917 1918 if (ifp->if_index >= hn_vfmap_size) { 1919 struct ifnet **newmap; 1920 int newsize; 1921 1922 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1923 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1924 M_WAITOK | M_ZERO); 1925 1926 memcpy(newmap, hn_vfmap, 1927 sizeof(struct ifnet *) * hn_vfmap_size); 1928 free(hn_vfmap, M_DEVBUF); 1929 hn_vfmap = newmap; 1930 hn_vfmap_size = newsize; 1931 } 1932 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1933 ("%s: ifindex %d was mapped to %s", 1934 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1935 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1936 1937 rm_wunlock(&hn_vfmap_lock); 1938 1939 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1940 rm_wlock(&sc->hn_vf_lock); 1941 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1942 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1943 sc->hn_vf_ifp = ifp; 1944 rm_wunlock(&sc->hn_vf_lock); 1945 1946 if (hn_xpnt_vf) { 1947 int wait_ticks; 1948 1949 /* 1950 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1951 * Save vf_ifp's current if_input for later restoration. 1952 */ 1953 sc->hn_vf_input = ifp->if_input; 1954 ifp->if_input = hn_xpnt_vf_input; 1955 1956 /* 1957 * Stop link status management; use the VF's. 1958 */ 1959 hn_suspend_mgmt(sc); 1960 1961 /* 1962 * Give VF sometime to complete its attach routing. 1963 */ 1964 wait_ticks = hn_xpnt_vf_attwait * hz; 1965 sc->hn_vf_rdytick = ticks + wait_ticks; 1966 1967 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1968 wait_ticks); 1969 } 1970 done: 1971 HN_UNLOCK(sc); 1972 } 1973 1974 static void 1975 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1976 { 1977 struct hn_softc *sc = xsc; 1978 1979 HN_LOCK(sc); 1980 1981 if (sc->hn_vf_ifp == NULL) 1982 goto done; 1983 1984 if (!hn_ismyvf(sc, ifp)) 1985 goto done; 1986 1987 if (hn_xpnt_vf) { 1988 /* 1989 * Make sure that the delayed initialization is not running. 
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
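	 * Depending on hn_tx_taskq_mode, independent per-device TX
	 * taskqueues may be created here, or the shared global TX
	 * taskqueues (hn_tx_taskque) may be reused.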
2111 */ 2112 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2113 int i; 2114 2115 sc->hn_tx_taskqs = 2116 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2117 M_DEVBUF, M_WAITOK); 2118 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2119 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2120 M_WAITOK, taskqueue_thread_enqueue, 2121 &sc->hn_tx_taskqs[i]); 2122 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2123 "%s tx%d", device_get_nameunit(dev), i); 2124 } 2125 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2126 sc->hn_tx_taskqs = hn_tx_taskque; 2127 } 2128 2129 /* 2130 * Setup taskqueue for mangement tasks, e.g. link status. 2131 */ 2132 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2133 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2134 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2135 device_get_nameunit(dev)); 2136 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2137 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2138 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2139 hn_netchg_status_taskfunc, sc); 2140 2141 if (hn_xpnt_vf) { 2142 /* 2143 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2144 */ 2145 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2146 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2147 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2148 device_get_nameunit(dev)); 2149 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2150 hn_xpnt_vf_init_taskfunc, sc); 2151 } 2152 2153 /* 2154 * Allocate ifnet and setup its name earlier, so that if_printf 2155 * can be used by functions, which will be called after 2156 * ether_ifattach(). 2157 */ 2158 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2159 ifp->if_softc = sc; 2160 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2161 2162 /* 2163 * Initialize ifmedia earlier so that it can be unconditionally 2164 * destroyed, if error happened later on. 2165 */ 2166 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2167 2168 /* 2169 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2170 * to use (tx_ring_cnt). 2171 * 2172 * NOTE: 2173 * The # of RX rings to use is same as the # of channels to use. 2174 */ 2175 ring_cnt = hn_chan_cnt; 2176 if (ring_cnt <= 0) { 2177 /* Default */ 2178 ring_cnt = mp_ncpus; 2179 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2180 ring_cnt = HN_RING_CNT_DEF_MAX; 2181 } else if (ring_cnt > mp_ncpus) { 2182 ring_cnt = mp_ncpus; 2183 } 2184 #ifdef RSS 2185 if (ring_cnt > rss_getnumbuckets()) 2186 ring_cnt = rss_getnumbuckets(); 2187 #endif 2188 2189 tx_ring_cnt = hn_tx_ring_cnt; 2190 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2191 tx_ring_cnt = ring_cnt; 2192 #ifdef HN_IFSTART_SUPPORT 2193 if (hn_use_if_start) { 2194 /* ifnet.if_start only needs one TX ring. */ 2195 tx_ring_cnt = 1; 2196 } 2197 #endif 2198 2199 /* 2200 * Set the leader CPU for channels. 2201 */ 2202 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2203 2204 /* 2205 * Create enough TX/RX rings, even if only limited number of 2206 * channels can be allocated. 2207 */ 2208 error = hn_create_tx_data(sc, tx_ring_cnt); 2209 if (error) 2210 goto failed; 2211 error = hn_create_rx_data(sc, ring_cnt); 2212 if (error) 2213 goto failed; 2214 2215 /* 2216 * Create transaction context for NVS and RNDIS transactions. 
2217 */ 2218 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2219 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2220 if (sc->hn_xact == NULL) { 2221 error = ENXIO; 2222 goto failed; 2223 } 2224 2225 /* 2226 * Install orphan handler for the revocation of this device's 2227 * primary channel. 2228 * 2229 * NOTE: 2230 * The processing order is critical here: 2231 * Install the orphan handler, _before_ testing whether this 2232 * device's primary channel has been revoked or not. 2233 */ 2234 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2235 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2236 error = ENXIO; 2237 goto failed; 2238 } 2239 2240 /* 2241 * Attach the synthetic parts, i.e. NVS and RNDIS. 2242 */ 2243 error = hn_synth_attach(sc, ETHERMTU); 2244 if (error) 2245 goto failed; 2246 2247 error = hn_rndis_get_eaddr(sc, eaddr); 2248 if (error) 2249 goto failed; 2250 2251 error = hn_rndis_get_mtu(sc, &mtu); 2252 if (error) 2253 mtu = ETHERMTU; 2254 else if (bootverbose) 2255 device_printf(dev, "RNDIS mtu %u\n", mtu); 2256 2257 if (sc->hn_rx_ring_inuse > 1) { 2258 /* 2259 * Reduce TCP segment aggregation limit for multiple 2260 * RX rings to increase ACK timeliness. 2261 */ 2262 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2263 } 2264 2265 /* 2266 * Fixup TX/RX stuffs after synthetic parts are attached. 2267 */ 2268 hn_fixup_tx_data(sc); 2269 hn_fixup_rx_data(sc); 2270 2271 ctx = device_get_sysctl_ctx(dev); 2272 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2273 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2274 &sc->hn_nvs_ver, 0, "NVS version"); 2275 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2276 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2277 hn_ndis_version_sysctl, "A", "NDIS version"); 2278 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2279 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2280 hn_caps_sysctl, "A", "capabilities"); 2281 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2282 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2283 hn_hwassist_sysctl, "A", "hwassist"); 2284 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2285 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2286 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2287 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2288 "max # of TSO segments"); 2289 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2290 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2291 "max size of TSO segment"); 2292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2293 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2294 hn_rxfilter_sysctl, "A", "rxfilter"); 2295 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2296 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2297 hn_rss_hash_sysctl, "A", "RSS hash"); 2298 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2299 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2300 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2301 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2302 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2303 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2304 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2305 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2306 #ifndef RSS 2307 /* 2308 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2309 */ 2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2311 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2312 hn_rss_key_sysctl, "IU", "RSS key"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2314 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2315 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2316 #endif 2317 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2318 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2319 "RNDIS offered packet transmission aggregation size limit"); 2320 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2321 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2322 "RNDIS offered packet transmission aggregation count limit"); 2323 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2324 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2325 "RNDIS packet transmission aggregation alignment"); 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2327 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_txagg_size_sysctl, "I", 2329 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2330 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2331 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2332 hn_txagg_pkts_sysctl, "I", 2333 "Packet transmission aggregation packets, " 2334 "0 -- disable, -1 -- auto"); 2335 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2336 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2337 hn_polling_sysctl, "I", 2338 "Polling frequency: [100,1000000], 0 disable polling"); 2339 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2340 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2341 hn_vf_sysctl, "A", "Virtual Function's name"); 2342 if (!hn_xpnt_vf) { 2343 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2344 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2345 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2346 } else { 2347 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2348 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2349 hn_xpnt_vf_enabled_sysctl, "I", 2350 "Transparent VF enabled"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2352 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_xpnt_vf_accbpf_sysctl, "I", 2354 "Accurate BPF for transparent VF"); 2355 } 2356 2357 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2358 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2359 "switch to rsc"); 2360 2361 /* 2362 * Setup the ifmedia, which has been initialized earlier. 2363 */ 2364 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2365 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2366 /* XXX ifmedia_set really should do this for us */ 2367 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2368 2369 /* 2370 * Setup the ifnet for this interface. 2371 */ 2372 2373 ifp->if_baudrate = IF_Gbps(10); 2374 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2375 ifp->if_ioctl = hn_ioctl; 2376 ifp->if_init = hn_init; 2377 #ifdef HN_IFSTART_SUPPORT 2378 if (hn_use_if_start) { 2379 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2380 2381 ifp->if_start = hn_start; 2382 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2383 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2384 IFQ_SET_READY(&ifp->if_snd); 2385 } else 2386 #endif 2387 { 2388 ifp->if_transmit = hn_transmit; 2389 ifp->if_qflush = hn_xmit_qflush; 2390 } 2391 2392 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2393 #ifdef foo 2394 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2395 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2396 #endif 2397 if (sc->hn_caps & HN_CAP_VLAN) { 2398 /* XXX not sure about VLAN_MTU. */ 2399 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2400 } 2401 2402 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2403 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2404 ifp->if_capabilities |= IFCAP_TXCSUM; 2405 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2406 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2407 if (sc->hn_caps & HN_CAP_TSO4) { 2408 ifp->if_capabilities |= IFCAP_TSO4; 2409 ifp->if_hwassist |= CSUM_IP_TSO; 2410 } 2411 if (sc->hn_caps & HN_CAP_TSO6) { 2412 ifp->if_capabilities |= IFCAP_TSO6; 2413 ifp->if_hwassist |= CSUM_IP6_TSO; 2414 } 2415 2416 /* Enable all available capabilities by default. */ 2417 ifp->if_capenable = ifp->if_capabilities; 2418 2419 /* 2420 * Disable IPv6 TSO and TXCSUM by default, they still can 2421 * be enabled through SIOCSIFCAP. 2422 */ 2423 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2424 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2425 2426 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2427 /* 2428 * Lock hn_set_tso_maxsize() to simplify its 2429 * internal logic. 2430 */ 2431 HN_LOCK(sc); 2432 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2433 HN_UNLOCK(sc); 2434 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2435 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2436 } 2437 2438 ether_ifattach(ifp, eaddr); 2439 2440 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2441 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2442 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2443 } 2444 if (mtu < ETHERMTU) { 2445 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2446 ifp->if_mtu = mtu; 2447 } 2448 2449 /* Inform the upper layer about the long frame support. */ 2450 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2451 2452 /* 2453 * Kick off link status check. 2454 */ 2455 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2456 hn_update_link_status(sc); 2457 2458 if (!hn_xpnt_vf) { 2459 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2460 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2461 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2462 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2463 } else { 2464 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2465 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2466 } 2467 2468 /* 2469 * NOTE: 2470 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2471 * since interface's LLADDR is needed; interface LLADDR is not 2472 * available when ifnet_arrival event is triggered. 2473 */ 2474 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2475 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2476 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2477 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2478 2479 return (0); 2480 failed: 2481 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2482 hn_synth_detach(sc); 2483 hn_detach(dev); 2484 return (error); 2485 } 2486 2487 static int 2488 hn_detach(device_t dev) 2489 { 2490 struct hn_softc *sc = device_get_softc(dev); 2491 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2492 2493 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2494 /* 2495 * In case that the vmbus missed the orphan handler 2496 * installation. 
2497 */ 2498 vmbus_xact_ctx_orphan(sc->hn_xact); 2499 } 2500 2501 if (sc->hn_ifaddr_evthand != NULL) 2502 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2503 if (sc->hn_ifnet_evthand != NULL) 2504 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2505 if (sc->hn_ifnet_atthand != NULL) { 2506 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2507 sc->hn_ifnet_atthand); 2508 } 2509 if (sc->hn_ifnet_dethand != NULL) { 2510 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2511 sc->hn_ifnet_dethand); 2512 } 2513 if (sc->hn_ifnet_lnkhand != NULL) 2514 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2515 2516 vf_ifp = sc->hn_vf_ifp; 2517 __compiler_membar(); 2518 if (vf_ifp != NULL) 2519 hn_ifnet_detevent(sc, vf_ifp); 2520 2521 if (device_is_attached(dev)) { 2522 HN_LOCK(sc); 2523 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2524 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2525 hn_stop(sc, true); 2526 /* 2527 * NOTE: 2528 * hn_stop() only suspends data, so managment 2529 * stuffs have to be suspended manually here. 2530 */ 2531 hn_suspend_mgmt(sc); 2532 hn_synth_detach(sc); 2533 } 2534 HN_UNLOCK(sc); 2535 ether_ifdetach(ifp); 2536 } 2537 2538 ifmedia_removeall(&sc->hn_media); 2539 hn_destroy_rx_data(sc); 2540 hn_destroy_tx_data(sc); 2541 2542 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2543 int i; 2544 2545 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2546 taskqueue_free(sc->hn_tx_taskqs[i]); 2547 free(sc->hn_tx_taskqs, M_DEVBUF); 2548 } 2549 taskqueue_free(sc->hn_mgmt_taskq0); 2550 if (sc->hn_vf_taskq != NULL) 2551 taskqueue_free(sc->hn_vf_taskq); 2552 2553 if (sc->hn_xact != NULL) { 2554 /* 2555 * Uninstall the orphan handler _before_ the xact is 2556 * destructed. 2557 */ 2558 vmbus_chan_unset_orphan(sc->hn_prichan); 2559 vmbus_xact_ctx_destroy(sc->hn_xact); 2560 } 2561 2562 if_free(ifp); 2563 2564 HN_LOCK_DESTROY(sc); 2565 rm_destroy(&sc->hn_vf_lock); 2566 return (0); 2567 } 2568 2569 static int 2570 hn_shutdown(device_t dev) 2571 { 2572 2573 return (0); 2574 } 2575 2576 static void 2577 hn_link_status(struct hn_softc *sc) 2578 { 2579 uint32_t link_status; 2580 int error; 2581 2582 error = hn_rndis_get_linkstatus(sc, &link_status); 2583 if (error) { 2584 /* XXX what to do? */ 2585 return; 2586 } 2587 2588 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2589 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2590 else 2591 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2592 if_link_state_change(sc->hn_ifp, 2593 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2594 LINK_STATE_UP : LINK_STATE_DOWN); 2595 } 2596 2597 static void 2598 hn_link_taskfunc(void *xsc, int pending __unused) 2599 { 2600 struct hn_softc *sc = xsc; 2601 2602 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2603 return; 2604 hn_link_status(sc); 2605 } 2606 2607 static void 2608 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2609 { 2610 struct hn_softc *sc = xsc; 2611 2612 /* Prevent any link status checks from running. */ 2613 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2614 2615 /* 2616 * Fake up a [link down --> link up] state change; 5 seconds 2617 * delay is used, which closely simulates miibus reaction 2618 * upon link down event. 
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed __diagused;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(txr->hn_tx_data_dtag, 2733 txd->data_dmap); 2734 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2735 } 2736 2737 if (txd->m != NULL) { 2738 m_freem(txd->m); 2739 txd->m = NULL; 2740 } 2741 2742 txd->flags |= HN_TXD_FLAG_ONLIST; 2743 #ifndef HN_USE_TXDESC_BUFRING 2744 mtx_lock_spin(&txr->hn_txlist_spin); 2745 KASSERT(txr->hn_txdesc_avail >= 0 && 2746 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2747 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2748 txr->hn_txdesc_avail++; 2749 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2750 mtx_unlock_spin(&txr->hn_txlist_spin); 2751 #else /* HN_USE_TXDESC_BUFRING */ 2752 #ifdef HN_DEBUG 2753 atomic_add_int(&txr->hn_txdesc_avail, 1); 2754 #endif 2755 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2756 #endif /* !HN_USE_TXDESC_BUFRING */ 2757 2758 return 1; 2759 } 2760 2761 static __inline struct hn_txdesc * 2762 hn_txdesc_get(struct hn_tx_ring *txr) 2763 { 2764 struct hn_txdesc *txd; 2765 2766 #ifndef HN_USE_TXDESC_BUFRING 2767 mtx_lock_spin(&txr->hn_txlist_spin); 2768 txd = SLIST_FIRST(&txr->hn_txlist); 2769 if (txd != NULL) { 2770 KASSERT(txr->hn_txdesc_avail > 0, 2771 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2772 txr->hn_txdesc_avail--; 2773 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2774 } 2775 mtx_unlock_spin(&txr->hn_txlist_spin); 2776 #else 2777 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2778 #endif 2779 2780 if (txd != NULL) { 2781 #ifdef HN_USE_TXDESC_BUFRING 2782 #ifdef HN_DEBUG 2783 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2784 #endif 2785 #endif /* HN_USE_TXDESC_BUFRING */ 2786 KASSERT(txd->m == NULL && txd->refs == 0 && 2787 STAILQ_EMPTY(&txd->agg_list) && 2788 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2789 txd->chim_size == 0 && 2790 (txd->flags & HN_TXD_FLAG_ONLIST) && 2791 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2792 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2793 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2794 txd->refs = 1; 2795 } 2796 return txd; 2797 } 2798 2799 static __inline void 2800 hn_txdesc_hold(struct hn_txdesc *txd) 2801 { 2802 2803 /* 0->1 transition will never work */ 2804 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2805 atomic_add_int(&txd->refs, 1); 2806 } 2807 2808 static __inline void 2809 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2810 { 2811 2812 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2813 ("recursive aggregation on aggregating txdesc")); 2814 2815 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2816 ("already aggregated")); 2817 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2818 ("recursive aggregation on to-be-aggregated txdesc")); 2819 2820 txd->flags |= HN_TXD_FLAG_ONAGG; 2821 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2822 } 2823 2824 static bool 2825 hn_tx_ring_pending(struct hn_tx_ring *txr) 2826 { 2827 bool pending = false; 2828 2829 #ifndef HN_USE_TXDESC_BUFRING 2830 mtx_lock_spin(&txr->hn_txlist_spin); 2831 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2832 pending = true; 2833 mtx_unlock_spin(&txr->hn_txlist_spin); 2834 #else 2835 if (!buf_ring_full(txr->hn_txdesc_br)) 2836 pending = true; 2837 #endif 2838 return (pending); 2839 } 2840 2841 static __inline void 2842 hn_txeof(struct hn_tx_ring *txr) 2843 { 2844 txr->hn_has_txeof = 0; 2845 txr->hn_txeof(txr); 2846 } 2847 2848 static void 2849 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2850 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2851 { 2852 struct hn_txdesc *txd = sndc->hn_cbarg; 2853 struct 
hn_tx_ring *txr; 2854 2855 txr = txd->txr; 2856 KASSERT(txr->hn_chan == chan, 2857 ("channel mismatch, on chan%u, should be chan%u", 2858 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2859 2860 txr->hn_has_txeof = 1; 2861 hn_txdesc_put(txr, txd); 2862 2863 ++txr->hn_txdone_cnt; 2864 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2865 txr->hn_txdone_cnt = 0; 2866 if (txr->hn_oactive) 2867 hn_txeof(txr); 2868 } 2869 } 2870 2871 static void 2872 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2873 { 2874 #if defined(INET) || defined(INET6) 2875 struct epoch_tracker et; 2876 2877 NET_EPOCH_ENTER(et); 2878 tcp_lro_flush_all(&rxr->hn_lro); 2879 NET_EPOCH_EXIT(et); 2880 #endif 2881 2882 /* 2883 * NOTE: 2884 * 'txr' could be NULL, if multiple channels and 2885 * ifnet.if_start method are enabled. 2886 */ 2887 if (txr == NULL || !txr->hn_has_txeof) 2888 return; 2889 2890 txr->hn_txdone_cnt = 0; 2891 hn_txeof(txr); 2892 } 2893 2894 static __inline uint32_t 2895 hn_rndis_pktmsg_offset(uint32_t ofs) 2896 { 2897 2898 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2899 ("invalid RNDIS packet msg offset %u", ofs)); 2900 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2901 } 2902 2903 static __inline void * 2904 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2905 size_t pi_dlen, uint32_t pi_type) 2906 { 2907 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2908 struct rndis_pktinfo *pi; 2909 2910 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2911 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2912 2913 /* 2914 * Per-packet-info does not move; it only grows. 2915 * 2916 * NOTE: 2917 * rm_pktinfooffset in this phase counts from the beginning 2918 * of rndis_packet_msg. 2919 */ 2920 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2921 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2922 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2923 pkt->rm_pktinfolen); 2924 pkt->rm_pktinfolen += pi_size; 2925 2926 pi->rm_size = pi_size; 2927 pi->rm_type = pi_type; 2928 pi->rm_internal = 0; 2929 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2930 2931 return (pi->rm_data); 2932 } 2933 2934 static __inline int 2935 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2936 { 2937 struct hn_txdesc *txd; 2938 struct mbuf *m; 2939 int error, pkts; 2940 2941 txd = txr->hn_agg_txd; 2942 KASSERT(txd != NULL, ("no aggregate txdesc")); 2943 2944 /* 2945 * Since hn_txpkt() will reset this temporary stat, save 2946 * it now, so that oerrors can be updated properly, if 2947 * hn_txpkt() ever fails. 2948 */ 2949 pkts = txr->hn_stat_pkts; 2950 2951 /* 2952 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2953 * failure, save it for later freeing, if hn_txpkt() ever 2954 * fails. 2955 */ 2956 m = txd->m; 2957 error = hn_txpkt(ifp, txr, txd); 2958 if (__predict_false(error)) { 2959 /* txd is freed, but m is not. */ 2960 m_freem(m); 2961 2962 txr->hn_flush_failed++; 2963 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2964 } 2965 2966 /* Reset all aggregation states. 
*/ 2967 txr->hn_agg_txd = NULL; 2968 txr->hn_agg_szleft = 0; 2969 txr->hn_agg_pktleft = 0; 2970 txr->hn_agg_prevpkt = NULL; 2971 2972 return (error); 2973 } 2974 2975 static void * 2976 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2977 int pktsize) 2978 { 2979 void *chim; 2980 2981 if (txr->hn_agg_txd != NULL) { 2982 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2983 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2984 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2985 int olen; 2986 2987 /* 2988 * Update the previous RNDIS packet's total length, 2989 * it can be increased due to the mandatory alignment 2990 * padding for this RNDIS packet. And update the 2991 * aggregating txdesc's chimney sending buffer size 2992 * accordingly. 2993 * 2994 * XXX 2995 * Zero-out the padding, as required by the RNDIS spec. 2996 */ 2997 olen = pkt->rm_len; 2998 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2999 agg_txd->chim_size += pkt->rm_len - olen; 3000 3001 /* Link this txdesc to the parent. */ 3002 hn_txdesc_agg(agg_txd, txd); 3003 3004 chim = (uint8_t *)pkt + pkt->rm_len; 3005 /* Save the current packet for later fixup. */ 3006 txr->hn_agg_prevpkt = chim; 3007 3008 txr->hn_agg_pktleft--; 3009 txr->hn_agg_szleft -= pktsize; 3010 if (txr->hn_agg_szleft <= 3011 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3012 /* 3013 * Probably can't aggregate more packets, 3014 * flush this aggregating txdesc proactively. 3015 */ 3016 txr->hn_agg_pktleft = 0; 3017 } 3018 /* Done! */ 3019 return (chim); 3020 } 3021 hn_flush_txagg(ifp, txr); 3022 } 3023 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3024 3025 txr->hn_tx_chimney_tried++; 3026 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3027 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3028 return (NULL); 3029 txr->hn_tx_chimney++; 3030 3031 chim = txr->hn_sc->hn_chim + 3032 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3033 3034 if (txr->hn_agg_pktmax > 1 && 3035 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3036 txr->hn_agg_txd = txd; 3037 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3038 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3039 txr->hn_agg_prevpkt = chim; 3040 } 3041 return (chim); 3042 } 3043 3044 /* 3045 * NOTE: 3046 * If this function fails, then both txd and m_head0 will be freed. 3047 */ 3048 static int 3049 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3050 struct mbuf **m_head0) 3051 { 3052 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3053 int error, nsegs, i; 3054 struct mbuf *m_head = *m_head0; 3055 struct rndis_packet_msg *pkt; 3056 uint32_t *pi_data; 3057 void *chim = NULL; 3058 int pkt_hlen, pkt_size; 3059 3060 pkt = txd->rndis_pkt; 3061 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3062 if (pkt_size < txr->hn_chim_size) { 3063 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3064 if (chim != NULL) 3065 pkt = chim; 3066 } else { 3067 if (txr->hn_agg_txd != NULL) 3068 hn_flush_txagg(ifp, txr); 3069 } 3070 3071 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3072 pkt->rm_len = m_head->m_pkthdr.len; 3073 pkt->rm_dataoffset = 0; 3074 pkt->rm_datalen = m_head->m_pkthdr.len; 3075 pkt->rm_oobdataoffset = 0; 3076 pkt->rm_oobdatalen = 0; 3077 pkt->rm_oobdataelements = 0; 3078 pkt->rm_pktinfooffset = sizeof(*pkt); 3079 pkt->rm_pktinfolen = 0; 3080 pkt->rm_vchandle = 0; 3081 pkt->rm_reserved = 0; 3082 3083 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3084 /* 3085 * Set the hash value for this packet. 
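		 * The value is carried to the host as a HASHVAL
		 * per-packet-info record appended to the RNDIS packet
		 * message by hn_rndis_pktinfo_append().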
3086 */ 3087 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3088 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3089 3090 if (M_HASHTYPE_ISHASH(m_head)) 3091 /* 3092 * The flowid field contains the hash value host 3093 * set in the rx queue if it is a ip forwarding pkt. 3094 * Set the same hash value so host can send on the 3095 * cpu it was received. 3096 */ 3097 *pi_data = m_head->m_pkthdr.flowid; 3098 else 3099 /* 3100 * Otherwise just put the tx queue index. 3101 */ 3102 *pi_data = txr->hn_tx_idx; 3103 } 3104 3105 if (m_head->m_flags & M_VLANTAG) { 3106 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3107 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3108 *pi_data = NDIS_VLAN_INFO_MAKE( 3109 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3110 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3111 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3112 } 3113 3114 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3115 #if defined(INET6) || defined(INET) 3116 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3117 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3118 #ifdef INET 3119 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3120 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3121 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3122 m_head->m_pkthdr.tso_segsz); 3123 } 3124 #endif 3125 #if defined(INET6) && defined(INET) 3126 else 3127 #endif 3128 #ifdef INET6 3129 { 3130 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3131 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3132 m_head->m_pkthdr.tso_segsz); 3133 } 3134 #endif 3135 #endif /* INET6 || INET */ 3136 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3137 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3138 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3139 if (m_head->m_pkthdr.csum_flags & 3140 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3141 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3142 } else { 3143 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3144 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3145 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3146 } 3147 3148 if (m_head->m_pkthdr.csum_flags & 3149 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3150 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3151 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3152 } else if (m_head->m_pkthdr.csum_flags & 3153 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3154 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3155 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3156 } 3157 } 3158 3159 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3160 /* Fixup RNDIS packet message total length */ 3161 pkt->rm_len += pkt_hlen; 3162 /* Convert RNDIS packet message offsets */ 3163 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3164 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3165 3166 /* 3167 * Fast path: Chimney sending. 
3168 */ 3169 if (chim != NULL) { 3170 struct hn_txdesc *tgt_txd = txd; 3171 3172 if (txr->hn_agg_txd != NULL) { 3173 tgt_txd = txr->hn_agg_txd; 3174 #ifdef INVARIANTS 3175 *m_head0 = NULL; 3176 #endif 3177 } 3178 3179 KASSERT(pkt == chim, 3180 ("RNDIS pkt not in chimney sending buffer")); 3181 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3182 ("chimney sending buffer is not used")); 3183 tgt_txd->chim_size += pkt->rm_len; 3184 3185 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3186 ((uint8_t *)chim) + pkt_hlen); 3187 3188 txr->hn_gpa_cnt = 0; 3189 txr->hn_sendpkt = hn_txpkt_chim; 3190 goto done; 3191 } 3192 3193 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3194 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3195 ("chimney buffer is used")); 3196 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3197 3198 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3199 if (__predict_false(error)) { 3200 int freed __diagused; 3201 3202 /* 3203 * This mbuf is not linked w/ the txd yet, so free it now. 3204 */ 3205 m_freem(m_head); 3206 *m_head0 = NULL; 3207 3208 freed = hn_txdesc_put(txr, txd); 3209 KASSERT(freed != 0, 3210 ("fail to free txd upon txdma error")); 3211 3212 txr->hn_txdma_failed++; 3213 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3214 return error; 3215 } 3216 *m_head0 = m_head; 3217 3218 /* +1 RNDIS packet message */ 3219 txr->hn_gpa_cnt = nsegs + 1; 3220 3221 /* send packet with page buffer */ 3222 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3223 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3224 txr->hn_gpa[0].gpa_len = pkt_hlen; 3225 3226 /* 3227 * Fill the page buffers with mbuf info after the page 3228 * buffer for RNDIS packet message. 3229 */ 3230 for (i = 0; i < nsegs; ++i) { 3231 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3232 3233 gpa->gpa_page = atop(segs[i].ds_addr); 3234 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3235 gpa->gpa_len = segs[i].ds_len; 3236 } 3237 3238 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3239 txd->chim_size = 0; 3240 txr->hn_sendpkt = hn_txpkt_sglist; 3241 done: 3242 txd->m = m_head; 3243 3244 /* Set the completion routine */ 3245 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3246 3247 /* Update temporary stats for later use. */ 3248 txr->hn_stat_pkts++; 3249 txr->hn_stat_size += m_head->m_pkthdr.len; 3250 if (m_head->m_flags & M_MCAST) 3251 txr->hn_stat_mcasts++; 3252 3253 return 0; 3254 } 3255 3256 /* 3257 * NOTE: 3258 * If this function fails, then txd will be freed, but the mbuf 3259 * associated w/ the txd will _not_ be freed. 3260 */ 3261 static int 3262 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3263 { 3264 int error, send_failed = 0, has_bpf; 3265 3266 again: 3267 has_bpf = bpf_peers_present(ifp->if_bpf); 3268 if (has_bpf) { 3269 /* 3270 * Make sure that this txd and any aggregated txds are not 3271 * freed before ETHER_BPF_MTAP. 
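		 * hn_txdesc_hold() takes an extra reference here; the
		 * matching hn_txdesc_put() after the taps drops it.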
3272 */ 3273 hn_txdesc_hold(txd); 3274 } 3275 error = txr->hn_sendpkt(txr, txd); 3276 if (!error) { 3277 if (has_bpf) { 3278 const struct hn_txdesc *tmp_txd; 3279 3280 ETHER_BPF_MTAP(ifp, txd->m); 3281 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3282 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3283 } 3284 3285 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3286 #ifdef HN_IFSTART_SUPPORT 3287 if (!hn_use_if_start) 3288 #endif 3289 { 3290 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3291 txr->hn_stat_size); 3292 if (txr->hn_stat_mcasts != 0) { 3293 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3294 txr->hn_stat_mcasts); 3295 } 3296 } 3297 txr->hn_pkts += txr->hn_stat_pkts; 3298 txr->hn_sends++; 3299 } 3300 if (has_bpf) 3301 hn_txdesc_put(txr, txd); 3302 3303 if (__predict_false(error)) { 3304 int freed __diagused; 3305 3306 /* 3307 * This should "really rarely" happen. 3308 * 3309 * XXX Too many RX to be acked or too many sideband 3310 * commands to run? Ask netvsc_channel_rollup() 3311 * to kick start later. 3312 */ 3313 txr->hn_has_txeof = 1; 3314 if (!send_failed) { 3315 txr->hn_send_failed++; 3316 send_failed = 1; 3317 /* 3318 * Try sending again after set hn_has_txeof; 3319 * in case that we missed the last 3320 * netvsc_channel_rollup(). 3321 */ 3322 goto again; 3323 } 3324 if_printf(ifp, "send failed\n"); 3325 3326 /* 3327 * Caller will perform further processing on the 3328 * associated mbuf, so don't free it in hn_txdesc_put(); 3329 * only unload it from the DMA map in hn_txdesc_put(), 3330 * if it was loaded. 3331 */ 3332 txd->m = NULL; 3333 freed = hn_txdesc_put(txr, txd); 3334 KASSERT(freed != 0, 3335 ("fail to free txd upon send error")); 3336 3337 txr->hn_send_failed++; 3338 } 3339 3340 /* Reset temporary stats, after this sending is done. */ 3341 txr->hn_stat_size = 0; 3342 txr->hn_stat_pkts = 0; 3343 txr->hn_stat_mcasts = 0; 3344 3345 return (error); 3346 } 3347 3348 /* 3349 * Append the specified data to the indicated mbuf chain, 3350 * Extend the mbuf chain if the new data does not fit in 3351 * existing space. 3352 * 3353 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3354 * There should be an equivalent in the kernel mbuf code, 3355 * but there does not appear to be one yet. 3356 * 3357 * Differs from m_append() in that additional mbufs are 3358 * allocated with cluster size MJUMPAGESIZE, and filled 3359 * accordingly. 3360 * 3361 * Return the last mbuf in the chain or NULL if failed to 3362 * allocate new mbuf. 3363 */ 3364 static struct mbuf * 3365 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3366 { 3367 struct mbuf *m, *n; 3368 int remainder, space; 3369 3370 for (m = m0; m->m_next != NULL; m = m->m_next) 3371 ; 3372 remainder = len; 3373 space = M_TRAILINGSPACE(m); 3374 if (space > 0) { 3375 /* 3376 * Copy into available space. 3377 */ 3378 if (space > remainder) 3379 space = remainder; 3380 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3381 m->m_len += space; 3382 cp += space; 3383 remainder -= space; 3384 } 3385 while (remainder > 0) { 3386 /* 3387 * Allocate a new mbuf; could check space 3388 * and allocate a cluster instead. 
3389 */ 3390 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3391 if (n == NULL) 3392 return NULL; 3393 n->m_len = min(MJUMPAGESIZE, remainder); 3394 bcopy(cp, mtod(n, caddr_t), n->m_len); 3395 cp += n->m_len; 3396 remainder -= n->m_len; 3397 m->m_next = n; 3398 m = n; 3399 } 3400 3401 return m; 3402 } 3403 3404 #if defined(INET) || defined(INET6) 3405 static __inline int 3406 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3407 { 3408 if (hn_lro_mbufq_depth) { 3409 tcp_lro_queue_mbuf(lc, m); 3410 return 0; 3411 } 3412 return tcp_lro_rx(lc, m, 0); 3413 } 3414 #endif 3415 3416 static int 3417 hn_rxpkt(struct hn_rx_ring *rxr) 3418 { 3419 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3420 struct mbuf *m_new, *n; 3421 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3422 int hash_type = M_HASHTYPE_NONE; 3423 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3424 int i; 3425 3426 ifp = hn_ifp; 3427 if (rxr->hn_rxvf_ifp != NULL) { 3428 /* 3429 * Non-transparent mode VF; pretend this packet is from 3430 * the VF. 3431 */ 3432 ifp = rxr->hn_rxvf_ifp; 3433 is_vf = 1; 3434 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3435 /* Transparent mode VF. */ 3436 is_vf = 1; 3437 } 3438 3439 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3440 /* 3441 * NOTE: 3442 * See the NOTE of hn_rndis_init_fixat(). This 3443 * function can be reached, immediately after the 3444 * RNDIS is initialized but before the ifnet is 3445 * setup on the hn_attach() path; drop the unexpected 3446 * packets. 3447 */ 3448 return (0); 3449 } 3450 3451 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3452 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3453 return (0); 3454 } 3455 3456 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3457 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3458 if (m_new == NULL) { 3459 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3460 return (0); 3461 } 3462 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3463 rxr->rsc.frag_len[0]); 3464 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3465 } else { 3466 /* 3467 * Get an mbuf with a cluster. For packets 2K or less, 3468 * get a standard 2K cluster. For anything larger, get a 3469 * 4K cluster. Any buffers larger than 4K can cause problems 3470 * if looped around to the Hyper-V TX channel, so avoid them. 
3471 */ 3472 size = MCLBYTES; 3473 if (rxr->rsc.pktlen > MCLBYTES) { 3474 /* 4096 */ 3475 size = MJUMPAGESIZE; 3476 } 3477 3478 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3479 if (m_new == NULL) { 3480 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3481 return (0); 3482 } 3483 3484 n = m_new; 3485 for (i = 0; i < rxr->rsc.cnt; i++) { 3486 n = hv_m_append(n, rxr->rsc.frag_len[i], 3487 rxr->rsc.frag_data[i]); 3488 if (n == NULL) { 3489 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3490 return (0); 3491 } else { 3492 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3493 } 3494 } 3495 } 3496 if (rxr->rsc.pktlen <= MHLEN) 3497 rxr->hn_small_pkts++; 3498 3499 m_new->m_pkthdr.rcvif = ifp; 3500 3501 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3502 do_csum = 0; 3503 3504 /* receive side checksum offload */ 3505 if (rxr->rsc.csum_info != NULL) { 3506 /* IP csum offload */ 3507 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3508 m_new->m_pkthdr.csum_flags |= 3509 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3510 rxr->hn_csum_ip++; 3511 } 3512 3513 /* TCP/UDP csum offload */ 3514 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3515 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3516 m_new->m_pkthdr.csum_flags |= 3517 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3518 m_new->m_pkthdr.csum_data = 0xffff; 3519 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3520 rxr->hn_csum_tcp++; 3521 else 3522 rxr->hn_csum_udp++; 3523 } 3524 3525 /* 3526 * XXX 3527 * As of this write (Oct 28th, 2016), host side will turn 3528 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3529 * the do_lro setting here is actually _not_ accurate. We 3530 * depend on the RSS hash type check to reset do_lro. 3531 */ 3532 if ((*(rxr->rsc.csum_info) & 3533 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3534 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3535 do_lro = 1; 3536 } else { 3537 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3538 if (l3proto == ETHERTYPE_IP) { 3539 if (l4proto == IPPROTO_TCP) { 3540 if (do_csum && 3541 (rxr->hn_trust_hcsum & 3542 HN_TRUST_HCSUM_TCP)) { 3543 rxr->hn_csum_trusted++; 3544 m_new->m_pkthdr.csum_flags |= 3545 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3546 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3547 m_new->m_pkthdr.csum_data = 0xffff; 3548 } 3549 do_lro = 1; 3550 } else if (l4proto == IPPROTO_UDP) { 3551 if (do_csum && 3552 (rxr->hn_trust_hcsum & 3553 HN_TRUST_HCSUM_UDP)) { 3554 rxr->hn_csum_trusted++; 3555 m_new->m_pkthdr.csum_flags |= 3556 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3557 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3558 m_new->m_pkthdr.csum_data = 0xffff; 3559 } 3560 } else if (l4proto != IPPROTO_DONE && do_csum && 3561 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3562 rxr->hn_csum_trusted++; 3563 m_new->m_pkthdr.csum_flags |= 3564 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3565 } 3566 } 3567 } 3568 3569 if (rxr->rsc.vlan_info != NULL) { 3570 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3571 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3572 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3573 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3574 m_new->m_flags |= M_VLANTAG; 3575 } 3576 3577 /* 3578 * If VF is activated (tranparent/non-transparent mode does not 3579 * matter here). 3580 * 3581 * - Disable LRO 3582 * 3583 * hn(4) will only receive broadcast packets, multicast packets, 3584 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3585 * packet types. 
3586 * 3587 * For non-transparent, we definitely _cannot_ enable LRO at 3588 * all, since the LRO flush will use hn(4) as the receiving 3589 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3590 */ 3591 if (is_vf) 3592 do_lro = 0; 3593 3594 /* 3595 * If VF is activated (tranparent/non-transparent mode does not 3596 * matter here), do _not_ mess with unsupported hash types or 3597 * functions. 3598 */ 3599 if (rxr->rsc.hash_info != NULL) { 3600 rxr->hn_rss_pkts++; 3601 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3602 if (!is_vf) 3603 hash_type = M_HASHTYPE_OPAQUE_HASH; 3604 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3605 NDIS_HASH_FUNCTION_TOEPLITZ) { 3606 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3607 rxr->hn_mbuf_hash); 3608 3609 /* 3610 * NOTE: 3611 * do_lro is resetted, if the hash types are not TCP 3612 * related. See the comment in the above csum_flags 3613 * setup section. 3614 */ 3615 switch (type) { 3616 case NDIS_HASH_IPV4: 3617 hash_type = M_HASHTYPE_RSS_IPV4; 3618 do_lro = 0; 3619 break; 3620 3621 case NDIS_HASH_TCP_IPV4: 3622 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3623 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3624 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3625 3626 if (is_vf) 3627 def_htype = M_HASHTYPE_NONE; 3628 3629 /* 3630 * UDP 4-tuple hash is delivered as 3631 * TCP 4-tuple hash. 3632 */ 3633 if (l3proto == ETHERTYPE_MAX) { 3634 hn_rxpkt_proto(m_new, 3635 &l3proto, &l4proto); 3636 } 3637 if (l3proto == ETHERTYPE_IP) { 3638 if (l4proto == IPPROTO_UDP && 3639 (rxr->hn_mbuf_hash & 3640 NDIS_HASH_UDP_IPV4_X)) { 3641 hash_type = 3642 M_HASHTYPE_RSS_UDP_IPV4; 3643 do_lro = 0; 3644 } else if (l4proto != 3645 IPPROTO_TCP) { 3646 hash_type = def_htype; 3647 do_lro = 0; 3648 } 3649 } else { 3650 hash_type = def_htype; 3651 do_lro = 0; 3652 } 3653 } 3654 break; 3655 3656 case NDIS_HASH_IPV6: 3657 hash_type = M_HASHTYPE_RSS_IPV6; 3658 do_lro = 0; 3659 break; 3660 3661 case NDIS_HASH_IPV6_EX: 3662 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3663 do_lro = 0; 3664 break; 3665 3666 case NDIS_HASH_TCP_IPV6: 3667 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3668 break; 3669 3670 case NDIS_HASH_TCP_IPV6_EX: 3671 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3672 break; 3673 } 3674 } 3675 } else if (!is_vf) { 3676 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3677 hash_type = M_HASHTYPE_OPAQUE; 3678 } 3679 M_HASHTYPE_SET(m_new, hash_type); 3680 3681 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3682 if (hn_ifp != ifp) { 3683 const struct ether_header *eh; 3684 3685 /* 3686 * Non-transparent mode VF is activated. 3687 */ 3688 3689 /* 3690 * Allow tapping on hn(4). 3691 */ 3692 ETHER_BPF_MTAP(hn_ifp, m_new); 3693 3694 /* 3695 * Update hn(4)'s stats. 3696 */ 3697 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3698 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3699 /* Checked at the beginning of this function. */ 3700 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3701 eh = mtod(m_new, struct ether_header *); 3702 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3703 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3704 } 3705 rxr->hn_pkts++; 3706 3707 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3708 #if defined(INET) || defined(INET6) 3709 struct lro_ctrl *lro = &rxr->hn_lro; 3710 3711 if (lro->lro_cnt) { 3712 rxr->hn_lro_tried++; 3713 if (hn_lro_rx(lro, m_new) == 0) { 3714 /* DONE! 
				 */
				return 0;
			}
		}
#endif
	}
	ifp->if_input(ifp, m_new);

	return (0);
}

static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct hn_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
	struct ifnet *vf_ifp;
	int mask, error = 0;
	struct ifrsskey *ifrk;
	struct ifrsshash *ifrh;
	uint32_t mtu;

	switch (cmd) {
	case SIOCSIFMTU:
		if (ifr->ifr_mtu > HN_MTU_MAX) {
			error = EINVAL;
			break;
		}

		HN_LOCK(sc);

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
			HN_UNLOCK(sc);
			break;
		}

		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
			/* Can't change MTU */
			HN_UNLOCK(sc);
			error = EOPNOTSUPP;
			break;
		}

		if (ifp->if_mtu == ifr->ifr_mtu) {
			HN_UNLOCK(sc);
			break;
		}

		if (hn_xpnt_vf_isready(sc)) {
			vf_ifp = sc->hn_vf_ifp;
			ifr_vf = *ifr;
			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
			    sizeof(ifr_vf.ifr_name));
			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
			    (caddr_t)&ifr_vf);
			if (error) {
				HN_UNLOCK(sc);
				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
				    vf_ifp->if_xname, ifr->ifr_mtu, error);
				break;
			}
		}

		/*
		 * Suspend this interface before the synthetic parts
		 * are ripped.
		 */
		hn_suspend(sc);

		/*
		 * Detach the synthetic parts, i.e. NVS and RNDIS.
		 */
		hn_synth_detach(sc);

		/*
		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
		 * with the new MTU setting.
		 */
		error = hn_synth_attach(sc, ifr->ifr_mtu);
		if (error) {
			HN_UNLOCK(sc);
			break;
		}

		error = hn_rndis_get_mtu(sc, &mtu);
		if (error)
			mtu = ifr->ifr_mtu;
		else if (bootverbose)
			if_printf(ifp, "RNDIS mtu %u\n", mtu);

		/*
		 * Commit the requested MTU, after the synthetic parts
		 * have been successfully attached.
		 */
		if (mtu >= ifr->ifr_mtu) {
			mtu = ifr->ifr_mtu;
		} else {
			if_printf(ifp, "fixup mtu %d -> %u\n",
			    ifr->ifr_mtu, mtu);
		}
		ifp->if_mtu = mtu;

		/*
		 * Synthetic parts' reattach may change the chimney
		 * sending size; update it.
		 */
		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
			hn_set_chim_size(sc, sc->hn_chim_szmax);

		/*
		 * Make sure that various parameters based on MTU are
		 * still valid, after the MTU change.
		 */
		hn_mtu_change_fixup(sc);

		/*
		 * All done!  Resume the interface now.
		 */
		hn_resume(sc);

		if ((sc->hn_flags & HN_FLAG_RXVF) ||
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
			/*
			 * Since we have reattached the NVS part,
			 * change the datapath to VF again, in case
			 * it was lost after the NVS was detached.
			 */
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
		}

		HN_UNLOCK(sc);
		break;

	case SIOCSIFFLAGS:
		HN_LOCK(sc);

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
			HN_UNLOCK(sc);
			break;
		}

		if (hn_xpnt_vf_isready(sc))
			hn_xpnt_vf_saveifflags(sc);

		if (ifp->if_flags & IFF_UP) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				/*
				 * Caller might hold a mutex, e.g.
				 * bpf; use busy-wait for the RNDIS
				 * reply.
3864 */ 3865 HN_NO_SLEEPING(sc); 3866 hn_rxfilter_config(sc); 3867 HN_SLEEPING_OK(sc); 3868 3869 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3870 error = hn_xpnt_vf_iocsetflags(sc); 3871 } else { 3872 hn_init_locked(sc); 3873 } 3874 } else { 3875 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3876 hn_stop(sc, false); 3877 } 3878 sc->hn_if_flags = ifp->if_flags; 3879 3880 HN_UNLOCK(sc); 3881 break; 3882 3883 case SIOCSIFCAP: 3884 HN_LOCK(sc); 3885 3886 if (hn_xpnt_vf_isready(sc)) { 3887 ifr_vf = *ifr; 3888 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3889 sizeof(ifr_vf.ifr_name)); 3890 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3891 HN_UNLOCK(sc); 3892 break; 3893 } 3894 3895 /* 3896 * Fix up requested capabilities w/ supported capabilities, 3897 * since the supported capabilities could have been changed. 3898 */ 3899 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3900 ifp->if_capenable; 3901 3902 if (mask & IFCAP_TXCSUM) { 3903 ifp->if_capenable ^= IFCAP_TXCSUM; 3904 if (ifp->if_capenable & IFCAP_TXCSUM) 3905 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3906 else 3907 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3908 } 3909 if (mask & IFCAP_TXCSUM_IPV6) { 3910 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3911 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3912 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3913 else 3914 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3915 } 3916 3917 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3918 if (mask & IFCAP_RXCSUM) 3919 ifp->if_capenable ^= IFCAP_RXCSUM; 3920 #ifdef foo 3921 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3922 if (mask & IFCAP_RXCSUM_IPV6) 3923 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3924 #endif 3925 3926 if (mask & IFCAP_LRO) 3927 ifp->if_capenable ^= IFCAP_LRO; 3928 3929 if (mask & IFCAP_TSO4) { 3930 ifp->if_capenable ^= IFCAP_TSO4; 3931 if (ifp->if_capenable & IFCAP_TSO4) 3932 ifp->if_hwassist |= CSUM_IP_TSO; 3933 else 3934 ifp->if_hwassist &= ~CSUM_IP_TSO; 3935 } 3936 if (mask & IFCAP_TSO6) { 3937 ifp->if_capenable ^= IFCAP_TSO6; 3938 if (ifp->if_capenable & IFCAP_TSO6) 3939 ifp->if_hwassist |= CSUM_IP6_TSO; 3940 else 3941 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3942 } 3943 3944 HN_UNLOCK(sc); 3945 break; 3946 3947 case SIOCADDMULTI: 3948 case SIOCDELMULTI: 3949 HN_LOCK(sc); 3950 3951 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3952 HN_UNLOCK(sc); 3953 break; 3954 } 3955 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3956 /* 3957 * Multicast uses mutex; use busy-wait for 3958 * the RNDIS reply. 3959 */ 3960 HN_NO_SLEEPING(sc); 3961 hn_rxfilter_config(sc); 3962 HN_SLEEPING_OK(sc); 3963 } 3964 3965 /* XXX vlan(4) style mcast addr maintenance */ 3966 if (hn_xpnt_vf_isready(sc)) { 3967 int old_if_flags; 3968 3969 old_if_flags = sc->hn_vf_ifp->if_flags; 3970 hn_xpnt_vf_saveifflags(sc); 3971 3972 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3973 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3974 IFF_ALLMULTI)) 3975 error = hn_xpnt_vf_iocsetflags(sc); 3976 } 3977 3978 HN_UNLOCK(sc); 3979 break; 3980 3981 case SIOCSIFMEDIA: 3982 case SIOCGIFMEDIA: 3983 HN_LOCK(sc); 3984 if (hn_xpnt_vf_isready(sc)) { 3985 /* 3986 * SIOCGIFMEDIA expects ifmediareq, so don't 3987 * create and pass ifr_vf to the VF here; just 3988 * replace the ifr_name. 3989 */ 3990 vf_ifp = sc->hn_vf_ifp; 3991 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 3992 sizeof(ifr->ifr_name)); 3993 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 3994 /* Restore the ifr_name. 
*/ 3995 strlcpy(ifr->ifr_name, ifp->if_xname, 3996 sizeof(ifr->ifr_name)); 3997 HN_UNLOCK(sc); 3998 break; 3999 } 4000 HN_UNLOCK(sc); 4001 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4002 break; 4003 4004 case SIOCGIFRSSHASH: 4005 ifrh = (struct ifrsshash *)data; 4006 HN_LOCK(sc); 4007 if (sc->hn_rx_ring_inuse == 1) { 4008 HN_UNLOCK(sc); 4009 ifrh->ifrh_func = RSS_FUNC_NONE; 4010 ifrh->ifrh_types = 0; 4011 break; 4012 } 4013 4014 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4015 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4016 else 4017 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4018 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4019 HN_UNLOCK(sc); 4020 break; 4021 4022 case SIOCGIFRSSKEY: 4023 ifrk = (struct ifrsskey *)data; 4024 HN_LOCK(sc); 4025 if (sc->hn_rx_ring_inuse == 1) { 4026 HN_UNLOCK(sc); 4027 ifrk->ifrk_func = RSS_FUNC_NONE; 4028 ifrk->ifrk_keylen = 0; 4029 break; 4030 } 4031 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4032 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4033 else 4034 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4035 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4036 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4037 NDIS_HASH_KEYSIZE_TOEPLITZ); 4038 HN_UNLOCK(sc); 4039 break; 4040 4041 default: 4042 error = ether_ioctl(ifp, cmd, data); 4043 break; 4044 } 4045 return (error); 4046 } 4047 4048 static void 4049 hn_stop(struct hn_softc *sc, bool detaching) 4050 { 4051 struct ifnet *ifp = sc->hn_ifp; 4052 int i; 4053 4054 HN_LOCK_ASSERT(sc); 4055 4056 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4057 ("synthetic parts were not attached")); 4058 4059 /* Clear RUNNING bit ASAP. */ 4060 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4061 4062 /* Disable polling. */ 4063 hn_polling(sc, 0); 4064 4065 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4066 KASSERT(sc->hn_vf_ifp != NULL, 4067 ("%s: VF is not attached", ifp->if_xname)); 4068 4069 /* Mark transparent mode VF as disabled. */ 4070 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4071 4072 /* 4073 * NOTE: 4074 * Datapath setting must happen _before_ bringing 4075 * the VF down. 4076 */ 4077 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4078 4079 /* 4080 * Bring the VF down. 4081 */ 4082 hn_xpnt_vf_saveifflags(sc); 4083 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4084 hn_xpnt_vf_iocsetflags(sc); 4085 } 4086 4087 /* Suspend data transfers. */ 4088 hn_suspend_data(sc); 4089 4090 /* Clear OACTIVE bit. */ 4091 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4092 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4093 sc->hn_tx_ring[i].hn_oactive = 0; 4094 4095 /* 4096 * If the non-transparent mode VF is active, make sure 4097 * that the RX filter still allows packet reception. 4098 */ 4099 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4100 hn_rxfilter_config(sc); 4101 } 4102 4103 static void 4104 hn_init_locked(struct hn_softc *sc) 4105 { 4106 struct ifnet *ifp = sc->hn_ifp; 4107 int i; 4108 4109 HN_LOCK_ASSERT(sc); 4110 4111 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4112 return; 4113 4114 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4115 return; 4116 4117 /* Configure RX filter */ 4118 hn_rxfilter_config(sc); 4119 4120 /* Clear OACTIVE bit. */ 4121 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4122 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4123 sc->hn_tx_ring[i].hn_oactive = 0; 4124 4125 /* Clear TX 'suspended' bit. */ 4126 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4127 4128 if (hn_xpnt_vf_isready(sc)) { 4129 /* Initialize transparent VF. 
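		 * hn_xpnt_vf_isready() has just confirmed that the VF
		 * is attached and usable; IFF_DRV_RUNNING is only set
		 * after this initialization completes.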
*/ 4130 hn_xpnt_vf_init(sc); 4131 } 4132 4133 /* Everything is ready; unleash! */ 4134 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4135 4136 /* Re-enable polling if requested. */ 4137 if (sc->hn_pollhz > 0) 4138 hn_polling(sc, sc->hn_pollhz); 4139 } 4140 4141 static void 4142 hn_init(void *xsc) 4143 { 4144 struct hn_softc *sc = xsc; 4145 4146 HN_LOCK(sc); 4147 hn_init_locked(sc); 4148 HN_UNLOCK(sc); 4149 } 4150 4151 static int 4152 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4153 { 4154 struct hn_softc *sc = arg1; 4155 unsigned int lenlim; 4156 int error; 4157 4158 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4159 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4160 if (error || req->newptr == NULL) 4161 return error; 4162 4163 HN_LOCK(sc); 4164 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4165 lenlim > TCP_LRO_LENGTH_MAX) { 4166 HN_UNLOCK(sc); 4167 return EINVAL; 4168 } 4169 hn_set_lro_lenlim(sc, lenlim); 4170 HN_UNLOCK(sc); 4171 4172 return 0; 4173 } 4174 4175 static int 4176 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4177 { 4178 struct hn_softc *sc = arg1; 4179 int ackcnt, error, i; 4180 4181 /* 4182 * lro_ackcnt_lim is append count limit, 4183 * +1 to turn it into aggregation limit. 4184 */ 4185 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4186 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4187 if (error || req->newptr == NULL) 4188 return error; 4189 4190 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4191 return EINVAL; 4192 4193 /* 4194 * Convert aggregation limit back to append 4195 * count limit. 4196 */ 4197 --ackcnt; 4198 HN_LOCK(sc); 4199 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4200 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4201 HN_UNLOCK(sc); 4202 return 0; 4203 } 4204 4205 static int 4206 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4207 { 4208 struct hn_softc *sc = arg1; 4209 int hcsum = arg2; 4210 int on, error, i; 4211 4212 on = 0; 4213 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4214 on = 1; 4215 4216 error = sysctl_handle_int(oidp, &on, 0, req); 4217 if (error || req->newptr == NULL) 4218 return error; 4219 4220 HN_LOCK(sc); 4221 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4222 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4223 4224 if (on) 4225 rxr->hn_trust_hcsum |= hcsum; 4226 else 4227 rxr->hn_trust_hcsum &= ~hcsum; 4228 } 4229 HN_UNLOCK(sc); 4230 return 0; 4231 } 4232 4233 static int 4234 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4235 { 4236 struct hn_softc *sc = arg1; 4237 int chim_size, error; 4238 4239 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4240 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4241 if (error || req->newptr == NULL) 4242 return error; 4243 4244 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4245 return EINVAL; 4246 4247 HN_LOCK(sc); 4248 hn_set_chim_size(sc, chim_size); 4249 HN_UNLOCK(sc); 4250 return 0; 4251 } 4252 4253 static int 4254 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4255 { 4256 struct hn_softc *sc = arg1; 4257 int ofs = arg2, i, error; 4258 struct hn_rx_ring *rxr; 4259 uint64_t stat; 4260 4261 stat = 0; 4262 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4263 rxr = &sc->hn_rx_ring[i]; 4264 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4265 } 4266 4267 error = sysctl_handle_64(oidp, &stat, 0, req); 4268 if (error || req->newptr == NULL) 4269 return error; 4270 4271 /* Zero out this stat. 
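	 * Any write to this node (the written value itself is ignored)
	 * zeroes the per-ring counters that were just summed, e.g.
	 * (hypothetical unit number):
	 *
	 *	sysctl dev.hn.0.lro_queued=0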
*/ 4272 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4273 rxr = &sc->hn_rx_ring[i]; 4274 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4275 } 4276 return 0; 4277 } 4278 4279 static int 4280 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4281 { 4282 struct hn_softc *sc = arg1; 4283 int ofs = arg2, i, error; 4284 struct hn_rx_ring *rxr; 4285 u_long stat; 4286 4287 stat = 0; 4288 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4289 rxr = &sc->hn_rx_ring[i]; 4290 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4291 } 4292 4293 error = sysctl_handle_long(oidp, &stat, 0, req); 4294 if (error || req->newptr == NULL) 4295 return error; 4296 4297 /* Zero out this stat. */ 4298 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4299 rxr = &sc->hn_rx_ring[i]; 4300 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4301 } 4302 return 0; 4303 } 4304 4305 static int 4306 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4307 { 4308 struct hn_softc *sc = arg1; 4309 int ofs = arg2, i, error; 4310 struct hn_tx_ring *txr; 4311 u_long stat; 4312 4313 stat = 0; 4314 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4315 txr = &sc->hn_tx_ring[i]; 4316 stat += *((u_long *)((uint8_t *)txr + ofs)); 4317 } 4318 4319 error = sysctl_handle_long(oidp, &stat, 0, req); 4320 if (error || req->newptr == NULL) 4321 return error; 4322 4323 /* Zero out this stat. */ 4324 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4325 txr = &sc->hn_tx_ring[i]; 4326 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4327 } 4328 return 0; 4329 } 4330 4331 static int 4332 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4333 { 4334 struct hn_softc *sc = arg1; 4335 int ofs = arg2, i, error, conf; 4336 struct hn_tx_ring *txr; 4337 4338 txr = &sc->hn_tx_ring[0]; 4339 conf = *((int *)((uint8_t *)txr + ofs)); 4340 4341 error = sysctl_handle_int(oidp, &conf, 0, req); 4342 if (error || req->newptr == NULL) 4343 return error; 4344 4345 HN_LOCK(sc); 4346 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4347 txr = &sc->hn_tx_ring[i]; 4348 *((int *)((uint8_t *)txr + ofs)) = conf; 4349 } 4350 HN_UNLOCK(sc); 4351 4352 return 0; 4353 } 4354 4355 static int 4356 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4357 { 4358 struct hn_softc *sc = arg1; 4359 int error, size; 4360 4361 size = sc->hn_agg_size; 4362 error = sysctl_handle_int(oidp, &size, 0, req); 4363 if (error || req->newptr == NULL) 4364 return (error); 4365 4366 HN_LOCK(sc); 4367 sc->hn_agg_size = size; 4368 hn_set_txagg(sc); 4369 HN_UNLOCK(sc); 4370 4371 return (0); 4372 } 4373 4374 static int 4375 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4376 { 4377 struct hn_softc *sc = arg1; 4378 int error, pkts; 4379 4380 pkts = sc->hn_agg_pkts; 4381 error = sysctl_handle_int(oidp, &pkts, 0, req); 4382 if (error || req->newptr == NULL) 4383 return (error); 4384 4385 HN_LOCK(sc); 4386 sc->hn_agg_pkts = pkts; 4387 hn_set_txagg(sc); 4388 HN_UNLOCK(sc); 4389 4390 return (0); 4391 } 4392 4393 static int 4394 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4395 { 4396 struct hn_softc *sc = arg1; 4397 int pkts; 4398 4399 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4400 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4401 } 4402 4403 static int 4404 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4405 { 4406 struct hn_softc *sc = arg1; 4407 int align; 4408 4409 align = sc->hn_tx_ring[0].hn_agg_align; 4410 return (sysctl_handle_int(oidp, &align, 0, req)); 4411 } 4412 4413 static void 4414 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4415 { 4416 if (pollhz == 0) 4417 vmbus_chan_poll_disable(chan); 4418 else 4419 vmbus_chan_poll_enable(chan, pollhz); 4420 } 4421 4422 static void 4423 
hn_polling(struct hn_softc *sc, u_int pollhz) 4424 { 4425 int nsubch = sc->hn_rx_ring_inuse - 1; 4426 4427 HN_LOCK_ASSERT(sc); 4428 4429 if (nsubch > 0) { 4430 struct vmbus_channel **subch; 4431 int i; 4432 4433 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4434 for (i = 0; i < nsubch; ++i) 4435 hn_chan_polling(subch[i], pollhz); 4436 vmbus_subchan_rel(subch, nsubch); 4437 } 4438 hn_chan_polling(sc->hn_prichan, pollhz); 4439 } 4440 4441 static int 4442 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4443 { 4444 struct hn_softc *sc = arg1; 4445 int pollhz, error; 4446 4447 pollhz = sc->hn_pollhz; 4448 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4449 if (error || req->newptr == NULL) 4450 return (error); 4451 4452 if (pollhz != 0 && 4453 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4454 return (EINVAL); 4455 4456 HN_LOCK(sc); 4457 if (sc->hn_pollhz != pollhz) { 4458 sc->hn_pollhz = pollhz; 4459 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4460 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4461 hn_polling(sc, sc->hn_pollhz); 4462 } 4463 HN_UNLOCK(sc); 4464 4465 return (0); 4466 } 4467 4468 static int 4469 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4470 { 4471 struct hn_softc *sc = arg1; 4472 char verstr[16]; 4473 4474 snprintf(verstr, sizeof(verstr), "%u.%u", 4475 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4476 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4477 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4478 } 4479 4480 static int 4481 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4482 { 4483 struct hn_softc *sc = arg1; 4484 char caps_str[128]; 4485 uint32_t caps; 4486 4487 HN_LOCK(sc); 4488 caps = sc->hn_caps; 4489 HN_UNLOCK(sc); 4490 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4491 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4492 } 4493 4494 static int 4495 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4496 { 4497 struct hn_softc *sc = arg1; 4498 char assist_str[128]; 4499 uint32_t hwassist; 4500 4501 HN_LOCK(sc); 4502 hwassist = sc->hn_ifp->if_hwassist; 4503 HN_UNLOCK(sc); 4504 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4505 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4506 } 4507 4508 static int 4509 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4510 { 4511 struct hn_softc *sc = arg1; 4512 char filter_str[128]; 4513 uint32_t filter; 4514 4515 HN_LOCK(sc); 4516 filter = sc->hn_rx_filter; 4517 HN_UNLOCK(sc); 4518 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4519 NDIS_PACKET_TYPES); 4520 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4521 } 4522 4523 static int 4524 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4525 { 4526 struct hn_softc *sc = arg1; 4527 uint32_t mtu; 4528 int error; 4529 HN_LOCK(sc); 4530 error = hn_rndis_get_mtu(sc, &mtu); 4531 if (error) { 4532 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4533 goto back; 4534 } 4535 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4536 if (error || req->newptr == NULL) 4537 goto back; 4538 4539 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4540 if (error) 4541 goto back; 4542 error = hn_rndis_reconf_offload(sc, mtu); 4543 back: 4544 HN_UNLOCK(sc); 4545 return (error); 4546 } 4547 #ifndef RSS 4548 4549 static int 4550 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4551 { 4552 struct hn_softc *sc = arg1; 4553 int error; 4554 4555 HN_LOCK(sc); 4556 4557 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4558 if (error || req->newptr == NULL) 4559 
goto back; 4560 4561 if ((sc->hn_flags & HN_FLAG_RXVF) || 4562 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4563 /* 4564 * RSS key is synchronized w/ VF's, don't allow users 4565 * to change it. 4566 */ 4567 error = EBUSY; 4568 goto back; 4569 } 4570 4571 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4572 if (error) 4573 goto back; 4574 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4575 4576 if (sc->hn_rx_ring_inuse > 1) { 4577 error = hn_rss_reconfig(sc); 4578 } else { 4579 /* Not RSS capable, at least for now; just save the RSS key. */ 4580 error = 0; 4581 } 4582 back: 4583 HN_UNLOCK(sc); 4584 return (error); 4585 } 4586 4587 static int 4588 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4589 { 4590 struct hn_softc *sc = arg1; 4591 int error; 4592 4593 HN_LOCK(sc); 4594 4595 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4596 if (error || req->newptr == NULL) 4597 goto back; 4598 4599 /* 4600 * Don't allow RSS indirect table change, if this interface is not 4601 * RSS capable currently. 4602 */ 4603 if (sc->hn_rx_ring_inuse == 1) { 4604 error = EOPNOTSUPP; 4605 goto back; 4606 } 4607 4608 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4609 if (error) 4610 goto back; 4611 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4612 4613 hn_rss_ind_fixup(sc); 4614 error = hn_rss_reconfig(sc); 4615 back: 4616 HN_UNLOCK(sc); 4617 return (error); 4618 } 4619 4620 #endif /* !RSS */ 4621 4622 static int 4623 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4624 { 4625 struct hn_softc *sc = arg1; 4626 char hash_str[128]; 4627 uint32_t hash; 4628 4629 HN_LOCK(sc); 4630 hash = sc->hn_rss_hash; 4631 HN_UNLOCK(sc); 4632 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4633 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4634 } 4635 4636 static int 4637 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4638 { 4639 struct hn_softc *sc = arg1; 4640 char hash_str[128]; 4641 uint32_t hash; 4642 4643 HN_LOCK(sc); 4644 hash = sc->hn_rss_hcap; 4645 HN_UNLOCK(sc); 4646 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4647 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4648 } 4649 4650 static int 4651 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4652 { 4653 struct hn_softc *sc = arg1; 4654 char hash_str[128]; 4655 uint32_t hash; 4656 4657 HN_LOCK(sc); 4658 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4659 HN_UNLOCK(sc); 4660 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4661 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4662 } 4663 4664 static int 4665 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4666 { 4667 struct hn_softc *sc = arg1; 4668 char vf_name[IFNAMSIZ + 1]; 4669 struct ifnet *vf_ifp; 4670 4671 HN_LOCK(sc); 4672 vf_name[0] = '\0'; 4673 vf_ifp = sc->hn_vf_ifp; 4674 if (vf_ifp != NULL) 4675 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4676 HN_UNLOCK(sc); 4677 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4678 } 4679 4680 static int 4681 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4682 { 4683 struct hn_softc *sc = arg1; 4684 char vf_name[IFNAMSIZ + 1]; 4685 struct ifnet *vf_ifp; 4686 4687 HN_LOCK(sc); 4688 vf_name[0] = '\0'; 4689 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4690 if (vf_ifp != NULL) 4691 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4692 HN_UNLOCK(sc); 4693 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4694 } 4695 4696 static int 4697 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4698 { 4699 struct rm_priotracker pt; 4700 struct 
sbuf *sb; 4701 int error, i; 4702 bool first; 4703 4704 error = sysctl_wire_old_buffer(req, 0); 4705 if (error != 0) 4706 return (error); 4707 4708 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4709 if (sb == NULL) 4710 return (ENOMEM); 4711 4712 rm_rlock(&hn_vfmap_lock, &pt); 4713 4714 first = true; 4715 for (i = 0; i < hn_vfmap_size; ++i) { 4716 struct epoch_tracker et; 4717 struct ifnet *ifp; 4718 4719 if (hn_vfmap[i] == NULL) 4720 continue; 4721 4722 NET_EPOCH_ENTER(et); 4723 ifp = ifnet_byindex(i); 4724 if (ifp != NULL) { 4725 if (first) 4726 sbuf_printf(sb, "%s", ifp->if_xname); 4727 else 4728 sbuf_printf(sb, " %s", ifp->if_xname); 4729 first = false; 4730 } 4731 NET_EPOCH_EXIT(et); 4732 } 4733 4734 rm_runlock(&hn_vfmap_lock, &pt); 4735 4736 error = sbuf_finish(sb); 4737 sbuf_delete(sb); 4738 return (error); 4739 } 4740 4741 static int 4742 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4743 { 4744 struct rm_priotracker pt; 4745 struct sbuf *sb; 4746 int error, i; 4747 bool first; 4748 4749 error = sysctl_wire_old_buffer(req, 0); 4750 if (error != 0) 4751 return (error); 4752 4753 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4754 if (sb == NULL) 4755 return (ENOMEM); 4756 4757 rm_rlock(&hn_vfmap_lock, &pt); 4758 4759 first = true; 4760 for (i = 0; i < hn_vfmap_size; ++i) { 4761 struct epoch_tracker et; 4762 struct ifnet *ifp, *hn_ifp; 4763 4764 hn_ifp = hn_vfmap[i]; 4765 if (hn_ifp == NULL) 4766 continue; 4767 4768 NET_EPOCH_ENTER(et); 4769 ifp = ifnet_byindex(i); 4770 if (ifp != NULL) { 4771 if (first) { 4772 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4773 hn_ifp->if_xname); 4774 } else { 4775 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4776 hn_ifp->if_xname); 4777 } 4778 first = false; 4779 } 4780 NET_EPOCH_EXIT(et); 4781 } 4782 4783 rm_runlock(&hn_vfmap_lock, &pt); 4784 4785 error = sbuf_finish(sb); 4786 sbuf_delete(sb); 4787 return (error); 4788 } 4789 4790 static int 4791 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4792 { 4793 struct hn_softc *sc = arg1; 4794 int error, onoff = 0; 4795 4796 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4797 onoff = 1; 4798 error = sysctl_handle_int(oidp, &onoff, 0, req); 4799 if (error || req->newptr == NULL) 4800 return (error); 4801 4802 HN_LOCK(sc); 4803 /* NOTE: hn_vf_lock for hn_transmit() */ 4804 rm_wlock(&sc->hn_vf_lock); 4805 if (onoff) 4806 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4807 else 4808 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4809 rm_wunlock(&sc->hn_vf_lock); 4810 HN_UNLOCK(sc); 4811 4812 return (0); 4813 } 4814 4815 static int 4816 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4817 { 4818 struct hn_softc *sc = arg1; 4819 int enabled = 0; 4820 4821 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4822 enabled = 1; 4823 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4824 } 4825 4826 static int 4827 hn_check_iplen(const struct mbuf *m, int hoff) 4828 { 4829 const struct ip *ip; 4830 int len, iphlen, iplen; 4831 const struct tcphdr *th; 4832 int thoff; /* TCP data offset */ 4833 4834 len = hoff + sizeof(struct ip); 4835 4836 /* The packet must be at least the size of an IP header. */ 4837 if (m->m_pkthdr.len < len) 4838 return IPPROTO_DONE; 4839 4840 /* The fixed IP header must reside completely in the first mbuf. */ 4841 if (m->m_len < len) 4842 return IPPROTO_DONE; 4843 4844 ip = mtodo(m, hoff); 4845 4846 /* Bound check the packet's stated IP header length. 
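	 * ip_hl counts 32-bit words, so shifting it left by 2 yields
	 * the header length in bytes.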
 */
	iphlen = ip->ip_hl << 2;
	if (iphlen < sizeof(struct ip))		/* minimum header length */
		return IPPROTO_DONE;

	/* The full IP header must reside completely in the first mbuf. */
	if (m->m_len < hoff + iphlen)
		return IPPROTO_DONE;

	iplen = ntohs(ip->ip_len);

	/*
	 * Check that the amount of data in the buffers is at least
	 * as much as the IP header would have us expect.
	 */
	if (m->m_pkthdr.len < hoff + iplen)
		return IPPROTO_DONE;

	/*
	 * Ignore IP fragments.
	 */
	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
		return IPPROTO_DONE;

	/*
	 * The TCP/IP or UDP/IP header must be entirely contained within
	 * the first fragment of a packet.
	 */
	switch (ip->ip_p) {
	case IPPROTO_TCP:
		if (iplen < iphlen + sizeof(struct tcphdr))
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
			return IPPROTO_DONE;
		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
		thoff = th->th_off << 2;
		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + thoff)
			return IPPROTO_DONE;
		break;
	case IPPROTO_UDP:
		if (iplen < iphlen + sizeof(struct udphdr))
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
			return IPPROTO_DONE;
		break;
	default:
		if (iplen < iphlen)
			return IPPROTO_DONE;
		break;
	}
	return ip->ip_p;
}

static void
hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
{
	const struct ether_header *eh;
	uint16_t etype;
	int hoff;

	hoff = sizeof(*eh);
	/* Checked at the beginning of this function. */
	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));

	eh = mtod(m_new, const struct ether_header *);
	etype = ntohs(eh->ether_type);
	if (etype == ETHERTYPE_VLAN) {
		const struct ether_vlan_header *evl;

		hoff = sizeof(*evl);
		if (m_new->m_len < hoff)
			return;
		evl = mtod(m_new, const struct ether_vlan_header *);
		etype = ntohs(evl->evl_proto);
	}
	*l3proto = etype;

	if (etype == ETHERTYPE_IP)
		*l4proto = hn_check_iplen(m_new, hoff);
	else
		*l4proto = IPPROTO_DONE;
}

static int
hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	device_t dev = sc->hn_dev;
#if defined(INET) || defined(INET6)
	int lroent_cnt;
#endif
	int i;

	/*
	 * Create RXBUF for reception.
	 *
	 * NOTE:
	 * - It is shared by all channels.
	 * - A large enough buffer is allocated; certain versions of the
	 *   NVS may further limit the usable space.
4949 */ 4950 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4951 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4952 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4953 if (sc->hn_rxbuf == NULL) { 4954 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4955 return (ENOMEM); 4956 } 4957 4958 sc->hn_rx_ring_cnt = ring_cnt; 4959 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4960 4961 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4962 M_DEVBUF, M_WAITOK | M_ZERO); 4963 4964 #if defined(INET) || defined(INET6) 4965 lroent_cnt = hn_lro_entry_count; 4966 if (lroent_cnt < TCP_LRO_ENTRIES) 4967 lroent_cnt = TCP_LRO_ENTRIES; 4968 if (bootverbose) 4969 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4970 #endif /* INET || INET6 */ 4971 4972 ctx = device_get_sysctl_ctx(dev); 4973 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4974 4975 /* Create dev.hn.UNIT.rx sysctl tree */ 4976 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4977 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4978 4979 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4980 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4981 4982 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4983 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4984 &rxr->hn_br_dma, BUS_DMA_WAITOK); 4985 if (rxr->hn_br == NULL) { 4986 device_printf(dev, "allocate bufring failed\n"); 4987 return (ENOMEM); 4988 } 4989 4990 if (hn_trust_hosttcp) 4991 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4992 if (hn_trust_hostudp) 4993 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4994 if (hn_trust_hostip) 4995 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4996 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4997 rxr->hn_ifp = sc->hn_ifp; 4998 if (i < sc->hn_tx_ring_cnt) 4999 rxr->hn_txr = &sc->hn_tx_ring[i]; 5000 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5001 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5002 rxr->hn_rx_idx = i; 5003 rxr->hn_rxbuf = sc->hn_rxbuf; 5004 5005 /* 5006 * Initialize LRO. 
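		 * Each RX ring gets its own LRO context, sized by the
		 * lroent_cnt/hn_lro_mbufq_depth values chosen above;
		 * the length and ACK aggregation limits start at the
		 * driver defaults and can be tuned later through the
		 * lro_length_lim and lro_ackcnt_lim sysctls.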
5007 */ 5008 #if defined(INET) || defined(INET6) 5009 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5010 hn_lro_mbufq_depth); 5011 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5012 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5013 #endif /* INET || INET6 */ 5014 5015 if (sc->hn_rx_sysctl_tree != NULL) { 5016 char name[16]; 5017 5018 /* 5019 * Create per RX ring sysctl tree: 5020 * dev.hn.UNIT.rx.RINGID 5021 */ 5022 snprintf(name, sizeof(name), "%d", i); 5023 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5024 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5025 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5026 5027 if (rxr->hn_rx_sysctl_tree != NULL) { 5028 SYSCTL_ADD_ULONG(ctx, 5029 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5030 OID_AUTO, "packets", 5031 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5032 "# of packets received"); 5033 SYSCTL_ADD_ULONG(ctx, 5034 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5035 OID_AUTO, "rss_pkts", 5036 CTLFLAG_RW | CTLFLAG_STATS, 5037 &rxr->hn_rss_pkts, 5038 "# of packets w/ RSS info received"); 5039 SYSCTL_ADD_ULONG(ctx, 5040 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5041 OID_AUTO, "rsc_pkts", 5042 CTLFLAG_RW | CTLFLAG_STATS, 5043 &rxr->hn_rsc_pkts, 5044 "# of RSC packets received"); 5045 SYSCTL_ADD_ULONG(ctx, 5046 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5047 OID_AUTO, "rsc_drop", 5048 CTLFLAG_RW | CTLFLAG_STATS, 5049 &rxr->hn_rsc_drop, 5050 "# of RSC fragments dropped"); 5051 SYSCTL_ADD_INT(ctx, 5052 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5053 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5054 &rxr->hn_pktbuf_len, 0, 5055 "Temporary channel packet buffer length"); 5056 } 5057 } 5058 } 5059 5060 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5061 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5062 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5063 hn_rx_stat_u64_sysctl, 5064 "LU", "LRO queued"); 5065 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5066 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5067 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5068 hn_rx_stat_u64_sysctl, 5069 "LU", "LRO flushed"); 5070 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5071 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5072 __offsetof(struct hn_rx_ring, hn_lro_tried), 5073 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5074 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5075 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5076 hn_lro_lenlim_sysctl, "IU", 5077 "Max # of data bytes to be aggregated by LRO"); 5078 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5079 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5080 hn_lro_ackcnt_sysctl, "I", 5081 "Max # of ACKs to be aggregated by LRO"); 5082 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5083 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5084 hn_trust_hcsum_sysctl, "I", 5085 "Trust tcp segment verification on host side, " 5086 "when csum info is missing"); 5087 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5088 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5089 hn_trust_hcsum_sysctl, "I", 5090 "Trust udp datagram verification on host side, " 5091 "when csum info is missing"); 5092 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5093 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5094 hn_trust_hcsum_sysctl, "I", 5095 "Trust ip packet verification on host side, " 5096 "when csum info is missing"); 5097 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5098 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5099 __offsetof(struct hn_rx_ring, hn_csum_ip), 5100 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5101 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5102 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5103 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5104 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5105 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5106 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5107 __offsetof(struct hn_rx_ring, hn_csum_udp), 5108 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5109 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5110 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5111 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5112 hn_rx_stat_ulong_sysctl, "LU", 5113 "# of packets that we trust host's csum verification"); 5114 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5115 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5116 __offsetof(struct hn_rx_ring, hn_small_pkts), 5117 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5118 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5119 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5120 __offsetof(struct hn_rx_ring, hn_ack_failed), 5121 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5122 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5123 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5124 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5125 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5126 5127 return (0); 5128 } 5129 5130 static void 5131 hn_destroy_rx_data(struct hn_softc *sc) 5132 { 5133 int i; 5134 5135 if (sc->hn_rxbuf != NULL) { 5136 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5137 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5138 else 5139 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5140 sc->hn_rxbuf = NULL; 5141 } 5142 5143 if (sc->hn_rx_ring_cnt == 0) 5144 return; 5145 5146 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5147 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5148 5149 if (rxr->hn_br == NULL) 5150 continue; 5151 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5152 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5153 } else { 5154 device_printf(sc->hn_dev, 5155 "%dth channel bufring is referenced", i); 5156 } 5157 rxr->hn_br = NULL; 5158 5159 #if defined(INET) || defined(INET6) 5160 tcp_lro_free(&rxr->hn_lro); 5161 #endif 5162 free(rxr->hn_pktbuf, M_DEVBUF); 5163 } 5164 free(sc->hn_rx_ring, M_DEVBUF); 5165 sc->hn_rx_ring = NULL; 5166 5167 sc->hn_rx_ring_cnt = 0; 5168 sc->hn_rx_ring_inuse = 0; 5169 } 5170 5171 static int 5172 hn_tx_ring_create(struct hn_softc *sc, int id) 5173 { 5174 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5175 device_t dev = sc->hn_dev; 5176 bus_dma_tag_t parent_dtag; 5177 int error, i; 5178 5179 txr->hn_sc = sc; 5180 txr->hn_tx_idx = id; 5181 5182 #ifndef HN_USE_TXDESC_BUFRING 5183 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5184 #endif 5185 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5186 5187 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5188 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5189 M_DEVBUF, M_WAITOK | M_ZERO); 5190 #ifndef HN_USE_TXDESC_BUFRING 5191 SLIST_INIT(&txr->hn_txlist); 5192 #else 5193 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5194 M_WAITOK, &txr->hn_tx_lock); 5195 #endif 5196 5197 if (hn_tx_taskq_mode == 
HN_TX_TASKQ_M_EVTTQ) { 5198 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5199 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5200 } else { 5201 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5202 } 5203 5204 #ifdef HN_IFSTART_SUPPORT 5205 if (hn_use_if_start) { 5206 txr->hn_txeof = hn_start_txeof; 5207 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5208 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5209 } else 5210 #endif 5211 { 5212 int br_depth; 5213 5214 txr->hn_txeof = hn_xmit_txeof; 5215 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5216 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5217 5218 br_depth = hn_get_txswq_depth(txr); 5219 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5220 M_WAITOK, &txr->hn_tx_lock); 5221 } 5222 5223 txr->hn_direct_tx_size = hn_direct_tx_size; 5224 5225 /* 5226 * Always schedule transmission instead of trying to do direct 5227 * transmission. This one gives the best performance so far. 5228 */ 5229 txr->hn_sched_tx = 1; 5230 5231 parent_dtag = bus_get_dma_tag(dev); 5232 5233 /* DMA tag for RNDIS packet messages. */ 5234 error = bus_dma_tag_create(parent_dtag, /* parent */ 5235 HN_RNDIS_PKT_ALIGN, /* alignment */ 5236 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5237 BUS_SPACE_MAXADDR, /* lowaddr */ 5238 BUS_SPACE_MAXADDR, /* highaddr */ 5239 NULL, NULL, /* filter, filterarg */ 5240 HN_RNDIS_PKT_LEN, /* maxsize */ 5241 1, /* nsegments */ 5242 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5243 0, /* flags */ 5244 NULL, /* lockfunc */ 5245 NULL, /* lockfuncarg */ 5246 &txr->hn_tx_rndis_dtag); 5247 if (error) { 5248 device_printf(dev, "failed to create rndis dmatag\n"); 5249 return error; 5250 } 5251 5252 /* DMA tag for data. */ 5253 error = bus_dma_tag_create(parent_dtag, /* parent */ 5254 1, /* alignment */ 5255 HN_TX_DATA_BOUNDARY, /* boundary */ 5256 BUS_SPACE_MAXADDR, /* lowaddr */ 5257 BUS_SPACE_MAXADDR, /* highaddr */ 5258 NULL, NULL, /* filter, filterarg */ 5259 HN_TX_DATA_MAXSIZE, /* maxsize */ 5260 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5261 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5262 0, /* flags */ 5263 NULL, /* lockfunc */ 5264 NULL, /* lockfuncarg */ 5265 &txr->hn_tx_data_dtag); 5266 if (error) { 5267 device_printf(dev, "failed to create data dmatag\n"); 5268 return error; 5269 } 5270 5271 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5272 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5273 5274 txd->txr = txr; 5275 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5276 STAILQ_INIT(&txd->agg_list); 5277 5278 /* 5279 * Allocate and load RNDIS packet message. 5280 */ 5281 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5282 (void **)&txd->rndis_pkt, 5283 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5284 &txd->rndis_pkt_dmap); 5285 if (error) { 5286 device_printf(dev, 5287 "failed to allocate rndis_packet_msg, %d\n", i); 5288 return error; 5289 } 5290 5291 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5292 txd->rndis_pkt_dmap, 5293 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5294 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5295 BUS_DMA_NOWAIT); 5296 if (error) { 5297 device_printf(dev, 5298 "failed to load rndis_packet_msg, %d\n", i); 5299 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5300 txd->rndis_pkt, txd->rndis_pkt_dmap); 5301 return error; 5302 } 5303 5304 /* DMA map for TX data. 
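		 * The map comes from hn_tx_data_dtag created above, so
		 * a single mbuf chain may map to at most
		 * HN_TX_DATA_SEGCNT_MAX segments of HN_TX_DATA_SEGSIZE
		 * bytes each.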
*/ 5305 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5306 &txd->data_dmap); 5307 if (error) { 5308 device_printf(dev, 5309 "failed to allocate tx data dmamap\n"); 5310 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5311 txd->rndis_pkt_dmap); 5312 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5313 txd->rndis_pkt, txd->rndis_pkt_dmap); 5314 return error; 5315 } 5316 5317 /* All set, put it to list */ 5318 txd->flags |= HN_TXD_FLAG_ONLIST; 5319 #ifndef HN_USE_TXDESC_BUFRING 5320 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5321 #else 5322 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5323 #endif 5324 } 5325 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5326 5327 if (sc->hn_tx_sysctl_tree != NULL) { 5328 struct sysctl_oid_list *child; 5329 struct sysctl_ctx_list *ctx; 5330 char name[16]; 5331 5332 /* 5333 * Create per TX ring sysctl tree: 5334 * dev.hn.UNIT.tx.RINGID 5335 */ 5336 ctx = device_get_sysctl_ctx(dev); 5337 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5338 5339 snprintf(name, sizeof(name), "%d", id); 5340 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5341 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5342 5343 if (txr->hn_tx_sysctl_tree != NULL) { 5344 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5345 5346 #ifdef HN_DEBUG 5347 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5348 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5349 "# of available TX descs"); 5350 #endif 5351 #ifdef HN_IFSTART_SUPPORT 5352 if (!hn_use_if_start) 5353 #endif 5354 { 5355 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5356 CTLFLAG_RD, &txr->hn_oactive, 0, 5357 "over active"); 5358 } 5359 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5360 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5361 "# of packets transmitted"); 5362 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5363 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5364 "# of sends"); 5365 } 5366 } 5367 5368 return 0; 5369 } 5370 5371 static void 5372 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5373 { 5374 struct hn_tx_ring *txr = txd->txr; 5375 5376 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5377 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5378 5379 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5380 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5381 txd->rndis_pkt_dmap); 5382 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5383 } 5384 5385 static void 5386 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5387 { 5388 5389 KASSERT(txd->refs == 0 || txd->refs == 1, 5390 ("invalid txd refs %d", txd->refs)); 5391 5392 /* Aggregated txds will be freed by their aggregating txd. */ 5393 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5394 int freed __diagused; 5395 5396 freed = hn_txdesc_put(txr, txd); 5397 KASSERT(freed, ("can't free txdesc")); 5398 } 5399 } 5400 5401 static void 5402 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5403 { 5404 int i; 5405 5406 if (txr->hn_txdesc == NULL) 5407 return; 5408 5409 /* 5410 * NOTE: 5411 * Because the freeing of aggregated txds will be deferred 5412 * to the aggregating txd, two passes are used here: 5413 * - The first pass GCes any pending txds. This GC is necessary, 5414 * since if the channels are revoked, hypervisor will not 5415 * deliver send-done for all pending txds. 5416 * - The second pass frees the busdma stuffs, i.e. after all txds 5417 * were freed. 
5418 */ 5419 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5420 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5421 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5422 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5423 5424 if (txr->hn_tx_data_dtag != NULL) 5425 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5426 if (txr->hn_tx_rndis_dtag != NULL) 5427 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5428 5429 #ifdef HN_USE_TXDESC_BUFRING 5430 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5431 #endif 5432 5433 free(txr->hn_txdesc, M_DEVBUF); 5434 txr->hn_txdesc = NULL; 5435 5436 if (txr->hn_mbuf_br != NULL) 5437 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5438 5439 #ifndef HN_USE_TXDESC_BUFRING 5440 mtx_destroy(&txr->hn_txlist_spin); 5441 #endif 5442 mtx_destroy(&txr->hn_tx_lock); 5443 } 5444 5445 static int 5446 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5447 { 5448 struct sysctl_oid_list *child; 5449 struct sysctl_ctx_list *ctx; 5450 int i; 5451 5452 /* 5453 * Create TXBUF for chimney sending. 5454 * 5455 * NOTE: It is shared by all channels. 5456 */ 5457 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5458 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5459 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5460 if (sc->hn_chim == NULL) { 5461 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5462 return (ENOMEM); 5463 } 5464 5465 sc->hn_tx_ring_cnt = ring_cnt; 5466 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5467 5468 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5469 M_DEVBUF, M_WAITOK | M_ZERO); 5470 5471 ctx = device_get_sysctl_ctx(sc->hn_dev); 5472 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5473 5474 /* Create dev.hn.UNIT.tx sysctl tree */ 5475 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5476 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5477 5478 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5479 int error; 5480 5481 error = hn_tx_ring_create(sc, i); 5482 if (error) 5483 return error; 5484 } 5485 5486 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5487 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5488 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5489 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5490 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5491 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5492 __offsetof(struct hn_tx_ring, hn_send_failed), 5493 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5494 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5495 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5496 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5497 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5498 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5499 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5500 __offsetof(struct hn_tx_ring, hn_flush_failed), 5501 hn_tx_stat_ulong_sysctl, "LU", 5502 "# of packet transmission aggregation flush failure"); 5503 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5504 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5505 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5506 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5507 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5508 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5509 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5510 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5511 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5512 CTLTYPE_ULONG 
| CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5513 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5514 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5515 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5516 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5517 "# of total TX descs"); 5518 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5519 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5520 "Chimney send packet size upper boundary"); 5521 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5522 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5523 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5524 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5525 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5526 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5527 hn_tx_conf_int_sysctl, "I", 5528 "Size of the packet for direct transmission"); 5529 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5530 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5531 __offsetof(struct hn_tx_ring, hn_sched_tx), 5532 hn_tx_conf_int_sysctl, "I", 5533 "Always schedule transmission " 5534 "instead of doing direct transmission"); 5535 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5536 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5537 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5538 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5539 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5540 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5541 "Applied packet transmission aggregation size"); 5542 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5543 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5544 hn_txagg_pktmax_sysctl, "I", 5545 "Applied packet transmission aggregation packets"); 5546 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5547 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5548 hn_txagg_align_sysctl, "I", 5549 "Applied packet transmission aggregation alignment"); 5550 5551 return 0; 5552 } 5553 5554 static void 5555 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5556 { 5557 int i; 5558 5559 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5560 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5561 } 5562 5563 static void 5564 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5565 { 5566 struct ifnet *ifp = sc->hn_ifp; 5567 u_int hw_tsomax; 5568 int tso_minlen; 5569 5570 HN_LOCK_ASSERT(sc); 5571 5572 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5573 return; 5574 5575 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5576 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5577 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5578 5579 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5580 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5581 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5582 5583 if (tso_maxlen < tso_minlen) 5584 tso_maxlen = tso_minlen; 5585 else if (tso_maxlen > IP_MAXPACKET) 5586 tso_maxlen = IP_MAXPACKET; 5587 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5588 tso_maxlen = sc->hn_ndis_tso_szmax; 5589 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5590 5591 if (hn_xpnt_vf_isready(sc)) { 5592 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5593 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5594 } 5595 ifp->if_hw_tsomax = hw_tsomax; 5596 if (bootverbose) 5597 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5598 } 5599 5600 static void 5601 hn_fixup_tx_data(struct hn_softc *sc) 5602 { 5603 uint64_t csum_assist; 5604 int i; 5605 5606 hn_set_chim_size(sc, sc->hn_chim_szmax); 5607 
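	/*
	 * Start from the host-provided maximum; the hn_tx_chimney_size
	 * tunable below may only shrink it, never enlarge it.
	 */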
if (hn_tx_chimney_size > 0 && 5608 hn_tx_chimney_size < sc->hn_chim_szmax) 5609 hn_set_chim_size(sc, hn_tx_chimney_size); 5610 5611 csum_assist = 0; 5612 if (sc->hn_caps & HN_CAP_IPCS) 5613 csum_assist |= CSUM_IP; 5614 if (sc->hn_caps & HN_CAP_TCP4CS) 5615 csum_assist |= CSUM_IP_TCP; 5616 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5617 csum_assist |= CSUM_IP_UDP; 5618 if (sc->hn_caps & HN_CAP_TCP6CS) 5619 csum_assist |= CSUM_IP6_TCP; 5620 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5621 csum_assist |= CSUM_IP6_UDP; 5622 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5623 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5624 5625 if (sc->hn_caps & HN_CAP_HASHVAL) { 5626 /* 5627 * Support HASHVAL pktinfo on TX path. 5628 */ 5629 if (bootverbose) 5630 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5631 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5632 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5633 } 5634 } 5635 5636 static void 5637 hn_fixup_rx_data(struct hn_softc *sc) 5638 { 5639 5640 if (sc->hn_caps & HN_CAP_UDPHASH) { 5641 int i; 5642 5643 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5644 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5645 } 5646 } 5647 5648 static void 5649 hn_destroy_tx_data(struct hn_softc *sc) 5650 { 5651 int i; 5652 5653 if (sc->hn_chim != NULL) { 5654 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5655 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5656 } else { 5657 device_printf(sc->hn_dev, 5658 "chimney sending buffer is referenced"); 5659 } 5660 sc->hn_chim = NULL; 5661 } 5662 5663 if (sc->hn_tx_ring_cnt == 0) 5664 return; 5665 5666 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5667 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5668 5669 free(sc->hn_tx_ring, M_DEVBUF); 5670 sc->hn_tx_ring = NULL; 5671 5672 sc->hn_tx_ring_cnt = 0; 5673 sc->hn_tx_ring_inuse = 0; 5674 } 5675 5676 #ifdef HN_IFSTART_SUPPORT 5677 5678 static void 5679 hn_start_taskfunc(void *xtxr, int pending __unused) 5680 { 5681 struct hn_tx_ring *txr = xtxr; 5682 5683 mtx_lock(&txr->hn_tx_lock); 5684 hn_start_locked(txr, 0); 5685 mtx_unlock(&txr->hn_tx_lock); 5686 } 5687 5688 static int 5689 hn_start_locked(struct hn_tx_ring *txr, int len) 5690 { 5691 struct hn_softc *sc = txr->hn_sc; 5692 struct ifnet *ifp = sc->hn_ifp; 5693 int sched = 0; 5694 5695 KASSERT(hn_use_if_start, 5696 ("hn_start_locked is called, when if_start is disabled")); 5697 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5698 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5699 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5700 5701 if (__predict_false(txr->hn_suspended)) 5702 return (0); 5703 5704 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5705 IFF_DRV_RUNNING) 5706 return (0); 5707 5708 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5709 struct hn_txdesc *txd; 5710 struct mbuf *m_head; 5711 int error; 5712 5713 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5714 if (m_head == NULL) 5715 break; 5716 5717 if (len > 0 && m_head->m_pkthdr.len > len) { 5718 /* 5719 * This sending could be time consuming; let callers 5720 * dispatch this packet sending (and sending of any 5721 * following up packets) to tx taskqueue. 
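			 * "len" is the direct-transmission threshold
			 * the caller passed in; 0 disables the check
			 * (see hn_start_taskfunc()).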
5722 */ 5723 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5724 sched = 1; 5725 break; 5726 } 5727 5728 #if defined(INET6) || defined(INET) 5729 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5730 m_head = hn_tso_fixup(m_head); 5731 if (__predict_false(m_head == NULL)) { 5732 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5733 continue; 5734 } 5735 } else if (m_head->m_pkthdr.csum_flags & 5736 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5737 m_head = hn_set_hlen(m_head); 5738 if (__predict_false(m_head == NULL)) { 5739 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5740 continue; 5741 } 5742 } 5743 #endif 5744 5745 txd = hn_txdesc_get(txr); 5746 if (txd == NULL) { 5747 txr->hn_no_txdescs++; 5748 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5749 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5750 break; 5751 } 5752 5753 error = hn_encap(ifp, txr, txd, &m_head); 5754 if (error) { 5755 /* Both txd and m_head are freed */ 5756 KASSERT(txr->hn_agg_txd == NULL, 5757 ("encap failed w/ pending aggregating txdesc")); 5758 continue; 5759 } 5760 5761 if (txr->hn_agg_pktleft == 0) { 5762 if (txr->hn_agg_txd != NULL) { 5763 KASSERT(m_head == NULL, 5764 ("pending mbuf for aggregating txdesc")); 5765 error = hn_flush_txagg(ifp, txr); 5766 if (__predict_false(error)) { 5767 atomic_set_int(&ifp->if_drv_flags, 5768 IFF_DRV_OACTIVE); 5769 break; 5770 } 5771 } else { 5772 KASSERT(m_head != NULL, ("mbuf was freed")); 5773 error = hn_txpkt(ifp, txr, txd); 5774 if (__predict_false(error)) { 5775 /* txd is freed, but m_head is not */ 5776 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5777 atomic_set_int(&ifp->if_drv_flags, 5778 IFF_DRV_OACTIVE); 5779 break; 5780 } 5781 } 5782 } 5783 #ifdef INVARIANTS 5784 else { 5785 KASSERT(txr->hn_agg_txd != NULL, 5786 ("no aggregating txdesc")); 5787 KASSERT(m_head == NULL, 5788 ("pending mbuf for aggregating txdesc")); 5789 } 5790 #endif 5791 } 5792 5793 /* Flush pending aggerated transmission. */ 5794 if (txr->hn_agg_txd != NULL) 5795 hn_flush_txagg(ifp, txr); 5796 return (sched); 5797 } 5798 5799 static void 5800 hn_start(struct ifnet *ifp) 5801 { 5802 struct hn_softc *sc = ifp->if_softc; 5803 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5804 5805 if (txr->hn_sched_tx) 5806 goto do_sched; 5807 5808 if (mtx_trylock(&txr->hn_tx_lock)) { 5809 int sched; 5810 5811 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5812 mtx_unlock(&txr->hn_tx_lock); 5813 if (!sched) 5814 return; 5815 } 5816 do_sched: 5817 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5818 } 5819 5820 static void 5821 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5822 { 5823 struct hn_tx_ring *txr = xtxr; 5824 5825 mtx_lock(&txr->hn_tx_lock); 5826 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5827 hn_start_locked(txr, 0); 5828 mtx_unlock(&txr->hn_tx_lock); 5829 } 5830 5831 static void 5832 hn_start_txeof(struct hn_tx_ring *txr) 5833 { 5834 struct hn_softc *sc = txr->hn_sc; 5835 struct ifnet *ifp = sc->hn_ifp; 5836 5837 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5838 5839 if (txr->hn_sched_tx) 5840 goto do_sched; 5841 5842 if (mtx_trylock(&txr->hn_tx_lock)) { 5843 int sched; 5844 5845 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5846 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5847 mtx_unlock(&txr->hn_tx_lock); 5848 if (sched) { 5849 taskqueue_enqueue(txr->hn_tx_taskq, 5850 &txr->hn_tx_task); 5851 } 5852 } else { 5853 do_sched: 5854 /* 5855 * Release the OACTIVE earlier, with the hope, that 5856 * others could catch up. 
The task will clear the 5857 * flag again with the hn_tx_lock to avoid possible 5858 * races. 5859 */ 5860 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5861 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5862 } 5863 } 5864 5865 #endif /* HN_IFSTART_SUPPORT */ 5866 5867 static int 5868 hn_xmit(struct hn_tx_ring *txr, int len) 5869 { 5870 struct hn_softc *sc = txr->hn_sc; 5871 struct ifnet *ifp = sc->hn_ifp; 5872 struct mbuf *m_head; 5873 int sched = 0; 5874 5875 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5876 #ifdef HN_IFSTART_SUPPORT 5877 KASSERT(hn_use_if_start == 0, 5878 ("hn_xmit is called, when if_start is enabled")); 5879 #endif 5880 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5881 5882 if (__predict_false(txr->hn_suspended)) 5883 return (0); 5884 5885 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5886 return (0); 5887 5888 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5889 struct hn_txdesc *txd; 5890 int error; 5891 5892 if (len > 0 && m_head->m_pkthdr.len > len) { 5893 /* 5894 * This sending could be time consuming; let callers 5895 * dispatch this packet sending (and sending of any 5896 * following up packets) to tx taskqueue. 5897 */ 5898 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5899 sched = 1; 5900 break; 5901 } 5902 5903 txd = hn_txdesc_get(txr); 5904 if (txd == NULL) { 5905 txr->hn_no_txdescs++; 5906 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5907 txr->hn_oactive = 1; 5908 break; 5909 } 5910 5911 error = hn_encap(ifp, txr, txd, &m_head); 5912 if (error) { 5913 /* Both txd and m_head are freed; discard */ 5914 KASSERT(txr->hn_agg_txd == NULL, 5915 ("encap failed w/ pending aggregating txdesc")); 5916 drbr_advance(ifp, txr->hn_mbuf_br); 5917 continue; 5918 } 5919 5920 if (txr->hn_agg_pktleft == 0) { 5921 if (txr->hn_agg_txd != NULL) { 5922 KASSERT(m_head == NULL, 5923 ("pending mbuf for aggregating txdesc")); 5924 error = hn_flush_txagg(ifp, txr); 5925 if (__predict_false(error)) { 5926 txr->hn_oactive = 1; 5927 break; 5928 } 5929 } else { 5930 KASSERT(m_head != NULL, ("mbuf was freed")); 5931 error = hn_txpkt(ifp, txr, txd); 5932 if (__predict_false(error)) { 5933 /* txd is freed, but m_head is not */ 5934 drbr_putback(ifp, txr->hn_mbuf_br, 5935 m_head); 5936 txr->hn_oactive = 1; 5937 break; 5938 } 5939 } 5940 } 5941 #ifdef INVARIANTS 5942 else { 5943 KASSERT(txr->hn_agg_txd != NULL, 5944 ("no aggregating txdesc")); 5945 KASSERT(m_head == NULL, 5946 ("pending mbuf for aggregating txdesc")); 5947 } 5948 #endif 5949 5950 /* Sent */ 5951 drbr_advance(ifp, txr->hn_mbuf_br); 5952 } 5953 5954 /* Flush pending aggerated transmission. */ 5955 if (txr->hn_agg_txd != NULL) 5956 hn_flush_txagg(ifp, txr); 5957 return (sched); 5958 } 5959 5960 static int 5961 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5962 { 5963 struct hn_softc *sc = ifp->if_softc; 5964 struct hn_tx_ring *txr; 5965 int error, idx = 0; 5966 5967 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5968 struct rm_priotracker pt; 5969 5970 rm_rlock(&sc->hn_vf_lock, &pt); 5971 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5972 struct mbuf *m_bpf = NULL; 5973 int obytes, omcast; 5974 5975 obytes = m->m_pkthdr.len; 5976 omcast = (m->m_flags & M_MCAST) != 0; 5977 5978 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5979 if (bpf_peers_present(ifp->if_bpf)) { 5980 m_bpf = m_copypacket(m, M_NOWAIT); 5981 if (m_bpf == NULL) { 5982 /* 5983 * Failed to grab a shallow 5984 * copy; tap now. 
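						 * Otherwise the copy is
						 * tapped below only after
						 * the VF transmit succeeds.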
5985 */ 5986 ETHER_BPF_MTAP(ifp, m); 5987 } 5988 } 5989 } else { 5990 ETHER_BPF_MTAP(ifp, m); 5991 } 5992 5993 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 5994 rm_runlock(&sc->hn_vf_lock, &pt); 5995 5996 if (m_bpf != NULL) { 5997 if (!error) 5998 ETHER_BPF_MTAP(ifp, m_bpf); 5999 m_freem(m_bpf); 6000 } 6001 6002 if (error == ENOBUFS) { 6003 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6004 } else if (error) { 6005 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6006 } else { 6007 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6008 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6009 if (omcast) { 6010 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6011 omcast); 6012 } 6013 } 6014 return (error); 6015 } 6016 rm_runlock(&sc->hn_vf_lock, &pt); 6017 } 6018 6019 #if defined(INET6) || defined(INET) 6020 /* 6021 * Perform TSO packet header fixup or get l2/l3 header length now, 6022 * since packet headers should be cache-hot. 6023 */ 6024 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6025 m = hn_tso_fixup(m); 6026 if (__predict_false(m == NULL)) { 6027 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6028 return EIO; 6029 } 6030 } else if (m->m_pkthdr.csum_flags & 6031 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6032 m = hn_set_hlen(m); 6033 if (__predict_false(m == NULL)) { 6034 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6035 return EIO; 6036 } 6037 } 6038 #endif 6039 6040 /* 6041 * Select the TX ring based on flowid 6042 */ 6043 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6044 #ifdef RSS 6045 uint32_t bid; 6046 6047 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6048 &bid) == 0) 6049 idx = bid % sc->hn_tx_ring_inuse; 6050 else 6051 #endif 6052 { 6053 #if defined(INET6) || defined(INET) 6054 int tcpsyn = 0; 6055 6056 if (m->m_pkthdr.len < 128 && 6057 (m->m_pkthdr.csum_flags & 6058 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6059 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6060 m = hn_check_tcpsyn(m, &tcpsyn); 6061 if (__predict_false(m == NULL)) { 6062 if_inc_counter(ifp, 6063 IFCOUNTER_OERRORS, 1); 6064 return (EIO); 6065 } 6066 } 6067 #else 6068 const int tcpsyn = 0; 6069 #endif 6070 if (tcpsyn) 6071 idx = 0; 6072 else 6073 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6074 } 6075 } 6076 txr = &sc->hn_tx_ring[idx]; 6077 6078 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6079 if (error) { 6080 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6081 return error; 6082 } 6083 6084 if (txr->hn_oactive) 6085 return 0; 6086 6087 if (txr->hn_sched_tx) 6088 goto do_sched; 6089 6090 if (mtx_trylock(&txr->hn_tx_lock)) { 6091 int sched; 6092 6093 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6094 mtx_unlock(&txr->hn_tx_lock); 6095 if (!sched) 6096 return 0; 6097 } 6098 do_sched: 6099 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6100 return 0; 6101 } 6102 6103 static void 6104 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6105 { 6106 struct mbuf *m; 6107 6108 mtx_lock(&txr->hn_tx_lock); 6109 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6110 m_freem(m); 6111 mtx_unlock(&txr->hn_tx_lock); 6112 } 6113 6114 static void 6115 hn_xmit_qflush(struct ifnet *ifp) 6116 { 6117 struct hn_softc *sc = ifp->if_softc; 6118 struct rm_priotracker pt; 6119 int i; 6120 6121 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6122 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6123 if_qflush(ifp); 6124 6125 rm_rlock(&sc->hn_vf_lock, &pt); 6126 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6127 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6128 rm_runlock(&sc->hn_vf_lock, &pt); 6129 } 6130 6131 static void 6132 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6133 { 6134 6135 if (txr->hn_sched_tx) 6136 goto do_sched; 6137 6138 if (mtx_trylock(&txr->hn_tx_lock)) { 6139 int sched; 6140 6141 txr->hn_oactive = 0; 6142 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6143 mtx_unlock(&txr->hn_tx_lock); 6144 if (sched) { 6145 taskqueue_enqueue(txr->hn_tx_taskq, 6146 &txr->hn_tx_task); 6147 } 6148 } else { 6149 do_sched: 6150 /* 6151 * Release the oactive earlier, with the hope, that 6152 * others could catch up. The task will clear the 6153 * oactive again with the hn_tx_lock to avoid possible 6154 * races. 6155 */ 6156 txr->hn_oactive = 0; 6157 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6158 } 6159 } 6160 6161 static void 6162 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6163 { 6164 struct hn_tx_ring *txr = xtxr; 6165 6166 mtx_lock(&txr->hn_tx_lock); 6167 hn_xmit(txr, 0); 6168 mtx_unlock(&txr->hn_tx_lock); 6169 } 6170 6171 static void 6172 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6173 { 6174 struct hn_tx_ring *txr = xtxr; 6175 6176 mtx_lock(&txr->hn_tx_lock); 6177 txr->hn_oactive = 0; 6178 hn_xmit(txr, 0); 6179 mtx_unlock(&txr->hn_tx_lock); 6180 } 6181 6182 static int 6183 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6184 { 6185 struct vmbus_chan_br cbr; 6186 struct hn_rx_ring *rxr; 6187 struct hn_tx_ring *txr = NULL; 6188 int idx, error; 6189 6190 idx = vmbus_chan_subidx(chan); 6191 6192 /* 6193 * Link this channel to RX/TX ring. 6194 */ 6195 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6196 ("invalid channel index %d, should > 0 && < %d", 6197 idx, sc->hn_rx_ring_inuse)); 6198 rxr = &sc->hn_rx_ring[idx]; 6199 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6200 ("RX ring %d already attached", idx)); 6201 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6202 rxr->hn_chan = chan; 6203 6204 if (bootverbose) { 6205 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6206 idx, vmbus_chan_id(chan)); 6207 } 6208 6209 if (idx < sc->hn_tx_ring_inuse) { 6210 txr = &sc->hn_tx_ring[idx]; 6211 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6212 ("TX ring %d already attached", idx)); 6213 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6214 6215 txr->hn_chan = chan; 6216 if (bootverbose) { 6217 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6218 idx, vmbus_chan_id(chan)); 6219 } 6220 } 6221 6222 /* Bind this channel to a proper CPU. */ 6223 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6224 6225 /* 6226 * Open this channel 6227 */ 6228 cbr.cbr = rxr->hn_br; 6229 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6230 cbr.cbr_txsz = HN_TXBR_SIZE; 6231 cbr.cbr_rxsz = HN_RXBR_SIZE; 6232 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6233 if (error) { 6234 if (error == EISCONN) { 6235 if_printf(sc->hn_ifp, "bufring is connected after " 6236 "chan%u open failure\n", vmbus_chan_id(chan)); 6237 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6238 } else { 6239 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6240 vmbus_chan_id(chan), error); 6241 } 6242 } 6243 return (error); 6244 } 6245 6246 static void 6247 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6248 { 6249 struct hn_rx_ring *rxr; 6250 int idx, error; 6251 6252 idx = vmbus_chan_subidx(chan); 6253 6254 /* 6255 * Link this channel to RX/TX ring. 
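	 * The channel sub-index selects the same RX/TX ring that the
	 * channel was linked to at attach time; the ATTACHED flags are
	 * cleared below.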
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n",
		    error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}

static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
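	 * If the query fails, this is treated as "no RSS": no sub-channels
	 * are requested and only the primary channel is used.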
6381 */ 6382 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6383 if (error) { 6384 /* No RSS; this is benign. */ 6385 *nsubch = 0; 6386 return (0); 6387 } 6388 if (bootverbose) { 6389 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6390 rxr_cnt, nchan); 6391 } 6392 6393 if (nchan > rxr_cnt) 6394 nchan = rxr_cnt; 6395 if (nchan == 1) { 6396 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6397 *nsubch = 0; 6398 return (0); 6399 } 6400 6401 /* 6402 * Allocate sub-channels from NVS. 6403 */ 6404 *nsubch = nchan - 1; 6405 error = hn_nvs_alloc_subchans(sc, nsubch); 6406 if (error || *nsubch == 0) { 6407 /* Failed to allocate sub-channels. */ 6408 *nsubch = 0; 6409 return (0); 6410 } 6411 6412 /* 6413 * Wait for all sub-channels to become ready before moving on. 6414 */ 6415 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6416 vmbus_subchan_rel(subchans, *nsubch); 6417 return (0); 6418 } 6419 6420 static bool 6421 hn_synth_attachable(const struct hn_softc *sc) 6422 { 6423 int i; 6424 6425 if (sc->hn_flags & HN_FLAG_ERRORS) 6426 return (false); 6427 6428 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6429 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6430 6431 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6432 return (false); 6433 } 6434 return (true); 6435 } 6436 6437 /* 6438 * Make sure that the RX filter is zero after the successful 6439 * RNDIS initialization. 6440 * 6441 * NOTE: 6442 * Under certain conditions on certain versions of Hyper-V, 6443 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6444 * after the successful RNDIS initialization, which breaks 6445 * the assumption of any following code (well, it breaks the 6446 * RNDIS API contract actually). Clear the RNDIS rxfilter 6447 * explicitly, drain packets sneaking through, and drain the 6448 * interrupt taskqueues scheduled due to the stealth packets. 6449 */ 6450 static void 6451 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6452 { 6453 6454 hn_disable_rx(sc); 6455 hn_drain_rxtx(sc, nchan); 6456 } 6457 6458 static int 6459 hn_synth_attach(struct hn_softc *sc, int mtu) 6460 { 6461 #define ATTACHED_NVS 0x0002 6462 #define ATTACHED_RNDIS 0x0004 6463 6464 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6465 int error, nsubch, nchan = 1, i, rndis_inited; 6466 uint32_t old_caps, attached = 0; 6467 6468 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6469 ("synthetic parts were attached")); 6470 6471 if (!hn_synth_attachable(sc)) 6472 return (ENXIO); 6473 6474 /* Save capabilities for later verification. */ 6475 old_caps = sc->hn_caps; 6476 sc->hn_caps = 0; 6477 6478 /* Clear RSS stuffs. */ 6479 sc->hn_rss_ind_size = 0; 6480 sc->hn_rss_hash = 0; 6481 sc->hn_rss_hcap = 0; 6482 6483 /* 6484 * Attach the primary channel _before_ attaching NVS and RNDIS. 6485 */ 6486 error = hn_chan_attach(sc, sc->hn_prichan); 6487 if (error) 6488 goto failed; 6489 6490 /* 6491 * Attach NVS. 6492 */ 6493 error = hn_nvs_attach(sc, mtu); 6494 if (error) 6495 goto failed; 6496 attached |= ATTACHED_NVS; 6497 6498 /* 6499 * Attach RNDIS _after_ NVS is attached. 6500 */ 6501 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6502 if (rndis_inited) 6503 attached |= ATTACHED_RNDIS; 6504 if (error) 6505 goto failed; 6506 6507 /* 6508 * Make sure capabilities are not changed. 
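	 * sc->hn_caps was cleared above and is re-populated while RNDIS
	 * attaches; on a re-attach (device already attached), a capability
	 * mismatch against the saved value aborts with ENXIO.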
6509 */ 6510 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6511 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6512 old_caps, sc->hn_caps); 6513 error = ENXIO; 6514 goto failed; 6515 } 6516 6517 /* 6518 * Allocate sub-channels for multi-TX/RX rings. 6519 * 6520 * NOTE: 6521 * The # of RX rings that can be used is equivalent to the # of 6522 * channels to be requested. 6523 */ 6524 nsubch = sc->hn_rx_ring_cnt - 1; 6525 error = hn_synth_alloc_subchans(sc, &nsubch); 6526 if (error) 6527 goto failed; 6528 /* NOTE: _Full_ synthetic parts detach is required now. */ 6529 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6530 6531 /* 6532 * Set the # of TX/RX rings that could be used according to 6533 * the # of channels that NVS offered. 6534 */ 6535 nchan = nsubch + 1; 6536 hn_set_ring_inuse(sc, nchan); 6537 if (nchan == 1) { 6538 /* Only the primary channel can be used; done */ 6539 goto back; 6540 } 6541 6542 /* 6543 * Attach the sub-channels. 6544 * 6545 * NOTE: hn_set_ring_inuse() _must_ have been called. 6546 */ 6547 error = hn_attach_subchans(sc); 6548 if (error) 6549 goto failed; 6550 6551 /* 6552 * Configure RSS key and indirect table _after_ all sub-channels 6553 * are attached. 6554 */ 6555 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6556 /* 6557 * RSS key is not set yet; set it to the default RSS key. 6558 */ 6559 if (bootverbose) 6560 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6561 #ifdef RSS 6562 rss_getkey(rss->rss_key); 6563 #else 6564 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6565 #endif 6566 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6567 } 6568 6569 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6570 /* 6571 * RSS indirect table is not set yet; set it up in round- 6572 * robin fashion. 6573 */ 6574 if (bootverbose) { 6575 if_printf(sc->hn_ifp, "setup default RSS indirect " 6576 "table\n"); 6577 } 6578 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6579 uint32_t subidx; 6580 6581 #ifdef RSS 6582 subidx = rss_get_indirection_to_bucket(i); 6583 #else 6584 subidx = i; 6585 #endif 6586 rss->rss_ind[i] = subidx % nchan; 6587 } 6588 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6589 } else { 6590 /* 6591 * # of usable channels may be changed, so we have to 6592 * make sure that all entries in RSS indirect table 6593 * are valid. 6594 * 6595 * NOTE: hn_set_ring_inuse() _must_ have been called. 6596 */ 6597 hn_rss_ind_fixup(sc); 6598 } 6599 6600 sc->hn_rss_hash = sc->hn_rss_hcap; 6601 if ((sc->hn_flags & HN_FLAG_RXVF) || 6602 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6603 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6604 hn_vf_rss_fixup(sc, false); 6605 } 6606 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6607 if (error) 6608 goto failed; 6609 back: 6610 /* 6611 * Fixup transmission aggregation setup. 6612 */ 6613 hn_set_txagg(sc); 6614 hn_rndis_init_fixat(sc, nchan); 6615 return (0); 6616 6617 failed: 6618 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6619 hn_rndis_init_fixat(sc, nchan); 6620 hn_synth_detach(sc); 6621 } else { 6622 if (attached & ATTACHED_RNDIS) { 6623 hn_rndis_init_fixat(sc, nchan); 6624 hn_rndis_detach(sc); 6625 } 6626 if (attached & ATTACHED_NVS) 6627 hn_nvs_detach(sc); 6628 hn_chan_detach(sc, sc->hn_prichan); 6629 /* Restore old capabilities. */ 6630 sc->hn_caps = old_caps; 6631 } 6632 return (error); 6633 6634 #undef ATTACHED_RNDIS 6635 #undef ATTACHED_NVS 6636 } 6637 6638 /* 6639 * NOTE: 6640 * The interface must have been suspended though hn_suspend(), before 6641 * this function get called. 
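 * It detaches RNDIS and NVS, tears down all channels, and, on hosts
 * negotiating VMBUS_VERSION_WIN10 or newer, also disconnects the RXBUF
 * and chimney sending buffer GPADLs from the primary channel.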
6642 */ 6643 static void 6644 hn_synth_detach(struct hn_softc *sc) 6645 { 6646 6647 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6648 ("synthetic parts were not attached")); 6649 6650 /* Detach the RNDIS first. */ 6651 hn_rndis_detach(sc); 6652 6653 /* Detach NVS. */ 6654 hn_nvs_detach(sc); 6655 6656 /* Detach all of the channels. */ 6657 hn_detach_allchans(sc); 6658 6659 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6660 /* 6661 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6662 */ 6663 int error; 6664 6665 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6666 sc->hn_rxbuf_gpadl); 6667 if (error) { 6668 if_printf(sc->hn_ifp, 6669 "rxbuf gpadl disconn failed: %d\n", error); 6670 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6671 } 6672 sc->hn_rxbuf_gpadl = 0; 6673 } 6674 6675 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6676 /* 6677 * Host is post-Win2016, disconnect chimney sending buffer from 6678 * primary channel here. 6679 */ 6680 int error; 6681 6682 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6683 sc->hn_chim_gpadl); 6684 if (error) { 6685 if_printf(sc->hn_ifp, 6686 "chim gpadl disconn failed: %d\n", error); 6687 sc->hn_flags |= HN_FLAG_CHIM_REF; 6688 } 6689 sc->hn_chim_gpadl = 0; 6690 } 6691 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6692 } 6693 6694 static void 6695 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6696 { 6697 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6698 ("invalid ring count %d", ring_cnt)); 6699 6700 if (sc->hn_tx_ring_cnt > ring_cnt) 6701 sc->hn_tx_ring_inuse = ring_cnt; 6702 else 6703 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6704 sc->hn_rx_ring_inuse = ring_cnt; 6705 6706 #ifdef RSS 6707 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6708 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6709 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6710 rss_getnumbuckets()); 6711 } 6712 #endif 6713 6714 if (bootverbose) { 6715 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6716 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6717 } 6718 } 6719 6720 static void 6721 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6722 { 6723 6724 /* 6725 * NOTE: 6726 * The TX bufring will not be drained by the hypervisor, 6727 * if the primary channel is revoked. 6728 */ 6729 while (!vmbus_chan_rx_empty(chan) || 6730 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6731 !vmbus_chan_tx_empty(chan))) 6732 pause("waitch", 1); 6733 vmbus_chan_intr_drain(chan); 6734 } 6735 6736 static void 6737 hn_disable_rx(struct hn_softc *sc) 6738 { 6739 6740 /* 6741 * Disable RX by clearing RX filter forcefully. 6742 */ 6743 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6744 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6745 6746 /* 6747 * Give RNDIS enough time to flush all pending data packets. 6748 */ 6749 pause("waitrx", (200 * hz) / 1000); 6750 } 6751 6752 /* 6753 * NOTE: 6754 * RX/TX _must_ have been suspended/disabled, before this function 6755 * is called. 6756 */ 6757 static void 6758 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6759 { 6760 struct vmbus_channel **subch = NULL; 6761 int nsubch; 6762 6763 /* 6764 * Drain RX/TX bufrings and interrupts. 
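	 * Sub-channels are drained before the primary channel;
	 * hn_chan_drain() waits for the bufrings to empty and then drains
	 * the channel's interrupts.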
6765 */ 6766 nsubch = nchan - 1; 6767 if (nsubch > 0) 6768 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6769 6770 if (subch != NULL) { 6771 int i; 6772 6773 for (i = 0; i < nsubch; ++i) 6774 hn_chan_drain(sc, subch[i]); 6775 } 6776 hn_chan_drain(sc, sc->hn_prichan); 6777 6778 if (subch != NULL) 6779 vmbus_subchan_rel(subch, nsubch); 6780 } 6781 6782 static void 6783 hn_suspend_data(struct hn_softc *sc) 6784 { 6785 struct hn_tx_ring *txr; 6786 int i; 6787 6788 HN_LOCK_ASSERT(sc); 6789 6790 /* 6791 * Suspend TX. 6792 */ 6793 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6794 txr = &sc->hn_tx_ring[i]; 6795 6796 mtx_lock(&txr->hn_tx_lock); 6797 txr->hn_suspended = 1; 6798 mtx_unlock(&txr->hn_tx_lock); 6799 /* No one is able send more packets now. */ 6800 6801 /* 6802 * Wait for all pending sends to finish. 6803 * 6804 * NOTE: 6805 * We will _not_ receive all pending send-done, if the 6806 * primary channel is revoked. 6807 */ 6808 while (hn_tx_ring_pending(txr) && 6809 !vmbus_chan_is_revoked(sc->hn_prichan)) 6810 pause("hnwtx", 1 /* 1 tick */); 6811 } 6812 6813 /* 6814 * Disable RX. 6815 */ 6816 hn_disable_rx(sc); 6817 6818 /* 6819 * Drain RX/TX. 6820 */ 6821 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6822 6823 /* 6824 * Drain any pending TX tasks. 6825 * 6826 * NOTE: 6827 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6828 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6829 */ 6830 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6831 txr = &sc->hn_tx_ring[i]; 6832 6833 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6834 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6835 } 6836 } 6837 6838 static void 6839 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6840 { 6841 6842 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6843 } 6844 6845 static void 6846 hn_suspend_mgmt(struct hn_softc *sc) 6847 { 6848 struct task task; 6849 6850 HN_LOCK_ASSERT(sc); 6851 6852 /* 6853 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6854 * through hn_mgmt_taskq. 6855 */ 6856 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6857 vmbus_chan_run_task(sc->hn_prichan, &task); 6858 6859 /* 6860 * Make sure that all pending management tasks are completed. 6861 */ 6862 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6863 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6864 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6865 } 6866 6867 static void 6868 hn_suspend(struct hn_softc *sc) 6869 { 6870 6871 /* Disable polling. */ 6872 hn_polling(sc, 0); 6873 6874 /* 6875 * If the non-transparent mode VF is activated, the synthetic 6876 * device is receiving packets, so the data path of the 6877 * synthetic device must be suspended. 6878 */ 6879 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6880 (sc->hn_flags & HN_FLAG_RXVF)) 6881 hn_suspend_data(sc); 6882 hn_suspend_mgmt(sc); 6883 } 6884 6885 static void 6886 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6887 { 6888 int i; 6889 6890 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6891 ("invalid TX ring count %d", tx_ring_cnt)); 6892 6893 for (i = 0; i < tx_ring_cnt; ++i) { 6894 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6895 6896 mtx_lock(&txr->hn_tx_lock); 6897 txr->hn_suspended = 0; 6898 mtx_unlock(&txr->hn_tx_lock); 6899 } 6900 } 6901 6902 static void 6903 hn_resume_data(struct hn_softc *sc) 6904 { 6905 int i; 6906 6907 HN_LOCK_ASSERT(sc); 6908 6909 /* 6910 * Re-enable RX. 
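	 * hn_rxfilter_config() reprograms the RNDIS RX filter that
	 * hn_disable_rx() cleared (NDIS_PACKET_TYPE_NONE) during suspend.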
6911 */ 6912 hn_rxfilter_config(sc); 6913 6914 /* 6915 * Make sure to clear suspend status on "all" TX rings, 6916 * since hn_tx_ring_inuse can be changed after 6917 * hn_suspend_data(). 6918 */ 6919 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6920 6921 #ifdef HN_IFSTART_SUPPORT 6922 if (!hn_use_if_start) 6923 #endif 6924 { 6925 /* 6926 * Flush unused drbrs, since hn_tx_ring_inuse may be 6927 * reduced. 6928 */ 6929 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6930 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6931 } 6932 6933 /* 6934 * Kick start TX. 6935 */ 6936 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6937 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6938 6939 /* 6940 * Use txeof task, so that any pending oactive can be 6941 * cleared properly. 6942 */ 6943 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6944 } 6945 } 6946 6947 static void 6948 hn_resume_mgmt(struct hn_softc *sc) 6949 { 6950 6951 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6952 6953 /* 6954 * Kick off network change detection, if it was pending. 6955 * If no network change was pending, start link status 6956 * checks, which is more lightweight than network change 6957 * detection. 6958 */ 6959 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6960 hn_change_network(sc); 6961 else 6962 hn_update_link_status(sc); 6963 } 6964 6965 static void 6966 hn_resume(struct hn_softc *sc) 6967 { 6968 6969 /* 6970 * If the non-transparent mode VF is activated, the synthetic 6971 * device have to receive packets, so the data path of the 6972 * synthetic device must be resumed. 6973 */ 6974 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6975 (sc->hn_flags & HN_FLAG_RXVF)) 6976 hn_resume_data(sc); 6977 6978 /* 6979 * Don't resume link status change if VF is attached/activated. 6980 * - In the non-transparent VF mode, the synthetic device marks 6981 * link down until the VF is deactivated; i.e. VF is down. 6982 * - In transparent VF mode, VF's media status is used until 6983 * the VF is detached. 6984 */ 6985 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 6986 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 6987 hn_resume_mgmt(sc); 6988 6989 /* 6990 * Re-enable polling if this interface is running and 6991 * the polling is requested. 6992 */ 6993 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 6994 hn_polling(sc, sc->hn_pollhz); 6995 } 6996 6997 static void 6998 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 6999 { 7000 const struct rndis_status_msg *msg; 7001 int ofs; 7002 7003 if (dlen < sizeof(*msg)) { 7004 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7005 return; 7006 } 7007 msg = data; 7008 7009 switch (msg->rm_status) { 7010 case RNDIS_STATUS_MEDIA_CONNECT: 7011 case RNDIS_STATUS_MEDIA_DISCONNECT: 7012 hn_update_link_status(sc); 7013 break; 7014 7015 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7016 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7017 /* Not really useful; ignore. 
*/ 7018 break; 7019 7020 case RNDIS_STATUS_NETWORK_CHANGE: 7021 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7022 if (dlen < ofs + msg->rm_stbuflen || 7023 msg->rm_stbuflen < sizeof(uint32_t)) { 7024 if_printf(sc->hn_ifp, "network changed\n"); 7025 } else { 7026 uint32_t change; 7027 7028 memcpy(&change, ((const uint8_t *)msg) + ofs, 7029 sizeof(change)); 7030 if_printf(sc->hn_ifp, "network changed, change %u\n", 7031 change); 7032 } 7033 hn_change_network(sc); 7034 break; 7035 7036 default: 7037 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7038 msg->rm_status); 7039 break; 7040 } 7041 } 7042 7043 static int 7044 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7045 { 7046 const struct rndis_pktinfo *pi = info_data; 7047 uint32_t mask = 0; 7048 7049 while (info_dlen != 0) { 7050 const void *data; 7051 uint32_t dlen; 7052 7053 if (__predict_false(info_dlen < sizeof(*pi))) 7054 return (EINVAL); 7055 if (__predict_false(info_dlen < pi->rm_size)) 7056 return (EINVAL); 7057 info_dlen -= pi->rm_size; 7058 7059 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7060 return (EINVAL); 7061 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7062 return (EINVAL); 7063 dlen = pi->rm_size - pi->rm_pktinfooffset; 7064 data = pi->rm_data; 7065 7066 if (pi->rm_internal == 1) { 7067 switch (pi->rm_type) { 7068 case NDIS_PKTINFO_IT_PKTINFO_ID: 7069 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7070 return (EINVAL); 7071 info->pktinfo_id = 7072 (const struct packet_info_id *)data; 7073 mask |= HN_RXINFO_PKTINFO_ID; 7074 break; 7075 7076 default: 7077 goto next; 7078 } 7079 } else { 7080 switch (pi->rm_type) { 7081 case NDIS_PKTINFO_TYPE_VLAN: 7082 if (__predict_false(dlen 7083 < NDIS_VLAN_INFO_SIZE)) 7084 return (EINVAL); 7085 info->vlan_info = (const uint32_t *)data; 7086 mask |= HN_RXINFO_VLAN; 7087 break; 7088 7089 case NDIS_PKTINFO_TYPE_CSUM: 7090 if (__predict_false(dlen 7091 < NDIS_RXCSUM_INFO_SIZE)) 7092 return (EINVAL); 7093 info->csum_info = (const uint32_t *)data; 7094 mask |= HN_RXINFO_CSUM; 7095 break; 7096 7097 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7098 if (__predict_false(dlen 7099 < HN_NDIS_HASH_VALUE_SIZE)) 7100 return (EINVAL); 7101 info->hash_value = (const uint32_t *)data; 7102 mask |= HN_RXINFO_HASHVAL; 7103 break; 7104 7105 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7106 if (__predict_false(dlen 7107 < HN_NDIS_HASH_INFO_SIZE)) 7108 return (EINVAL); 7109 info->hash_info = (const uint32_t *)data; 7110 mask |= HN_RXINFO_HASHINF; 7111 break; 7112 7113 default: 7114 goto next; 7115 } 7116 } 7117 7118 if (mask == HN_RXINFO_ALL) { 7119 /* All found; done */ 7120 break; 7121 } 7122 next: 7123 pi = (const struct rndis_pktinfo *) 7124 ((const uint8_t *)pi + pi->rm_size); 7125 } 7126 7127 /* 7128 * Final fixup. 7129 * - If there is no hash value, invalidate the hash info. 
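	 * A hash type record without an accompanying hash value is of no
	 * use to the RX path, so hash_info is dropped when
	 * HN_RXINFO_HASHVAL was not found.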
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = NULL;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

static __inline void
hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
    uint32_t len, struct hn_rxinfo *info)
{
	uint32_t cnt = rxr->rsc.cnt;

	if (cnt) {
		rxr->rsc.pktlen += len;
	} else {
		rxr->rsc.vlan_info = info->vlan_info;
		rxr->rsc.csum_info = info->csum_info;
		rxr->rsc.hash_info = info->hash_info;
		rxr->rsc.hash_value = info->hash_value;
		rxr->rsc.pktlen = len;
	}

	rxr->rsc.frag_data[cnt] = data;
	rxr->rsc.frag_len[cnt] = len;
	rxr->rsc.cnt++;
}

static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;
	bool rsc_more = false;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
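	 * OOB data is not expected from Hyper-V; if it shows up, it is only
	 * checked for overflow and for overlap with the data and pktinfo
	 * regions, and is otherwise ignored.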
7241 */ 7242 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7243 int oob_off, oob_len; 7244 7245 if_printf(rxr->hn_ifp, "got oobdata\n"); 7246 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7247 oob_len = pkt->rm_oobdatalen; 7248 7249 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7250 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7251 "oob overflow, msglen %u, oob abs %d len %d\n", 7252 pkt->rm_len, oob_off, oob_len); 7253 return; 7254 } 7255 7256 /* 7257 * Check against data. 7258 */ 7259 if (hn_rndis_check_overlap(oob_off, oob_len, 7260 data_off, data_len)) { 7261 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7262 "oob overlaps data, oob abs %d len %d, " 7263 "data abs %d len %d\n", 7264 oob_off, oob_len, data_off, data_len); 7265 return; 7266 } 7267 7268 /* 7269 * Check against pktinfo. 7270 */ 7271 if (pktinfo_len != 0 && 7272 hn_rndis_check_overlap(oob_off, oob_len, 7273 pktinfo_off, pktinfo_len)) { 7274 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7275 "oob overlaps pktinfo, oob abs %d len %d, " 7276 "pktinfo abs %d len %d\n", 7277 oob_off, oob_len, pktinfo_off, pktinfo_len); 7278 return; 7279 } 7280 } 7281 7282 /* 7283 * Check per-packet-info coverage and find useful per-packet-info. 7284 */ 7285 info.vlan_info = NULL; 7286 info.csum_info = NULL; 7287 info.hash_info = NULL; 7288 info.pktinfo_id = NULL; 7289 7290 if (__predict_true(pktinfo_len != 0)) { 7291 bool overlap; 7292 int error; 7293 7294 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7295 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7296 "pktinfo overflow, msglen %u, " 7297 "pktinfo abs %d len %d\n", 7298 pkt->rm_len, pktinfo_off, pktinfo_len); 7299 return; 7300 } 7301 7302 /* 7303 * Check packet info coverage. 7304 */ 7305 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7306 data_off, data_len); 7307 if (__predict_false(overlap)) { 7308 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7309 "pktinfo overlap data, pktinfo abs %d len %d, " 7310 "data abs %d len %d\n", 7311 pktinfo_off, pktinfo_len, data_off, data_len); 7312 return; 7313 } 7314 7315 /* 7316 * Find useful per-packet-info. 
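	 * hn_rndis_rxinfo() walks the per-packet-info records and extracts
	 * the VLAN, checksum, hash and packet-id pointers consumed below.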
7317 */ 7318 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7319 pktinfo_len, &info); 7320 if (__predict_false(error)) { 7321 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7322 "pktinfo\n"); 7323 return; 7324 } 7325 } 7326 7327 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7328 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7329 "data overflow, msglen %u, data abs %d len %d\n", 7330 pkt->rm_len, data_off, data_len); 7331 return; 7332 } 7333 7334 /* Identify RSC fragments, drop invalid packets */ 7335 if ((info.pktinfo_id != NULL) && 7336 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7337 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7338 rxr->rsc.cnt = 0; 7339 rxr->hn_rsc_pkts++; 7340 } else if (rxr->rsc.cnt == 0) 7341 goto drop; 7342 7343 rsc_more = true; 7344 7345 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7346 rsc_more = false; 7347 7348 if (rsc_more && rxr->rsc.is_last) 7349 goto drop; 7350 } else { 7351 rxr->rsc.cnt = 0; 7352 } 7353 7354 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7355 goto drop; 7356 7357 /* Store data in per rx ring structure */ 7358 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7359 data_len, &info); 7360 7361 if (rsc_more) 7362 return; 7363 7364 hn_rxpkt(rxr); 7365 rxr->rsc.cnt = 0; 7366 return; 7367 drop: 7368 rxr->hn_rsc_drop++; 7369 return; 7370 } 7371 7372 static __inline void 7373 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7374 { 7375 const struct rndis_msghdr *hdr; 7376 7377 if (__predict_false(dlen < sizeof(*hdr))) { 7378 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7379 return; 7380 } 7381 hdr = data; 7382 7383 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7384 /* Hot data path. */ 7385 hn_rndis_rx_data(rxr, data, dlen); 7386 /* Done! */ 7387 return; 7388 } 7389 7390 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7391 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7392 else 7393 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7394 } 7395 7396 static void 7397 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7398 { 7399 const struct hn_nvs_hdr *hdr; 7400 7401 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7402 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7403 return; 7404 } 7405 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7406 7407 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7408 /* Useless; ignore */ 7409 return; 7410 } 7411 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7412 } 7413 7414 static void 7415 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7416 const struct vmbus_chanpkt_hdr *pkt) 7417 { 7418 struct hn_nvs_sendctx *sndc; 7419 7420 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7421 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7422 VMBUS_CHANPKT_DATALEN(pkt)); 7423 /* 7424 * NOTE: 7425 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7426 * its callback. 
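	 * (The sendctx was recovered from the channel packet's transaction
	 * id, cph_xactid, above.)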
7427 */ 7428 } 7429 7430 static void 7431 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7432 const struct vmbus_chanpkt_hdr *pkthdr) 7433 { 7434 struct epoch_tracker et; 7435 const struct vmbus_chanpkt_rxbuf *pkt; 7436 const struct hn_nvs_hdr *nvs_hdr; 7437 int count, i, hlen; 7438 7439 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7440 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7441 return; 7442 } 7443 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7444 7445 /* Make sure that this is a RNDIS message. */ 7446 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7447 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7448 nvs_hdr->nvs_type); 7449 return; 7450 } 7451 7452 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7453 if (__predict_false(hlen < sizeof(*pkt))) { 7454 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7455 return; 7456 } 7457 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7458 7459 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7460 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7461 pkt->cp_rxbuf_id); 7462 return; 7463 } 7464 7465 count = pkt->cp_rxbuf_cnt; 7466 if (__predict_false(hlen < 7467 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7468 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7469 return; 7470 } 7471 7472 NET_EPOCH_ENTER(et); 7473 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7474 for (i = 0; i < count; ++i) { 7475 int ofs, len; 7476 7477 ofs = pkt->cp_rxbuf[i].rb_ofs; 7478 len = pkt->cp_rxbuf[i].rb_len; 7479 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7480 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7481 "ofs %d, len %d\n", i, ofs, len); 7482 continue; 7483 } 7484 7485 rxr->rsc.is_last = (i == (count - 1)); 7486 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7487 } 7488 NET_EPOCH_EXIT(et); 7489 7490 /* 7491 * Ack the consumed RXBUF associated w/ this channel packet, 7492 * so that this RXBUF can be recycled by the hypervisor. 7493 */ 7494 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7495 } 7496 7497 static void 7498 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7499 uint64_t tid) 7500 { 7501 struct hn_nvs_rndis_ack ack; 7502 int retries, error; 7503 7504 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7505 ack.nvs_status = HN_NVS_STATUS_OK; 7506 7507 retries = 0; 7508 again: 7509 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7510 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7511 if (__predict_false(error == EAGAIN)) { 7512 /* 7513 * NOTE: 7514 * This should _not_ happen in real world, since the 7515 * consumption of the TX bufring from the TX path is 7516 * controlled. 7517 */ 7518 if (rxr->hn_ack_failed == 0) 7519 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7520 rxr->hn_ack_failed++; 7521 retries++; 7522 if (retries < 10) { 7523 DELAY(100); 7524 goto again; 7525 } 7526 /* RXBUF leaks! */ 7527 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7528 } 7529 } 7530 7531 static void 7532 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7533 { 7534 struct hn_rx_ring *rxr = xrxr; 7535 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7536 7537 for (;;) { 7538 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7539 int error, pktlen; 7540 7541 pktlen = rxr->hn_pktbuf_len; 7542 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7543 if (__predict_false(error == ENOBUFS)) { 7544 void *nbuf; 7545 int nlen; 7546 7547 /* 7548 * Expand channel packet buffer. 
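			 * The buffer is at least doubled, and doubled again
			 * until the pending packet fits; the old buffer is
			 * freed and the receive is retried.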
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}

static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used, "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routing "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
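	 * Unknown modes fall back to HN_TX_TASKQ_M_INDEP below.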
7638 */ 7639 switch (hn_tx_taskq_mode) { 7640 case HN_TX_TASKQ_M_INDEP: 7641 case HN_TX_TASKQ_M_GLOBAL: 7642 case HN_TX_TASKQ_M_EVTTQ: 7643 break; 7644 default: 7645 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7646 break; 7647 } 7648 7649 if (vm_guest != VM_GUEST_HV) 7650 return; 7651 7652 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7653 return; 7654 7655 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7656 M_DEVBUF, M_WAITOK); 7657 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7658 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7659 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7660 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7661 "hn tx%d", i); 7662 } 7663 } 7664 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7665 7666 static void 7667 hn_sysuninit(void *arg __unused) 7668 { 7669 7670 if (hn_tx_taskque != NULL) { 7671 int i; 7672 7673 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7674 taskqueue_free(hn_tx_taskque[i]); 7675 free(hn_tx_taskque, M_DEVBUF); 7676 } 7677 7678 if (hn_vfmap != NULL) 7679 free(hn_vfmap, M_DEVBUF); 7680 rm_destroy(&hn_vfmap_lock); 7681 7682 counter_u64_free(hn_udpcs_fixup); 7683 } 7684 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7685