1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 87 #include <machine/atomic.h> 88 #include <machine/in_cksum.h> 89 90 #include <net/bpf.h> 91 #include <net/ethernet.h> 92 #include <net/if.h> 93 #include <net/if_dl.h> 94 #include <net/if_media.h> 95 #include <net/if_types.h> 96 #include <net/if_var.h> 97 #include <net/rndis.h> 98 #ifdef RSS 99 #include <net/rss_config.h> 100 #endif 101 102 #include <netinet/in_systm.h> 103 #include <netinet/in.h> 104 #include <netinet/ip.h> 105 #include <netinet/ip6.h> 106 #include <netinet/tcp.h> 107 #include <netinet/tcp_lro.h> 108 #include <netinet/udp.h> 109 110 #include <dev/hyperv/include/hyperv.h> 111 #include <dev/hyperv/include/hyperv_busdma.h> 112 #include <dev/hyperv/include/vmbus.h> 113 #include <dev/hyperv/include/vmbus_xact.h> 114 115 #include <dev/hyperv/netvsc/ndis.h> 116 #include <dev/hyperv/netvsc/if_hnreg.h> 117 #include <dev/hyperv/netvsc/if_hnvar.h> 118 #include <dev/hyperv/netvsc/hn_nvs.h> 119 #include <dev/hyperv/netvsc/hn_rndis.h> 120 121 #include "vmbus_if.h" 122 123 #define HN_IFSTART_SUPPORT 124 125 #define HN_RING_CNT_DEF_MAX 8 126 127 #define HN_VFMAP_SIZE_DEF 8 128 129 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 130 131 /* YYY should get it from the underlying channel */ 132 #define HN_TX_DESC_CNT 512 133 134 #define HN_RNDIS_PKT_LEN \ 135 (sizeof(struct rndis_packet_msg) + \ 136 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 137 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 140 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 141 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 142 143 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 144 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 145 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 146 /* -1 for RNDIS packet message */ 147 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 148 149 #define HN_DIRECT_TX_SIZE_DEF 128 150 151 #define HN_EARLY_TXEOF_THRESH 8 152 153 #define HN_PKTBUF_LEN_DEF (16 * 1024) 154 155 #define HN_LROENT_CNT_DEF 128 156 157 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 158 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 159 /* YYY 2*MTU is a bit rough, but should be good enough. 
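 * For example, with the default 1500 byte Ethernet MTU the macro below
 * evaluates to 3000 bytes, i.e. the per-RX-ring LRO aggregate length limit
 * is never clamped below two full-sized frames.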
*/ 160 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 161 162 #define HN_LRO_ACKCNT_DEF 1 163 164 #define HN_LOCK_INIT(sc) \ 165 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 166 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 167 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 168 #define HN_LOCK(sc) \ 169 do { \ 170 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 171 /* Relinquish cpu to avoid deadlock */ \ 172 sched_relinquish(curthread); \ 173 DELAY(1000); \ 174 } \ 175 } while (0) 176 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 177 178 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 179 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 180 #define HN_CSUM_IP_HWASSIST(sc) \ 181 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 182 #define HN_CSUM_IP6_HWASSIST(sc) \ 183 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 184 185 #define HN_PKTSIZE_MIN(align) \ 186 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 187 HN_RNDIS_PKT_LEN, (align)) 188 #define HN_PKTSIZE(m, align) \ 189 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 190 191 #ifdef RSS 192 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 193 #else 194 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 195 #endif 196 197 struct hn_txdesc { 198 #ifndef HN_USE_TXDESC_BUFRING 199 SLIST_ENTRY(hn_txdesc) link; 200 #endif 201 STAILQ_ENTRY(hn_txdesc) agg_link; 202 203 /* Aggregated txdescs, in sending order. */ 204 STAILQ_HEAD(, hn_txdesc) agg_list; 205 206 /* The oldest packet, if transmission aggregation happens. */ 207 struct mbuf *m; 208 struct hn_tx_ring *txr; 209 int refs; 210 uint32_t flags; /* HN_TXD_FLAG_ */ 211 struct hn_nvs_sendctx send_ctx; 212 uint32_t chim_index; 213 int chim_size; 214 215 bus_dmamap_t data_dmap; 216 217 bus_addr_t rndis_pkt_paddr; 218 struct rndis_packet_msg *rndis_pkt; 219 bus_dmamap_t rndis_pkt_dmap; 220 }; 221 222 #define HN_TXD_FLAG_ONLIST 0x0001 223 #define HN_TXD_FLAG_DMAMAP 0x0002 224 #define HN_TXD_FLAG_ONAGG 0x0004 225 226 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 227 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 228 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 229 230 struct packet_info_id { 231 uint8_t ver; 232 uint8_t flag; 233 uint16_t pkt_id; 234 }; 235 236 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 237 238 239 struct hn_rxinfo { 240 const uint32_t *vlan_info; 241 const uint32_t *csum_info; 242 const uint32_t *hash_info; 243 const uint32_t *hash_value; 244 const struct packet_info_id *pktinfo_id; 245 }; 246 247 struct hn_rxvf_setarg { 248 struct hn_rx_ring *rxr; 249 struct ifnet *vf_ifp; 250 }; 251 252 #define HN_RXINFO_VLAN 0x0001 253 #define HN_RXINFO_CSUM 0x0002 254 #define HN_RXINFO_HASHINF 0x0004 255 #define HN_RXINFO_HASHVAL 0x0008 256 #define HN_RXINFO_PKTINFO_ID 0x0010 257 #define HN_RXINFO_ALL \ 258 (HN_RXINFO_VLAN | \ 259 HN_RXINFO_CSUM | \ 260 HN_RXINFO_HASHINF | \ 261 HN_RXINFO_HASHVAL | \ 262 HN_RXINFO_PKTINFO_ID) 263 264 static int hn_probe(device_t); 265 static int hn_attach(device_t); 266 static int hn_detach(device_t); 267 static int hn_shutdown(device_t); 268 static void hn_chan_callback(struct vmbus_channel *, 269 void *); 270 271 static void hn_init(void *); 272 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 273 #ifdef HN_IFSTART_SUPPORT 274 static void hn_start(struct ifnet *); 275 #endif 276 static int hn_transmit(struct ifnet *, struct mbuf *); 277 static void hn_xmit_qflush(struct ifnet *); 278 static int hn_ifmedia_upd(struct 
ifnet *); 279 static void hn_ifmedia_sts(struct ifnet *, 280 struct ifmediareq *); 281 282 static void hn_ifnet_event(void *, struct ifnet *, int); 283 static void hn_ifaddr_event(void *, struct ifnet *); 284 static void hn_ifnet_attevent(void *, struct ifnet *); 285 static void hn_ifnet_detevent(void *, struct ifnet *); 286 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 287 288 static bool hn_ismyvf(const struct hn_softc *, 289 const struct ifnet *); 290 static void hn_rxvf_change(struct hn_softc *, 291 struct ifnet *, bool); 292 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 293 static void hn_rxvf_set_task(void *, int); 294 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 295 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 296 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 297 struct ifreq *); 298 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 299 static bool hn_xpnt_vf_isready(struct hn_softc *); 300 static void hn_xpnt_vf_setready(struct hn_softc *); 301 static void hn_xpnt_vf_init_taskfunc(void *, int); 302 static void hn_xpnt_vf_init(struct hn_softc *); 303 static void hn_xpnt_vf_setenable(struct hn_softc *); 304 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 305 static void hn_vf_rss_fixup(struct hn_softc *, bool); 306 static void hn_vf_rss_restore(struct hn_softc *); 307 308 static int hn_rndis_rxinfo(const void *, int, 309 struct hn_rxinfo *); 310 static void hn_rndis_rx_data(struct hn_rx_ring *, 311 const void *, int); 312 static void hn_rndis_rx_status(struct hn_softc *, 313 const void *, int); 314 static void hn_rndis_init_fixat(struct hn_softc *, int); 315 316 static void hn_nvs_handle_notify(struct hn_softc *, 317 const struct vmbus_chanpkt_hdr *); 318 static void hn_nvs_handle_comp(struct hn_softc *, 319 struct vmbus_channel *, 320 const struct vmbus_chanpkt_hdr *); 321 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 322 struct vmbus_channel *, 323 const struct vmbus_chanpkt_hdr *); 324 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 325 struct vmbus_channel *, uint64_t); 326 327 #if __FreeBSD_version >= 1100099 328 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 329 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 330 #endif 331 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 333 #if __FreeBSD_version < 1100095 334 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 335 #else 336 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 337 #endif 338 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 345 #ifndef RSS 346 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 348 #endif 349 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 356 static int 
hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 363 364 static void hn_stop(struct hn_softc *, bool); 365 static void hn_init_locked(struct hn_softc *); 366 static int hn_chan_attach(struct hn_softc *, 367 struct vmbus_channel *); 368 static void hn_chan_detach(struct hn_softc *, 369 struct vmbus_channel *); 370 static int hn_attach_subchans(struct hn_softc *); 371 static void hn_detach_allchans(struct hn_softc *); 372 static void hn_chan_rollup(struct hn_rx_ring *, 373 struct hn_tx_ring *); 374 static void hn_set_ring_inuse(struct hn_softc *, int); 375 static int hn_synth_attach(struct hn_softc *, int); 376 static void hn_synth_detach(struct hn_softc *); 377 static int hn_synth_alloc_subchans(struct hn_softc *, 378 int *); 379 static bool hn_synth_attachable(const struct hn_softc *); 380 static void hn_suspend(struct hn_softc *); 381 static void hn_suspend_data(struct hn_softc *); 382 static void hn_suspend_mgmt(struct hn_softc *); 383 static void hn_resume(struct hn_softc *); 384 static void hn_resume_data(struct hn_softc *); 385 static void hn_resume_mgmt(struct hn_softc *); 386 static void hn_suspend_mgmt_taskfunc(void *, int); 387 static void hn_chan_drain(struct hn_softc *, 388 struct vmbus_channel *); 389 static void hn_disable_rx(struct hn_softc *); 390 static void hn_drain_rxtx(struct hn_softc *, int); 391 static void hn_polling(struct hn_softc *, u_int); 392 static void hn_chan_polling(struct vmbus_channel *, u_int); 393 static void hn_mtu_change_fixup(struct hn_softc *); 394 395 static void hn_update_link_status(struct hn_softc *); 396 static void hn_change_network(struct hn_softc *); 397 static void hn_link_taskfunc(void *, int); 398 static void hn_netchg_init_taskfunc(void *, int); 399 static void hn_netchg_status_taskfunc(void *, int); 400 static void hn_link_status(struct hn_softc *); 401 402 static int hn_create_rx_data(struct hn_softc *, int); 403 static void hn_destroy_rx_data(struct hn_softc *); 404 static int hn_check_iplen(const struct mbuf *, int); 405 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 406 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 407 static int hn_rxfilter_config(struct hn_softc *); 408 static int hn_rss_reconfig(struct hn_softc *); 409 static void hn_rss_ind_fixup(struct hn_softc *); 410 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 411 static int hn_rxpkt(struct hn_rx_ring *); 412 static uint32_t hn_rss_type_fromndis(uint32_t); 413 static uint32_t hn_rss_type_tondis(uint32_t); 414 415 static int hn_tx_ring_create(struct hn_softc *, int); 416 static void hn_tx_ring_destroy(struct hn_tx_ring *); 417 static int hn_create_tx_data(struct hn_softc *, int); 418 static void hn_fixup_tx_data(struct hn_softc *); 419 static void hn_fixup_rx_data(struct hn_softc *); 420 static void hn_destroy_tx_data(struct hn_softc *); 421 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 422 static void hn_txdesc_gc(struct hn_tx_ring *, 423 struct hn_txdesc *); 424 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 425 struct hn_txdesc *, struct mbuf **); 426 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 427 struct hn_txdesc *); 428 static void hn_set_chim_size(struct hn_softc *, int); 429 
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixups");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
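 *
 * Rough sketch of the test applied in hn_set_hlen() below (paraphrased,
 * not the literal code):
 *
 *	if ((m->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
 *	    m->m_pkthdr.len > hn_udpcs_fixup_mtu + l2hlen &&
 *	    (ntohs(ip->ip_off) & IP_DF) == 0)
 *		checksum the UDP datagram in software and clear CSUM_IP_UDP;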
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn,
    OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for the transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for the transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque; /* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt =
sc->hn_chim_bmap_cnt; 709 u_long *bmap = sc->hn_chim_bmap; 710 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 711 712 for (i = 0; i < bmap_cnt; ++i) { 713 int idx; 714 715 idx = ffsl(~bmap[i]); 716 if (idx == 0) 717 continue; 718 719 --idx; /* ffsl is 1-based */ 720 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 721 ("invalid i %d and idx %d", i, idx)); 722 723 if (atomic_testandset_long(&bmap[i], idx)) 724 continue; 725 726 ret = i * LONG_BIT + idx; 727 break; 728 } 729 return (ret); 730 } 731 732 static __inline void 733 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 734 { 735 u_long mask; 736 uint32_t idx; 737 738 idx = chim_idx / LONG_BIT; 739 KASSERT(idx < sc->hn_chim_bmap_cnt, 740 ("invalid chimney index 0x%x", chim_idx)); 741 742 mask = 1UL << (chim_idx % LONG_BIT); 743 KASSERT(sc->hn_chim_bmap[idx] & mask, 744 ("index bitmap 0x%lx, chimney index %u, " 745 "bitmap idx %d, bitmask 0x%lx", 746 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 747 748 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 749 } 750 751 #if defined(INET6) || defined(INET) 752 753 #define PULLUP_HDR(m, len) \ 754 do { \ 755 if (__predict_false((m)->m_len < (len))) { \ 756 (m) = m_pullup((m), (len)); \ 757 if ((m) == NULL) \ 758 return (NULL); \ 759 } \ 760 } while (0) 761 762 /* 763 * NOTE: If this function failed, the m_head would be freed. 764 */ 765 static __inline struct mbuf * 766 hn_tso_fixup(struct mbuf *m_head) 767 { 768 struct ether_vlan_header *evl; 769 struct tcphdr *th; 770 int ehlen; 771 772 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 773 774 PULLUP_HDR(m_head, sizeof(*evl)); 775 evl = mtod(m_head, struct ether_vlan_header *); 776 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 777 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 778 else 779 ehlen = ETHER_HDR_LEN; 780 m_head->m_pkthdr.l2hlen = ehlen; 781 782 #ifdef INET 783 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 784 struct ip *ip; 785 int iphlen; 786 787 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 788 ip = mtodo(m_head, ehlen); 789 iphlen = ip->ip_hl << 2; 790 m_head->m_pkthdr.l3hlen = iphlen; 791 792 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 793 th = mtodo(m_head, ehlen + iphlen); 794 795 ip->ip_len = 0; 796 ip->ip_sum = 0; 797 th->th_sum = in_pseudo(ip->ip_src.s_addr, 798 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 799 } 800 #endif 801 #if defined(INET6) && defined(INET) 802 else 803 #endif 804 #ifdef INET6 805 { 806 struct ip6_hdr *ip6; 807 808 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 809 ip6 = mtodo(m_head, ehlen); 810 if (ip6->ip6_nxt != IPPROTO_TCP) { 811 m_freem(m_head); 812 return (NULL); 813 } 814 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 815 816 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 817 th = mtodo(m_head, ehlen + sizeof(*ip6)); 818 819 ip6->ip6_plen = 0; 820 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 821 } 822 #endif 823 return (m_head); 824 } 825 826 /* 827 * NOTE: If this function failed, the m_head would be freed. 
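 *
 * hn_set_hlen() records the Ethernet/VLAN header length (l2hlen) and the
 * IP header length (l3hlen) in the mbuf packet header, applies the Azure
 * UDP checksum fixup for IPv4 (see hn_udpcs_fixup_mtu above), and frees
 * IPv6 packets whose next header is neither TCP nor UDP.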
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
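	 *
	 * Otherwise the filter is composed from the interface state below:
	 * NDIS_PACKET_TYPE_DIRECTED is always set, IFF_BROADCAST adds
	 * NDIS_PACKET_TYPE_BROADCAST, and IFF_ALLMULTI or a non-empty
	 * multicast list adds NDIS_PACKET_TYPE_ALL_MULTICAST.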
946 */ 947 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 948 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 949 } else { 950 filter = NDIS_PACKET_TYPE_DIRECTED; 951 if (ifp->if_flags & IFF_BROADCAST) 952 filter |= NDIS_PACKET_TYPE_BROADCAST; 953 /* TODO: support multicast list */ 954 if ((ifp->if_flags & IFF_ALLMULTI) || 955 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 956 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 957 } 958 return (hn_set_rxfilter(sc, filter)); 959 } 960 961 static void 962 hn_set_txagg(struct hn_softc *sc) 963 { 964 uint32_t size, pkts; 965 int i; 966 967 /* 968 * Setup aggregation size. 969 */ 970 if (sc->hn_agg_size < 0) 971 size = UINT32_MAX; 972 else 973 size = sc->hn_agg_size; 974 975 if (sc->hn_rndis_agg_size < size) 976 size = sc->hn_rndis_agg_size; 977 978 /* NOTE: We only aggregate packets using chimney sending buffers. */ 979 if (size > (uint32_t)sc->hn_chim_szmax) 980 size = sc->hn_chim_szmax; 981 982 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 983 /* Disable */ 984 size = 0; 985 pkts = 0; 986 goto done; 987 } 988 989 /* NOTE: Type of the per TX ring setting is 'int'. */ 990 if (size > INT_MAX) 991 size = INT_MAX; 992 993 /* 994 * Setup aggregation packet count. 995 */ 996 if (sc->hn_agg_pkts < 0) 997 pkts = UINT32_MAX; 998 else 999 pkts = sc->hn_agg_pkts; 1000 1001 if (sc->hn_rndis_agg_pkts < pkts) 1002 pkts = sc->hn_rndis_agg_pkts; 1003 1004 if (pkts <= 1) { 1005 /* Disable */ 1006 size = 0; 1007 pkts = 0; 1008 goto done; 1009 } 1010 1011 /* NOTE: Type of the per TX ring setting is 'short'. */ 1012 if (pkts > SHRT_MAX) 1013 pkts = SHRT_MAX; 1014 1015 done: 1016 /* NOTE: Type of the per TX ring setting is 'short'. */ 1017 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1018 /* Disable */ 1019 size = 0; 1020 pkts = 0; 1021 } 1022 1023 if (bootverbose) { 1024 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1025 size, pkts, sc->hn_rndis_agg_align); 1026 } 1027 1028 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1029 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1030 1031 mtx_lock(&txr->hn_tx_lock); 1032 txr->hn_agg_szmax = size; 1033 txr->hn_agg_pktmax = pkts; 1034 txr->hn_agg_align = sc->hn_rndis_agg_align; 1035 mtx_unlock(&txr->hn_tx_lock); 1036 } 1037 } 1038 1039 static int 1040 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1041 { 1042 1043 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1044 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1045 return txr->hn_txdesc_cnt; 1046 return hn_tx_swq_depth; 1047 } 1048 1049 static int 1050 hn_rss_reconfig(struct hn_softc *sc) 1051 { 1052 int error; 1053 1054 HN_LOCK_ASSERT(sc); 1055 1056 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1057 return (ENXIO); 1058 1059 /* 1060 * Disable RSS first. 1061 * 1062 * NOTE: 1063 * Direct reconfiguration by setting the UNCHG flags does 1064 * _not_ work properly. 1065 */ 1066 if (bootverbose) 1067 if_printf(sc->hn_ifp, "disable RSS\n"); 1068 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1069 if (error) { 1070 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1071 return (error); 1072 } 1073 1074 /* 1075 * Reenable the RSS w/ the updated RSS key or indirect 1076 * table. 
1077 */ 1078 if (bootverbose) 1079 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1080 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1081 if (error) { 1082 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1083 return (error); 1084 } 1085 return (0); 1086 } 1087 1088 static void 1089 hn_rss_ind_fixup(struct hn_softc *sc) 1090 { 1091 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1092 int i, nchan; 1093 1094 nchan = sc->hn_rx_ring_inuse; 1095 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1096 1097 /* 1098 * Check indirect table to make sure that all channels in it 1099 * can be used. 1100 */ 1101 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1102 if (rss->rss_ind[i] >= nchan) { 1103 if_printf(sc->hn_ifp, 1104 "RSS indirect table %d fixup: %u -> %d\n", 1105 i, rss->rss_ind[i], nchan - 1); 1106 rss->rss_ind[i] = nchan - 1; 1107 } 1108 } 1109 } 1110 1111 static int 1112 hn_ifmedia_upd(struct ifnet *ifp __unused) 1113 { 1114 1115 return EOPNOTSUPP; 1116 } 1117 1118 static void 1119 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1120 { 1121 struct hn_softc *sc = ifp->if_softc; 1122 1123 ifmr->ifm_status = IFM_AVALID; 1124 ifmr->ifm_active = IFM_ETHER; 1125 1126 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1127 ifmr->ifm_active |= IFM_NONE; 1128 return; 1129 } 1130 ifmr->ifm_status |= IFM_ACTIVE; 1131 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1132 } 1133 1134 static void 1135 hn_rxvf_set_task(void *xarg, int pending __unused) 1136 { 1137 struct hn_rxvf_setarg *arg = xarg; 1138 1139 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1140 } 1141 1142 static void 1143 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1144 { 1145 struct hn_rx_ring *rxr; 1146 struct hn_rxvf_setarg arg; 1147 struct task task; 1148 int i; 1149 1150 HN_LOCK_ASSERT(sc); 1151 1152 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1153 1154 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1155 rxr = &sc->hn_rx_ring[i]; 1156 1157 if (i < sc->hn_rx_ring_inuse) { 1158 arg.rxr = rxr; 1159 arg.vf_ifp = vf_ifp; 1160 vmbus_chan_run_task(rxr->hn_chan, &task); 1161 } else { 1162 rxr->hn_rxvf_ifp = vf_ifp; 1163 } 1164 } 1165 } 1166 1167 static bool 1168 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1169 { 1170 const struct ifnet *hn_ifp; 1171 1172 hn_ifp = sc->hn_ifp; 1173 1174 if (ifp == hn_ifp) 1175 return (false); 1176 1177 if (ifp->if_alloctype != IFT_ETHER) 1178 return (false); 1179 1180 /* Ignore lagg/vlan interfaces */ 1181 if (strcmp(ifp->if_dname, "lagg") == 0 || 1182 strcmp(ifp->if_dname, "vlan") == 0) 1183 return (false); 1184 1185 /* 1186 * During detach events ifp->if_addr might be NULL. 
1187 * Make sure the bcmp() below doesn't panic on that: 1188 */ 1189 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1190 return (false); 1191 1192 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1193 return (false); 1194 1195 return (true); 1196 } 1197 1198 static void 1199 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1200 { 1201 struct ifnet *hn_ifp; 1202 1203 HN_LOCK(sc); 1204 1205 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1206 goto out; 1207 1208 if (!hn_ismyvf(sc, ifp)) 1209 goto out; 1210 hn_ifp = sc->hn_ifp; 1211 1212 if (rxvf) { 1213 if (sc->hn_flags & HN_FLAG_RXVF) 1214 goto out; 1215 1216 sc->hn_flags |= HN_FLAG_RXVF; 1217 hn_rxfilter_config(sc); 1218 } else { 1219 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1220 goto out; 1221 1222 sc->hn_flags &= ~HN_FLAG_RXVF; 1223 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1224 hn_rxfilter_config(sc); 1225 else 1226 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1227 } 1228 1229 hn_nvs_set_datapath(sc, 1230 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1231 1232 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1233 1234 if (rxvf) { 1235 hn_vf_rss_fixup(sc, true); 1236 hn_suspend_mgmt(sc); 1237 sc->hn_link_flags &= 1238 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1239 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1240 } else { 1241 hn_vf_rss_restore(sc); 1242 hn_resume_mgmt(sc); 1243 } 1244 1245 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1246 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1247 1248 if (bootverbose) { 1249 if_printf(hn_ifp, "datapath is switched %s %s\n", 1250 rxvf ? "to" : "from", ifp->if_xname); 1251 } 1252 out: 1253 HN_UNLOCK(sc); 1254 } 1255 1256 static void 1257 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1258 { 1259 1260 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1261 return; 1262 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1263 } 1264 1265 static void 1266 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1267 { 1268 1269 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1270 } 1271 1272 static int 1273 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1274 { 1275 struct ifnet *ifp, *vf_ifp; 1276 uint64_t tmp; 1277 int error; 1278 1279 HN_LOCK_ASSERT(sc); 1280 ifp = sc->hn_ifp; 1281 vf_ifp = sc->hn_vf_ifp; 1282 1283 /* 1284 * Fix up requested capabilities w/ supported capabilities, 1285 * since the supported capabilities could have been changed. 1286 */ 1287 ifr->ifr_reqcap &= ifp->if_capabilities; 1288 /* Pass SIOCSIFCAP to VF. */ 1289 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1290 1291 /* 1292 * NOTE: 1293 * The error will be propagated to the callers, however, it 1294 * is _not_ useful here. 1295 */ 1296 1297 /* 1298 * Merge VF's enabled capabilities. 
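 *
 * For example, if IFCAP_TXCSUM ends up disabled in the merged
 * if_capenable, the IPv4 checksum offload bits shared by the VF and the
 * synthetic device are cleared from hn(4)'s if_hwassist below; the IPv6
 * checksum and TSO assist bits are handled the same way.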
 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF an effort in
			 * vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
1413 */ 1414 while (m != NULL) { 1415 mn = m->m_nextpkt; 1416 m->m_nextpkt = NULL; 1417 m_freem(m); 1418 m = mn; 1419 } 1420 } 1421 } 1422 1423 static void 1424 hn_mtu_change_fixup(struct hn_softc *sc) 1425 { 1426 struct ifnet *ifp; 1427 1428 HN_LOCK_ASSERT(sc); 1429 ifp = sc->hn_ifp; 1430 1431 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1432 #if __FreeBSD_version >= 1100099 1433 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1434 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1435 #endif 1436 } 1437 1438 static uint32_t 1439 hn_rss_type_fromndis(uint32_t rss_hash) 1440 { 1441 uint32_t types = 0; 1442 1443 if (rss_hash & NDIS_HASH_IPV4) 1444 types |= RSS_TYPE_IPV4; 1445 if (rss_hash & NDIS_HASH_TCP_IPV4) 1446 types |= RSS_TYPE_TCP_IPV4; 1447 if (rss_hash & NDIS_HASH_IPV6) 1448 types |= RSS_TYPE_IPV6; 1449 if (rss_hash & NDIS_HASH_IPV6_EX) 1450 types |= RSS_TYPE_IPV6_EX; 1451 if (rss_hash & NDIS_HASH_TCP_IPV6) 1452 types |= RSS_TYPE_TCP_IPV6; 1453 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1454 types |= RSS_TYPE_TCP_IPV6_EX; 1455 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1456 types |= RSS_TYPE_UDP_IPV4; 1457 return (types); 1458 } 1459 1460 static uint32_t 1461 hn_rss_type_tondis(uint32_t types) 1462 { 1463 uint32_t rss_hash = 0; 1464 1465 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1466 ("UDP6 and UDP6EX are not supported")); 1467 1468 if (types & RSS_TYPE_IPV4) 1469 rss_hash |= NDIS_HASH_IPV4; 1470 if (types & RSS_TYPE_TCP_IPV4) 1471 rss_hash |= NDIS_HASH_TCP_IPV4; 1472 if (types & RSS_TYPE_IPV6) 1473 rss_hash |= NDIS_HASH_IPV6; 1474 if (types & RSS_TYPE_IPV6_EX) 1475 rss_hash |= NDIS_HASH_IPV6_EX; 1476 if (types & RSS_TYPE_TCP_IPV6) 1477 rss_hash |= NDIS_HASH_TCP_IPV6; 1478 if (types & RSS_TYPE_TCP_IPV6_EX) 1479 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1480 if (types & RSS_TYPE_UDP_IPV4) 1481 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1482 return (rss_hash); 1483 } 1484 1485 static void 1486 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1487 { 1488 int i; 1489 1490 HN_LOCK_ASSERT(sc); 1491 1492 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1493 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1494 } 1495 1496 static void 1497 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1498 { 1499 struct ifnet *ifp, *vf_ifp; 1500 struct ifrsshash ifrh; 1501 struct ifrsskey ifrk; 1502 int error; 1503 uint32_t my_types, diff_types, mbuf_types = 0; 1504 1505 HN_LOCK_ASSERT(sc); 1506 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1507 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1508 1509 if (sc->hn_rx_ring_inuse == 1) { 1510 /* No RSS on synthetic parts; done. */ 1511 return; 1512 } 1513 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1514 /* Synthetic parts do not support Toeplitz; done. */ 1515 return; 1516 } 1517 1518 ifp = sc->hn_ifp; 1519 vf_ifp = sc->hn_vf_ifp; 1520 1521 /* 1522 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1523 * supported. 
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1616 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1617 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1618 } 1619 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1620 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1621 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1622 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1623 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1624 } 1625 if ((my_types & RSS_TYPE_UDP_IPV6) && 1626 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1627 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1628 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1629 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1630 } 1631 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1632 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1633 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1634 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1635 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1636 } 1637 1638 /* 1639 * Indirect table does not matter. 1640 */ 1641 1642 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1643 hn_rss_type_tondis(my_types); 1644 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1645 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1646 1647 if (reconf) { 1648 error = hn_rss_reconfig(sc); 1649 if (error) { 1650 /* XXX roll-back? */ 1651 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1652 /* XXX keep going. */ 1653 } 1654 } 1655 done: 1656 /* Hash deliverability for mbufs. */ 1657 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1658 } 1659 1660 static void 1661 hn_vf_rss_restore(struct hn_softc *sc) 1662 { 1663 1664 HN_LOCK_ASSERT(sc); 1665 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1666 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1667 1668 if (sc->hn_rx_ring_inuse == 1) 1669 goto done; 1670 1671 /* 1672 * Restore hash types. Key does _not_ matter. 1673 */ 1674 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1675 int error; 1676 1677 sc->hn_rss_hash = sc->hn_rss_hcap; 1678 error = hn_rss_reconfig(sc); 1679 if (error) { 1680 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1681 error); 1682 /* XXX keep going. */ 1683 } 1684 } 1685 done: 1686 /* Hash deliverability for mbufs. */ 1687 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1688 } 1689 1690 static void 1691 hn_xpnt_vf_setready(struct hn_softc *sc) 1692 { 1693 struct ifnet *ifp, *vf_ifp; 1694 struct ifreq ifr; 1695 1696 HN_LOCK_ASSERT(sc); 1697 ifp = sc->hn_ifp; 1698 vf_ifp = sc->hn_vf_ifp; 1699 1700 /* 1701 * Mark the VF ready. 1702 */ 1703 sc->hn_vf_rdytick = 0; 1704 1705 /* 1706 * Save information for restoration. 1707 */ 1708 sc->hn_saved_caps = ifp->if_capabilities; 1709 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1710 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1711 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1712 1713 /* 1714 * Intersect supported/enabled capabilities. 1715 * 1716 * NOTE: 1717 * if_hwassist is not changed here. 1718 */ 1719 ifp->if_capabilities &= vf_ifp->if_capabilities; 1720 ifp->if_capenable &= ifp->if_capabilities; 1721 1722 /* 1723 * Fix TSO settings. 1724 */ 1725 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1726 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1727 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1728 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1729 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1730 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1731 1732 /* 1733 * Change VF's enabled capabilities. 
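 *
 * This goes through the VF's SIOCSIFCAP handler via hn_xpnt_vf_iocsetcaps(),
 * using hn(4)'s merged if_capenable as the requested capability set; the
 * VF's MTU is brought in line the same way below when hn(4) is not using
 * the default ETHERMTU.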
1734 */ 1735 memset(&ifr, 0, sizeof(ifr)); 1736 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1737 ifr.ifr_reqcap = ifp->if_capenable; 1738 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1739 1740 if (ifp->if_mtu != ETHERMTU) { 1741 int error; 1742 1743 /* 1744 * Change VF's MTU. 1745 */ 1746 memset(&ifr, 0, sizeof(ifr)); 1747 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1748 ifr.ifr_mtu = ifp->if_mtu; 1749 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); 1750 if (error) { 1751 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1752 vf_ifp->if_xname, ifp->if_mtu); 1753 if (ifp->if_mtu > ETHERMTU) { 1754 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1755 1756 /* 1757 * XXX 1758 * No need to adjust the synthetic parts' MTU; 1759 * failure of the adjustment will cause us 1760 * infinite headache. 1761 */ 1762 ifp->if_mtu = ETHERMTU; 1763 hn_mtu_change_fixup(sc); 1764 } 1765 } 1766 } 1767 } 1768 1769 static bool 1770 hn_xpnt_vf_isready(struct hn_softc *sc) 1771 { 1772 1773 HN_LOCK_ASSERT(sc); 1774 1775 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1776 return (false); 1777 1778 if (sc->hn_vf_rdytick == 0) 1779 return (true); 1780 1781 if (sc->hn_vf_rdytick > ticks) 1782 return (false); 1783 1784 /* Mark VF as ready. */ 1785 hn_xpnt_vf_setready(sc); 1786 return (true); 1787 } 1788 1789 static void 1790 hn_xpnt_vf_setenable(struct hn_softc *sc) 1791 { 1792 int i; 1793 1794 HN_LOCK_ASSERT(sc); 1795 1796 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1797 rm_wlock(&sc->hn_vf_lock); 1798 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1799 rm_wunlock(&sc->hn_vf_lock); 1800 1801 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1802 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1803 } 1804 1805 static void 1806 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1807 { 1808 int i; 1809 1810 HN_LOCK_ASSERT(sc); 1811 1812 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1813 rm_wlock(&sc->hn_vf_lock); 1814 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1815 if (clear_vf) 1816 sc->hn_vf_ifp = NULL; 1817 rm_wunlock(&sc->hn_vf_lock); 1818 1819 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1820 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1821 } 1822 1823 static void 1824 hn_xpnt_vf_init(struct hn_softc *sc) 1825 { 1826 int error; 1827 1828 HN_LOCK_ASSERT(sc); 1829 1830 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1831 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1832 1833 if (bootverbose) { 1834 if_printf(sc->hn_ifp, "try bringing up %s\n", 1835 sc->hn_vf_ifp->if_xname); 1836 } 1837 1838 /* 1839 * Bring the VF up. 1840 */ 1841 hn_xpnt_vf_saveifflags(sc); 1842 sc->hn_vf_ifp->if_flags |= IFF_UP; 1843 error = hn_xpnt_vf_iocsetflags(sc); 1844 if (error) { 1845 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1846 sc->hn_vf_ifp->if_xname, error); 1847 return; 1848 } 1849 1850 /* 1851 * NOTE: 1852 * Datapath setting must happen _after_ bringing the VF up. 1853 */ 1854 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1855 1856 /* 1857 * NOTE: 1858 * Fixup RSS related bits _after_ the VF is brought up, since 1859 * many VFs generate RSS key during it's initialization. 1860 */ 1861 hn_vf_rss_fixup(sc, true); 1862 1863 /* Mark transparent mode VF as enabled. 
*/ 1864 hn_xpnt_vf_setenable(sc); 1865 } 1866 1867 static void 1868 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1869 { 1870 struct hn_softc *sc = xsc; 1871 1872 HN_LOCK(sc); 1873 1874 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1875 goto done; 1876 if (sc->hn_vf_ifp == NULL) 1877 goto done; 1878 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1879 goto done; 1880 1881 if (sc->hn_vf_rdytick != 0) { 1882 /* Mark VF as ready. */ 1883 hn_xpnt_vf_setready(sc); 1884 } 1885 1886 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1887 /* 1888 * Delayed VF initialization. 1889 */ 1890 if (bootverbose) { 1891 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1892 sc->hn_vf_ifp->if_xname); 1893 } 1894 hn_xpnt_vf_init(sc); 1895 } 1896 done: 1897 HN_UNLOCK(sc); 1898 } 1899 1900 static void 1901 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1902 { 1903 struct hn_softc *sc = xsc; 1904 1905 HN_LOCK(sc); 1906 1907 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1908 goto done; 1909 1910 if (!hn_ismyvf(sc, ifp)) 1911 goto done; 1912 1913 if (sc->hn_vf_ifp != NULL) { 1914 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1915 sc->hn_vf_ifp->if_xname); 1916 goto done; 1917 } 1918 1919 if (hn_xpnt_vf && ifp->if_start != NULL) { 1920 /* 1921 * ifnet.if_start is _not_ supported by transparent 1922 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1923 */ 1924 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1925 "in transparent VF mode.\n", ifp->if_xname); 1926 goto done; 1927 } 1928 1929 rm_wlock(&hn_vfmap_lock); 1930 1931 if (ifp->if_index >= hn_vfmap_size) { 1932 struct ifnet **newmap; 1933 int newsize; 1934 1935 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1936 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1937 M_WAITOK | M_ZERO); 1938 1939 memcpy(newmap, hn_vfmap, 1940 sizeof(struct ifnet *) * hn_vfmap_size); 1941 free(hn_vfmap, M_DEVBUF); 1942 hn_vfmap = newmap; 1943 hn_vfmap_size = newsize; 1944 } 1945 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1946 ("%s: ifindex %d was mapped to %s", 1947 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1948 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1949 1950 rm_wunlock(&hn_vfmap_lock); 1951 1952 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1953 rm_wlock(&sc->hn_vf_lock); 1954 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1955 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1956 sc->hn_vf_ifp = ifp; 1957 rm_wunlock(&sc->hn_vf_lock); 1958 1959 if (hn_xpnt_vf) { 1960 int wait_ticks; 1961 1962 /* 1963 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1964 * Save vf_ifp's current if_input for later restoration. 1965 */ 1966 sc->hn_vf_input = ifp->if_input; 1967 ifp->if_input = hn_xpnt_vf_input; 1968 1969 /* 1970 * Stop link status management; use the VF's. 1971 */ 1972 hn_suspend_mgmt(sc); 1973 1974 /* 1975 * Give VF sometime to complete its attach routing. 1976 */ 1977 wait_ticks = hn_xpnt_vf_attwait * hz; 1978 sc->hn_vf_rdytick = ticks + wait_ticks; 1979 1980 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1981 wait_ticks); 1982 } 1983 done: 1984 HN_UNLOCK(sc); 1985 } 1986 1987 static void 1988 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1989 { 1990 struct hn_softc *sc = xsc; 1991 1992 HN_LOCK(sc); 1993 1994 if (sc->hn_vf_ifp == NULL) 1995 goto done; 1996 1997 if (!hn_ismyvf(sc, ifp)) 1998 goto done; 1999 2000 if (hn_xpnt_vf) { 2001 /* 2002 * Make sure that the delayed initialization is not running. 
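 * (taskqueue_drain_timeout() blocks until the hn_vf_init timeout task
 * is neither pending nor running; sleeping here with HN_LOCK held
 * would deadlock against hn_xpnt_vf_init_taskfunc(), which takes
 * HN_LOCK first thing.)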
2003 * 2004 * NOTE: 2005 * - This lock _must_ be released, since the hn_vf_init task 2006 * will try holding this lock. 2007 * - It is safe to release this lock here, since the 2008 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2009 * 2010 * XXX racy, if hn(4) ever detached. 2011 */ 2012 HN_UNLOCK(sc); 2013 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2014 HN_LOCK(sc); 2015 2016 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2017 sc->hn_ifp->if_xname)); 2018 ifp->if_input = sc->hn_vf_input; 2019 sc->hn_vf_input = NULL; 2020 2021 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2022 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2023 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2024 2025 if (sc->hn_vf_rdytick == 0) { 2026 /* 2027 * The VF was ready; restore some settings. 2028 */ 2029 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2030 /* 2031 * NOTE: 2032 * There is _no_ need to fixup if_capenable and 2033 * if_hwassist, since the if_capabilities before 2034 * restoration was an intersection of the VF's 2035 * if_capabilites and the synthetic device's 2036 * if_capabilites. 2037 */ 2038 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2039 sc->hn_ifp->if_hw_tsomaxsegcount = 2040 sc->hn_saved_tsosegcnt; 2041 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2042 } 2043 2044 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2045 /* 2046 * Restore RSS settings. 2047 */ 2048 hn_vf_rss_restore(sc); 2049 2050 /* 2051 * Resume link status management, which was suspended 2052 * by hn_ifnet_attevent(). 2053 */ 2054 hn_resume_mgmt(sc); 2055 } 2056 } 2057 2058 /* Mark transparent mode VF as disabled. */ 2059 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2060 2061 rm_wlock(&hn_vfmap_lock); 2062 2063 KASSERT(ifp->if_index < hn_vfmap_size, 2064 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2065 if (hn_vfmap[ifp->if_index] != NULL) { 2066 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2067 ("%s: ifindex %d was mapped to %s", 2068 ifp->if_xname, ifp->if_index, 2069 hn_vfmap[ifp->if_index]->if_xname)); 2070 hn_vfmap[ifp->if_index] = NULL; 2071 } 2072 2073 rm_wunlock(&hn_vfmap_lock); 2074 done: 2075 HN_UNLOCK(sc); 2076 } 2077 2078 static void 2079 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2080 { 2081 struct hn_softc *sc = xsc; 2082 2083 if (sc->hn_vf_ifp == ifp) 2084 if_link_state_change(sc->hn_ifp, link_state); 2085 } 2086 2087 static int 2088 hn_probe(device_t dev) 2089 { 2090 2091 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2092 device_set_desc(dev, "Hyper-V Network Interface"); 2093 return BUS_PROBE_DEFAULT; 2094 } 2095 return ENXIO; 2096 } 2097 2098 static int 2099 hn_attach(device_t dev) 2100 { 2101 struct hn_softc *sc = device_get_softc(dev); 2102 struct sysctl_oid_list *child; 2103 struct sysctl_ctx_list *ctx; 2104 uint8_t eaddr[ETHER_ADDR_LEN]; 2105 struct ifnet *ifp = NULL; 2106 int error, ring_cnt, tx_ring_cnt; 2107 uint32_t mtu; 2108 2109 sc->hn_dev = dev; 2110 sc->hn_prichan = vmbus_get_channel(dev); 2111 HN_LOCK_INIT(sc); 2112 rm_init(&sc->hn_vf_lock, "hnvf"); 2113 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2114 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2115 2116 /* 2117 * Initialize these tunables once. 2118 */ 2119 sc->hn_agg_size = hn_tx_agg_size; 2120 sc->hn_agg_pkts = hn_tx_agg_pkts; 2121 2122 /* 2123 * Setup taskqueue for transmission. 
2124 */ 2125 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2126 int i; 2127 2128 sc->hn_tx_taskqs = 2129 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2130 M_DEVBUF, M_WAITOK); 2131 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2132 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2133 M_WAITOK, taskqueue_thread_enqueue, 2134 &sc->hn_tx_taskqs[i]); 2135 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2136 "%s tx%d", device_get_nameunit(dev), i); 2137 } 2138 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2139 sc->hn_tx_taskqs = hn_tx_taskque; 2140 } 2141 2142 /* 2143 * Setup taskqueue for mangement tasks, e.g. link status. 2144 */ 2145 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2146 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2147 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2148 device_get_nameunit(dev)); 2149 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2150 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2151 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2152 hn_netchg_status_taskfunc, sc); 2153 2154 if (hn_xpnt_vf) { 2155 /* 2156 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2157 */ 2158 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2159 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2160 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2161 device_get_nameunit(dev)); 2162 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2163 hn_xpnt_vf_init_taskfunc, sc); 2164 } 2165 2166 /* 2167 * Allocate ifnet and setup its name earlier, so that if_printf 2168 * can be used by functions, which will be called after 2169 * ether_ifattach(). 2170 */ 2171 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2172 ifp->if_softc = sc; 2173 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2174 2175 /* 2176 * Initialize ifmedia earlier so that it can be unconditionally 2177 * destroyed, if error happened later on. 2178 */ 2179 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2180 2181 /* 2182 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2183 * to use (tx_ring_cnt). 2184 * 2185 * NOTE: 2186 * The # of RX rings to use is same as the # of channels to use. 2187 */ 2188 ring_cnt = hn_chan_cnt; 2189 if (ring_cnt <= 0) { 2190 /* Default */ 2191 ring_cnt = mp_ncpus; 2192 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2193 ring_cnt = HN_RING_CNT_DEF_MAX; 2194 } else if (ring_cnt > mp_ncpus) { 2195 ring_cnt = mp_ncpus; 2196 } 2197 #ifdef RSS 2198 if (ring_cnt > rss_getnumbuckets()) 2199 ring_cnt = rss_getnumbuckets(); 2200 #endif 2201 2202 tx_ring_cnt = hn_tx_ring_cnt; 2203 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2204 tx_ring_cnt = ring_cnt; 2205 #ifdef HN_IFSTART_SUPPORT 2206 if (hn_use_if_start) { 2207 /* ifnet.if_start only needs one TX ring. */ 2208 tx_ring_cnt = 1; 2209 } 2210 #endif 2211 2212 /* 2213 * Set the leader CPU for channels. 2214 */ 2215 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2216 2217 /* 2218 * Create enough TX/RX rings, even if only limited number of 2219 * channels can be allocated. 2220 */ 2221 error = hn_create_tx_data(sc, tx_ring_cnt); 2222 if (error) 2223 goto failed; 2224 error = hn_create_rx_data(sc, ring_cnt); 2225 if (error) 2226 goto failed; 2227 2228 /* 2229 * Create transaction context for NVS and RNDIS transactions. 
2230 */ 2231 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2232 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2233 if (sc->hn_xact == NULL) { 2234 error = ENXIO; 2235 goto failed; 2236 } 2237 2238 /* 2239 * Install orphan handler for the revocation of this device's 2240 * primary channel. 2241 * 2242 * NOTE: 2243 * The processing order is critical here: 2244 * Install the orphan handler, _before_ testing whether this 2245 * device's primary channel has been revoked or not. 2246 */ 2247 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2248 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2249 error = ENXIO; 2250 goto failed; 2251 } 2252 2253 /* 2254 * Attach the synthetic parts, i.e. NVS and RNDIS. 2255 */ 2256 error = hn_synth_attach(sc, ETHERMTU); 2257 if (error) 2258 goto failed; 2259 2260 error = hn_rndis_get_eaddr(sc, eaddr); 2261 if (error) 2262 goto failed; 2263 2264 error = hn_rndis_get_mtu(sc, &mtu); 2265 if (error) 2266 mtu = ETHERMTU; 2267 else if (bootverbose) 2268 device_printf(dev, "RNDIS mtu %u\n", mtu); 2269 2270 #if __FreeBSD_version >= 1100099 2271 if (sc->hn_rx_ring_inuse > 1) { 2272 /* 2273 * Reduce TCP segment aggregation limit for multiple 2274 * RX rings to increase ACK timeliness. 2275 */ 2276 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2277 } 2278 #endif 2279 2280 /* 2281 * Fixup TX/RX stuffs after synthetic parts are attached. 2282 */ 2283 hn_fixup_tx_data(sc); 2284 hn_fixup_rx_data(sc); 2285 2286 ctx = device_get_sysctl_ctx(dev); 2287 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2288 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2289 &sc->hn_nvs_ver, 0, "NVS version"); 2290 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2291 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2292 hn_ndis_version_sysctl, "A", "NDIS version"); 2293 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2294 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2295 hn_caps_sysctl, "A", "capabilities"); 2296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2297 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2298 hn_hwassist_sysctl, "A", "hwassist"); 2299 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2300 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2301 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2302 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2303 "max # of TSO segments"); 2304 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2305 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2306 "max size of TSO segment"); 2307 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2308 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2309 hn_rxfilter_sysctl, "A", "rxfilter"); 2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2311 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2312 hn_rss_hash_sysctl, "A", "RSS hash"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2314 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2315 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2317 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2318 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2319 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2320 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2321 #ifndef RSS 2322 /* 2323 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2324 */ 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2326 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2327 hn_rss_key_sysctl, "IU", "RSS key"); 2328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2329 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2330 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2331 #endif 2332 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2333 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2334 "RNDIS offered packet transmission aggregation size limit"); 2335 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2336 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2337 "RNDIS offered packet transmission aggregation count limit"); 2338 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2339 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2340 "RNDIS packet transmission aggregation alignment"); 2341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2342 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2343 hn_txagg_size_sysctl, "I", 2344 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2345 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2346 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2347 hn_txagg_pkts_sysctl, "I", 2348 "Packet transmission aggregation packets, " 2349 "0 -- disable, -1 -- auto"); 2350 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2351 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2352 hn_polling_sysctl, "I", 2353 "Polling frequency: [100,1000000], 0 disable polling"); 2354 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2355 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2356 hn_vf_sysctl, "A", "Virtual Function's name"); 2357 if (!hn_xpnt_vf) { 2358 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2359 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2360 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2361 } else { 2362 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2363 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2364 hn_xpnt_vf_enabled_sysctl, "I", 2365 "Transparent VF enabled"); 2366 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2367 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2368 hn_xpnt_vf_accbpf_sysctl, "I", 2369 "Accurate BPF for transparent VF"); 2370 } 2371 2372 /* 2373 * Setup the ifmedia, which has been initialized earlier. 2374 */ 2375 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2376 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2377 /* XXX ifmedia_set really should do this for us */ 2378 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2379 2380 /* 2381 * Setup the ifnet for this interface. 2382 */ 2383 2384 ifp->if_baudrate = IF_Gbps(10); 2385 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2386 ifp->if_ioctl = hn_ioctl; 2387 ifp->if_init = hn_init; 2388 #ifdef HN_IFSTART_SUPPORT 2389 if (hn_use_if_start) { 2390 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2391 2392 ifp->if_start = hn_start; 2393 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2394 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2395 IFQ_SET_READY(&ifp->if_snd); 2396 } else 2397 #endif 2398 { 2399 ifp->if_transmit = hn_transmit; 2400 ifp->if_qflush = hn_xmit_qflush; 2401 } 2402 2403 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2404 #ifdef foo 2405 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2406 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2407 #endif 2408 if (sc->hn_caps & HN_CAP_VLAN) { 2409 /* XXX not sure about VLAN_MTU. 
*/ 2410 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2411 } 2412 2413 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2414 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2415 ifp->if_capabilities |= IFCAP_TXCSUM; 2416 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2417 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2418 if (sc->hn_caps & HN_CAP_TSO4) { 2419 ifp->if_capabilities |= IFCAP_TSO4; 2420 ifp->if_hwassist |= CSUM_IP_TSO; 2421 } 2422 if (sc->hn_caps & HN_CAP_TSO6) { 2423 ifp->if_capabilities |= IFCAP_TSO6; 2424 ifp->if_hwassist |= CSUM_IP6_TSO; 2425 } 2426 2427 /* Enable all available capabilities by default. */ 2428 ifp->if_capenable = ifp->if_capabilities; 2429 2430 /* 2431 * Disable IPv6 TSO and TXCSUM by default, they still can 2432 * be enabled through SIOCSIFCAP. 2433 */ 2434 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2435 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2436 2437 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2438 /* 2439 * Lock hn_set_tso_maxsize() to simplify its 2440 * internal logic. 2441 */ 2442 HN_LOCK(sc); 2443 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2444 HN_UNLOCK(sc); 2445 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2446 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2447 } 2448 2449 ether_ifattach(ifp, eaddr); 2450 2451 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2452 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2453 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2454 } 2455 if (mtu < ETHERMTU) { 2456 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2457 ifp->if_mtu = mtu; 2458 } 2459 2460 /* Inform the upper layer about the long frame support. */ 2461 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2462 2463 /* 2464 * Kick off link status check. 2465 */ 2466 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2467 hn_update_link_status(sc); 2468 2469 if (!hn_xpnt_vf) { 2470 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2471 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2472 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2473 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2474 } else { 2475 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2476 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2477 } 2478 2479 /* 2480 * NOTE: 2481 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2482 * since interface's LLADDR is needed; interface LLADDR is not 2483 * available when ifnet_arrival event is triggered. 2484 */ 2485 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2486 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2487 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2488 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2489 2490 return (0); 2491 failed: 2492 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2493 hn_synth_detach(sc); 2494 hn_detach(dev); 2495 return (error); 2496 } 2497 2498 static int 2499 hn_detach(device_t dev) 2500 { 2501 struct hn_softc *sc = device_get_softc(dev); 2502 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2503 2504 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2505 /* 2506 * In case that the vmbus missed the orphan handler 2507 * installation. 
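 * Force the xact context into the orphaned state now, so that any
 * control transaction still waiting for a reply on the revoked
 * channel is woken up and detach does not block on it.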
2508 */ 2509 vmbus_xact_ctx_orphan(sc->hn_xact); 2510 } 2511 2512 if (sc->hn_ifaddr_evthand != NULL) 2513 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2514 if (sc->hn_ifnet_evthand != NULL) 2515 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2516 if (sc->hn_ifnet_atthand != NULL) { 2517 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2518 sc->hn_ifnet_atthand); 2519 } 2520 if (sc->hn_ifnet_dethand != NULL) { 2521 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2522 sc->hn_ifnet_dethand); 2523 } 2524 if (sc->hn_ifnet_lnkhand != NULL) 2525 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2526 2527 vf_ifp = sc->hn_vf_ifp; 2528 __compiler_membar(); 2529 if (vf_ifp != NULL) 2530 hn_ifnet_detevent(sc, vf_ifp); 2531 2532 if (device_is_attached(dev)) { 2533 HN_LOCK(sc); 2534 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2535 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2536 hn_stop(sc, true); 2537 /* 2538 * NOTE: 2539 * hn_stop() only suspends data, so managment 2540 * stuffs have to be suspended manually here. 2541 */ 2542 hn_suspend_mgmt(sc); 2543 hn_synth_detach(sc); 2544 } 2545 HN_UNLOCK(sc); 2546 ether_ifdetach(ifp); 2547 } 2548 2549 ifmedia_removeall(&sc->hn_media); 2550 hn_destroy_rx_data(sc); 2551 hn_destroy_tx_data(sc); 2552 2553 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2554 int i; 2555 2556 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2557 taskqueue_free(sc->hn_tx_taskqs[i]); 2558 free(sc->hn_tx_taskqs, M_DEVBUF); 2559 } 2560 taskqueue_free(sc->hn_mgmt_taskq0); 2561 if (sc->hn_vf_taskq != NULL) 2562 taskqueue_free(sc->hn_vf_taskq); 2563 2564 if (sc->hn_xact != NULL) { 2565 /* 2566 * Uninstall the orphan handler _before_ the xact is 2567 * destructed. 2568 */ 2569 vmbus_chan_unset_orphan(sc->hn_prichan); 2570 vmbus_xact_ctx_destroy(sc->hn_xact); 2571 } 2572 2573 if_free(ifp); 2574 2575 HN_LOCK_DESTROY(sc); 2576 rm_destroy(&sc->hn_vf_lock); 2577 return (0); 2578 } 2579 2580 static int 2581 hn_shutdown(device_t dev) 2582 { 2583 2584 return (0); 2585 } 2586 2587 static void 2588 hn_link_status(struct hn_softc *sc) 2589 { 2590 uint32_t link_status; 2591 int error; 2592 2593 error = hn_rndis_get_linkstatus(sc, &link_status); 2594 if (error) { 2595 /* XXX what to do? */ 2596 return; 2597 } 2598 2599 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2600 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2601 else 2602 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2603 if_link_state_change(sc->hn_ifp, 2604 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2605 LINK_STATE_UP : LINK_STATE_DOWN); 2606 } 2607 2608 static void 2609 hn_link_taskfunc(void *xsc, int pending __unused) 2610 { 2611 struct hn_softc *sc = xsc; 2612 2613 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2614 return; 2615 hn_link_status(sc); 2616 } 2617 2618 static void 2619 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2620 { 2621 struct hn_softc *sc = xsc; 2622 2623 /* Prevent any link status checks from running. */ 2624 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2625 2626 /* 2627 * Fake up a [link down --> link up] state change; 5 seconds 2628 * delay is used, which closely simulates miibus reaction 2629 * upon link down event. 
2630 */ 2631 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2632 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2633 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2634 &sc->hn_netchg_status, 5 * hz); 2635 } 2636 2637 static void 2638 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2639 { 2640 struct hn_softc *sc = xsc; 2641 2642 /* Re-allow link status checks. */ 2643 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2644 hn_link_status(sc); 2645 } 2646 2647 static void 2648 hn_update_link_status(struct hn_softc *sc) 2649 { 2650 2651 if (sc->hn_mgmt_taskq != NULL) 2652 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2653 } 2654 2655 static void 2656 hn_change_network(struct hn_softc *sc) 2657 { 2658 2659 if (sc->hn_mgmt_taskq != NULL) 2660 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2661 } 2662 2663 static __inline int 2664 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2665 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2666 { 2667 struct mbuf *m = *m_head; 2668 int error; 2669 2670 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2671 2672 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2673 m, segs, nsegs, BUS_DMA_NOWAIT); 2674 if (error == EFBIG) { 2675 struct mbuf *m_new; 2676 2677 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2678 if (m_new == NULL) 2679 return ENOBUFS; 2680 else 2681 *m_head = m = m_new; 2682 txr->hn_tx_collapsed++; 2683 2684 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2685 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2686 } 2687 if (!error) { 2688 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2689 BUS_DMASYNC_PREWRITE); 2690 txd->flags |= HN_TXD_FLAG_DMAMAP; 2691 } 2692 return error; 2693 } 2694 2695 static __inline int 2696 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2697 { 2698 2699 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2700 ("put an onlist txd %#x", txd->flags)); 2701 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2702 ("put an onagg txd %#x", txd->flags)); 2703 2704 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2705 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2706 return 0; 2707 2708 if (!STAILQ_EMPTY(&txd->agg_list)) { 2709 struct hn_txdesc *tmp_txd; 2710 2711 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2712 int freed; 2713 2714 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2715 ("resursive aggregation on aggregated txdesc")); 2716 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2717 ("not aggregated txdesc")); 2718 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2719 ("aggregated txdesc uses dmamap")); 2720 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2721 ("aggregated txdesc consumes " 2722 "chimney sending buffer")); 2723 KASSERT(tmp_txd->chim_size == 0, 2724 ("aggregated txdesc has non-zero " 2725 "chimney sending size")); 2726 2727 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2728 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2729 freed = hn_txdesc_put(txr, tmp_txd); 2730 KASSERT(freed, ("failed to free aggregated txdesc")); 2731 } 2732 } 2733 2734 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2735 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2736 ("chim txd uses dmamap")); 2737 hn_chim_free(txr->hn_sc, txd->chim_index); 2738 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2739 txd->chim_size = 0; 2740 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2741 bus_dmamap_sync(txr->hn_tx_data_dtag, 2742 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2743 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2744 txd->data_dmap); 2745 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2746 } 2747 2748 if (txd->m != NULL) { 2749 m_freem(txd->m); 2750 txd->m = NULL; 2751 } 2752 2753 txd->flags |= HN_TXD_FLAG_ONLIST; 2754 #ifndef HN_USE_TXDESC_BUFRING 2755 mtx_lock_spin(&txr->hn_txlist_spin); 2756 KASSERT(txr->hn_txdesc_avail >= 0 && 2757 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2758 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2759 txr->hn_txdesc_avail++; 2760 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2761 mtx_unlock_spin(&txr->hn_txlist_spin); 2762 #else /* HN_USE_TXDESC_BUFRING */ 2763 #ifdef HN_DEBUG 2764 atomic_add_int(&txr->hn_txdesc_avail, 1); 2765 #endif 2766 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2767 #endif /* !HN_USE_TXDESC_BUFRING */ 2768 2769 return 1; 2770 } 2771 2772 static __inline struct hn_txdesc * 2773 hn_txdesc_get(struct hn_tx_ring *txr) 2774 { 2775 struct hn_txdesc *txd; 2776 2777 #ifndef HN_USE_TXDESC_BUFRING 2778 mtx_lock_spin(&txr->hn_txlist_spin); 2779 txd = SLIST_FIRST(&txr->hn_txlist); 2780 if (txd != NULL) { 2781 KASSERT(txr->hn_txdesc_avail > 0, 2782 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2783 txr->hn_txdesc_avail--; 2784 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2785 } 2786 mtx_unlock_spin(&txr->hn_txlist_spin); 2787 #else 2788 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2789 #endif 2790 2791 if (txd != NULL) { 2792 #ifdef HN_USE_TXDESC_BUFRING 2793 #ifdef HN_DEBUG 2794 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2795 #endif 2796 #endif /* HN_USE_TXDESC_BUFRING */ 2797 KASSERT(txd->m == NULL && txd->refs == 0 && 2798 STAILQ_EMPTY(&txd->agg_list) && 2799 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2800 txd->chim_size == 0 && 2801 (txd->flags & HN_TXD_FLAG_ONLIST) && 2802 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2803 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2804 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2805 txd->refs = 1; 2806 } 2807 return txd; 2808 } 2809 2810 static __inline void 2811 hn_txdesc_hold(struct hn_txdesc *txd) 2812 { 2813 2814 /* 0->1 transition will never work */ 2815 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2816 atomic_add_int(&txd->refs, 1); 2817 } 2818 2819 static __inline void 2820 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2821 { 2822 2823 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2824 ("recursive aggregation on aggregating txdesc")); 2825 2826 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2827 ("already aggregated")); 2828 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2829 ("recursive aggregation on to-be-aggregated txdesc")); 2830 2831 txd->flags |= HN_TXD_FLAG_ONAGG; 2832 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2833 } 2834 2835 static bool 2836 hn_tx_ring_pending(struct hn_tx_ring *txr) 2837 { 2838 bool pending = false; 2839 2840 #ifndef HN_USE_TXDESC_BUFRING 2841 mtx_lock_spin(&txr->hn_txlist_spin); 2842 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2843 pending = true; 2844 mtx_unlock_spin(&txr->hn_txlist_spin); 2845 #else 2846 if (!buf_ring_full(txr->hn_txdesc_br)) 2847 pending = true; 2848 #endif 2849 return (pending); 2850 } 2851 2852 static __inline void 2853 hn_txeof(struct hn_tx_ring *txr) 2854 { 2855 txr->hn_has_txeof = 0; 2856 txr->hn_txeof(txr); 2857 } 2858 2859 static void 2860 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2861 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2862 { 2863 struct hn_txdesc *txd = sndc->hn_cbarg; 2864 struct 
hn_tx_ring *txr; 2865 2866 txr = txd->txr; 2867 KASSERT(txr->hn_chan == chan, 2868 ("channel mismatch, on chan%u, should be chan%u", 2869 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2870 2871 txr->hn_has_txeof = 1; 2872 hn_txdesc_put(txr, txd); 2873 2874 ++txr->hn_txdone_cnt; 2875 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2876 txr->hn_txdone_cnt = 0; 2877 if (txr->hn_oactive) 2878 hn_txeof(txr); 2879 } 2880 } 2881 2882 static void 2883 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2884 { 2885 #if defined(INET) || defined(INET6) 2886 tcp_lro_flush_all(&rxr->hn_lro); 2887 #endif 2888 2889 /* 2890 * NOTE: 2891 * 'txr' could be NULL, if multiple channels and 2892 * ifnet.if_start method are enabled. 2893 */ 2894 if (txr == NULL || !txr->hn_has_txeof) 2895 return; 2896 2897 txr->hn_txdone_cnt = 0; 2898 hn_txeof(txr); 2899 } 2900 2901 static __inline uint32_t 2902 hn_rndis_pktmsg_offset(uint32_t ofs) 2903 { 2904 2905 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2906 ("invalid RNDIS packet msg offset %u", ofs)); 2907 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2908 } 2909 2910 static __inline void * 2911 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2912 size_t pi_dlen, uint32_t pi_type) 2913 { 2914 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2915 struct rndis_pktinfo *pi; 2916 2917 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2918 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2919 2920 /* 2921 * Per-packet-info does not move; it only grows. 2922 * 2923 * NOTE: 2924 * rm_pktinfooffset in this phase counts from the beginning 2925 * of rndis_packet_msg. 2926 */ 2927 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2928 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2929 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2930 pkt->rm_pktinfolen); 2931 pkt->rm_pktinfolen += pi_size; 2932 2933 pi->rm_size = pi_size; 2934 pi->rm_type = pi_type; 2935 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2936 2937 return (pi->rm_data); 2938 } 2939 2940 static __inline int 2941 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2942 { 2943 struct hn_txdesc *txd; 2944 struct mbuf *m; 2945 int error, pkts; 2946 2947 txd = txr->hn_agg_txd; 2948 KASSERT(txd != NULL, ("no aggregate txdesc")); 2949 2950 /* 2951 * Since hn_txpkt() will reset this temporary stat, save 2952 * it now, so that oerrors can be updated properly, if 2953 * hn_txpkt() ever fails. 2954 */ 2955 pkts = txr->hn_stat_pkts; 2956 2957 /* 2958 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2959 * failure, save it for later freeing, if hn_txpkt() ever 2960 * fails. 2961 */ 2962 m = txd->m; 2963 error = hn_txpkt(ifp, txr, txd); 2964 if (__predict_false(error)) { 2965 /* txd is freed, but m is not. */ 2966 m_freem(m); 2967 2968 txr->hn_flush_failed++; 2969 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2970 } 2971 2972 /* Reset all aggregation states. 
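 * (done unconditionally, even when hn_txpkt() failed, so the next
 * packet starts a fresh aggregation instead of chaining onto the
 * already-freed txdesc)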
*/ 2973 txr->hn_agg_txd = NULL; 2974 txr->hn_agg_szleft = 0; 2975 txr->hn_agg_pktleft = 0; 2976 txr->hn_agg_prevpkt = NULL; 2977 2978 return (error); 2979 } 2980 2981 static void * 2982 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2983 int pktsize) 2984 { 2985 void *chim; 2986 2987 if (txr->hn_agg_txd != NULL) { 2988 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2989 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2990 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2991 int olen; 2992 2993 /* 2994 * Update the previous RNDIS packet's total length, 2995 * it can be increased due to the mandatory alignment 2996 * padding for this RNDIS packet. And update the 2997 * aggregating txdesc's chimney sending buffer size 2998 * accordingly. 2999 * 3000 * XXX 3001 * Zero-out the padding, as required by the RNDIS spec. 3002 */ 3003 olen = pkt->rm_len; 3004 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3005 agg_txd->chim_size += pkt->rm_len - olen; 3006 3007 /* Link this txdesc to the parent. */ 3008 hn_txdesc_agg(agg_txd, txd); 3009 3010 chim = (uint8_t *)pkt + pkt->rm_len; 3011 /* Save the current packet for later fixup. */ 3012 txr->hn_agg_prevpkt = chim; 3013 3014 txr->hn_agg_pktleft--; 3015 txr->hn_agg_szleft -= pktsize; 3016 if (txr->hn_agg_szleft <= 3017 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3018 /* 3019 * Probably can't aggregate more packets, 3020 * flush this aggregating txdesc proactively. 3021 */ 3022 txr->hn_agg_pktleft = 0; 3023 } 3024 /* Done! */ 3025 return (chim); 3026 } 3027 hn_flush_txagg(ifp, txr); 3028 } 3029 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3030 3031 txr->hn_tx_chimney_tried++; 3032 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3033 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3034 return (NULL); 3035 txr->hn_tx_chimney++; 3036 3037 chim = txr->hn_sc->hn_chim + 3038 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3039 3040 if (txr->hn_agg_pktmax > 1 && 3041 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3042 txr->hn_agg_txd = txd; 3043 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3044 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3045 txr->hn_agg_prevpkt = chim; 3046 } 3047 return (chim); 3048 } 3049 3050 /* 3051 * NOTE: 3052 * If this function fails, then both txd and m_head0 will be freed. 3053 */ 3054 static int 3055 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3056 struct mbuf **m_head0) 3057 { 3058 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3059 int error, nsegs, i; 3060 struct mbuf *m_head = *m_head0; 3061 struct rndis_packet_msg *pkt; 3062 uint32_t *pi_data; 3063 void *chim = NULL; 3064 int pkt_hlen, pkt_size; 3065 3066 pkt = txd->rndis_pkt; 3067 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3068 if (pkt_size < txr->hn_chim_size) { 3069 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3070 if (chim != NULL) 3071 pkt = chim; 3072 } else { 3073 if (txr->hn_agg_txd != NULL) 3074 hn_flush_txagg(ifp, txr); 3075 } 3076 3077 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3078 pkt->rm_len = m_head->m_pkthdr.len; 3079 pkt->rm_dataoffset = 0; 3080 pkt->rm_datalen = m_head->m_pkthdr.len; 3081 pkt->rm_oobdataoffset = 0; 3082 pkt->rm_oobdatalen = 0; 3083 pkt->rm_oobdataelements = 0; 3084 pkt->rm_pktinfooffset = sizeof(*pkt); 3085 pkt->rm_pktinfolen = 0; 3086 pkt->rm_vchandle = 0; 3087 pkt->rm_reserved = 0; 3088 3089 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3090 /* 3091 * Set the hash value for this packet. 
3092 */ 3093 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3094 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3095 3096 if (M_HASHTYPE_ISHASH(m_head)) 3097 /* 3098 * The flowid field contains the hash value host 3099 * set in the rx queue if it is a ip forwarding pkt. 3100 * Set the same hash value so host can send on the 3101 * cpu it was received. 3102 */ 3103 *pi_data = m_head->m_pkthdr.flowid; 3104 else 3105 /* 3106 * Otherwise just put the tx queue index. 3107 */ 3108 *pi_data = txr->hn_tx_idx; 3109 } 3110 3111 if (m_head->m_flags & M_VLANTAG) { 3112 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3113 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3114 *pi_data = NDIS_VLAN_INFO_MAKE( 3115 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3116 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3117 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3118 } 3119 3120 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3121 #if defined(INET6) || defined(INET) 3122 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3123 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3124 #ifdef INET 3125 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3126 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3127 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3128 m_head->m_pkthdr.tso_segsz); 3129 } 3130 #endif 3131 #if defined(INET6) && defined(INET) 3132 else 3133 #endif 3134 #ifdef INET6 3135 { 3136 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3137 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3138 m_head->m_pkthdr.tso_segsz); 3139 } 3140 #endif 3141 #endif /* INET6 || INET */ 3142 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3143 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3144 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3145 if (m_head->m_pkthdr.csum_flags & 3146 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3147 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3148 } else { 3149 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3150 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3151 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3152 } 3153 3154 if (m_head->m_pkthdr.csum_flags & 3155 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3156 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3157 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3158 } else if (m_head->m_pkthdr.csum_flags & 3159 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3160 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3161 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3162 } 3163 } 3164 3165 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3166 /* Fixup RNDIS packet message total length */ 3167 pkt->rm_len += pkt_hlen; 3168 /* Convert RNDIS packet message offsets */ 3169 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3170 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3171 3172 /* 3173 * Fast path: Chimney sending. 
3174 */ 3175 if (chim != NULL) { 3176 struct hn_txdesc *tgt_txd = txd; 3177 3178 if (txr->hn_agg_txd != NULL) { 3179 tgt_txd = txr->hn_agg_txd; 3180 #ifdef INVARIANTS 3181 *m_head0 = NULL; 3182 #endif 3183 } 3184 3185 KASSERT(pkt == chim, 3186 ("RNDIS pkt not in chimney sending buffer")); 3187 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3188 ("chimney sending buffer is not used")); 3189 tgt_txd->chim_size += pkt->rm_len; 3190 3191 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3192 ((uint8_t *)chim) + pkt_hlen); 3193 3194 txr->hn_gpa_cnt = 0; 3195 txr->hn_sendpkt = hn_txpkt_chim; 3196 goto done; 3197 } 3198 3199 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3200 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3201 ("chimney buffer is used")); 3202 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3203 3204 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3205 if (__predict_false(error)) { 3206 int freed; 3207 3208 /* 3209 * This mbuf is not linked w/ the txd yet, so free it now. 3210 */ 3211 m_freem(m_head); 3212 *m_head0 = NULL; 3213 3214 freed = hn_txdesc_put(txr, txd); 3215 KASSERT(freed != 0, 3216 ("fail to free txd upon txdma error")); 3217 3218 txr->hn_txdma_failed++; 3219 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3220 return error; 3221 } 3222 *m_head0 = m_head; 3223 3224 /* +1 RNDIS packet message */ 3225 txr->hn_gpa_cnt = nsegs + 1; 3226 3227 /* send packet with page buffer */ 3228 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3229 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3230 txr->hn_gpa[0].gpa_len = pkt_hlen; 3231 3232 /* 3233 * Fill the page buffers with mbuf info after the page 3234 * buffer for RNDIS packet message. 3235 */ 3236 for (i = 0; i < nsegs; ++i) { 3237 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3238 3239 gpa->gpa_page = atop(segs[i].ds_addr); 3240 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3241 gpa->gpa_len = segs[i].ds_len; 3242 } 3243 3244 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3245 txd->chim_size = 0; 3246 txr->hn_sendpkt = hn_txpkt_sglist; 3247 done: 3248 txd->m = m_head; 3249 3250 /* Set the completion routine */ 3251 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3252 3253 /* Update temporary stats for later use. */ 3254 txr->hn_stat_pkts++; 3255 txr->hn_stat_size += m_head->m_pkthdr.len; 3256 if (m_head->m_flags & M_MCAST) 3257 txr->hn_stat_mcasts++; 3258 3259 return 0; 3260 } 3261 3262 /* 3263 * NOTE: 3264 * If this function fails, then txd will be freed, but the mbuf 3265 * associated w/ the txd will _not_ be freed. 3266 */ 3267 static int 3268 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3269 { 3270 int error, send_failed = 0, has_bpf; 3271 3272 again: 3273 has_bpf = bpf_peers_present(ifp->if_bpf); 3274 if (has_bpf) { 3275 /* 3276 * Make sure that this txd and any aggregated txds are not 3277 * freed before ETHER_BPF_MTAP. 
3278 */ 3279 hn_txdesc_hold(txd); 3280 } 3281 error = txr->hn_sendpkt(txr, txd); 3282 if (!error) { 3283 if (has_bpf) { 3284 const struct hn_txdesc *tmp_txd; 3285 3286 ETHER_BPF_MTAP(ifp, txd->m); 3287 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3288 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3289 } 3290 3291 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3292 #ifdef HN_IFSTART_SUPPORT 3293 if (!hn_use_if_start) 3294 #endif 3295 { 3296 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3297 txr->hn_stat_size); 3298 if (txr->hn_stat_mcasts != 0) { 3299 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3300 txr->hn_stat_mcasts); 3301 } 3302 } 3303 txr->hn_pkts += txr->hn_stat_pkts; 3304 txr->hn_sends++; 3305 } 3306 if (has_bpf) 3307 hn_txdesc_put(txr, txd); 3308 3309 if (__predict_false(error)) { 3310 int freed; 3311 3312 /* 3313 * This should "really rarely" happen. 3314 * 3315 * XXX Too many RX to be acked or too many sideband 3316 * commands to run? Ask netvsc_channel_rollup() 3317 * to kick start later. 3318 */ 3319 txr->hn_has_txeof = 1; 3320 if (!send_failed) { 3321 txr->hn_send_failed++; 3322 send_failed = 1; 3323 /* 3324 * Try sending again after set hn_has_txeof; 3325 * in case that we missed the last 3326 * netvsc_channel_rollup(). 3327 */ 3328 goto again; 3329 } 3330 if_printf(ifp, "send failed\n"); 3331 3332 /* 3333 * Caller will perform further processing on the 3334 * associated mbuf, so don't free it in hn_txdesc_put(); 3335 * only unload it from the DMA map in hn_txdesc_put(), 3336 * if it was loaded. 3337 */ 3338 txd->m = NULL; 3339 freed = hn_txdesc_put(txr, txd); 3340 KASSERT(freed != 0, 3341 ("fail to free txd upon send error")); 3342 3343 txr->hn_send_failed++; 3344 } 3345 3346 /* Reset temporary stats, after this sending is done. */ 3347 txr->hn_stat_size = 0; 3348 txr->hn_stat_pkts = 0; 3349 txr->hn_stat_mcasts = 0; 3350 3351 return (error); 3352 } 3353 3354 /* 3355 * Append the specified data to the indicated mbuf chain, 3356 * Extend the mbuf chain if the new data does not fit in 3357 * existing space. 3358 * 3359 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3360 * There should be an equivalent in the kernel mbuf code, 3361 * but there does not appear to be one yet. 3362 * 3363 * Differs from m_append() in that additional mbufs are 3364 * allocated with cluster size MJUMPAGESIZE, and filled 3365 * accordingly. 3366 * 3367 * Return the last mbuf in the chain or NULL if failed to 3368 * allocate new mbuf. 3369 */ 3370 static struct mbuf * 3371 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3372 { 3373 struct mbuf *m, *n; 3374 int remainder, space; 3375 3376 for (m = m0; m->m_next != NULL; m = m->m_next) 3377 ; 3378 remainder = len; 3379 space = M_TRAILINGSPACE(m); 3380 if (space > 0) { 3381 /* 3382 * Copy into available space. 3383 */ 3384 if (space > remainder) 3385 space = remainder; 3386 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3387 m->m_len += space; 3388 cp += space; 3389 remainder -= space; 3390 } 3391 while (remainder > 0) { 3392 /* 3393 * Allocate a new mbuf; could check space 3394 * and allocate a cluster instead. 
3395 */ 3396 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3397 if (n == NULL) 3398 return NULL; 3399 n->m_len = min(MJUMPAGESIZE, remainder); 3400 bcopy(cp, mtod(n, caddr_t), n->m_len); 3401 cp += n->m_len; 3402 remainder -= n->m_len; 3403 m->m_next = n; 3404 m = n; 3405 } 3406 3407 return m; 3408 } 3409 3410 #if defined(INET) || defined(INET6) 3411 static __inline int 3412 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3413 { 3414 #if __FreeBSD_version >= 1100095 3415 if (hn_lro_mbufq_depth) { 3416 tcp_lro_queue_mbuf(lc, m); 3417 return 0; 3418 } 3419 #endif 3420 return tcp_lro_rx(lc, m, 0); 3421 } 3422 #endif 3423 3424 static int 3425 hn_rxpkt(struct hn_rx_ring *rxr) 3426 { 3427 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3428 struct mbuf *m_new, *n; 3429 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3430 int hash_type = M_HASHTYPE_NONE; 3431 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3432 int i; 3433 3434 ifp = hn_ifp; 3435 if (rxr->hn_rxvf_ifp != NULL) { 3436 /* 3437 * Non-transparent mode VF; pretend this packet is from 3438 * the VF. 3439 */ 3440 ifp = rxr->hn_rxvf_ifp; 3441 is_vf = 1; 3442 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3443 /* Transparent mode VF. */ 3444 is_vf = 1; 3445 } 3446 3447 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3448 /* 3449 * NOTE: 3450 * See the NOTE of hn_rndis_init_fixat(). This 3451 * function can be reached, immediately after the 3452 * RNDIS is initialized but before the ifnet is 3453 * setup on the hn_attach() path; drop the unexpected 3454 * packets. 3455 */ 3456 return (0); 3457 } 3458 3459 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3460 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3461 return (0); 3462 } 3463 3464 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3465 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3466 if (m_new == NULL) { 3467 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3468 return (0); 3469 } 3470 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3471 rxr->rsc.frag_len[0]); 3472 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3473 } else { 3474 /* 3475 * Get an mbuf with a cluster. For packets 2K or less, 3476 * get a standard 2K cluster. For anything larger, get a 3477 * 4K cluster. Any buffers larger than 4K can cause problems 3478 * if looped around to the Hyper-V TX channel, so avoid them. 
3479 */ 3480 size = MCLBYTES; 3481 if (rxr->rsc.pktlen > MCLBYTES) { 3482 /* 4096 */ 3483 size = MJUMPAGESIZE; 3484 } 3485 3486 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3487 if (m_new == NULL) { 3488 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3489 return (0); 3490 } 3491 3492 n = m_new; 3493 for (i = 0; i < rxr->rsc.cnt; i++) { 3494 n = hv_m_append(n, rxr->rsc.frag_len[i], 3495 rxr->rsc.frag_data[i]); 3496 if (n == NULL) { 3497 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3498 return (0); 3499 } else { 3500 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3501 } 3502 } 3503 } 3504 if (rxr->rsc.pktlen <= MHLEN) 3505 rxr->hn_small_pkts++; 3506 3507 m_new->m_pkthdr.rcvif = ifp; 3508 3509 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3510 do_csum = 0; 3511 3512 /* receive side checksum offload */ 3513 if (rxr->rsc.csum_info != NULL) { 3514 /* IP csum offload */ 3515 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3516 m_new->m_pkthdr.csum_flags |= 3517 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3518 rxr->hn_csum_ip++; 3519 } 3520 3521 /* TCP/UDP csum offload */ 3522 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3523 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3524 m_new->m_pkthdr.csum_flags |= 3525 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3526 m_new->m_pkthdr.csum_data = 0xffff; 3527 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3528 rxr->hn_csum_tcp++; 3529 else 3530 rxr->hn_csum_udp++; 3531 } 3532 3533 /* 3534 * XXX 3535 * As of this write (Oct 28th, 2016), host side will turn 3536 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3537 * the do_lro setting here is actually _not_ accurate. We 3538 * depend on the RSS hash type check to reset do_lro. 3539 */ 3540 if ((*(rxr->rsc.csum_info) & 3541 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3542 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3543 do_lro = 1; 3544 } else { 3545 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3546 if (l3proto == ETHERTYPE_IP) { 3547 if (l4proto == IPPROTO_TCP) { 3548 if (do_csum && 3549 (rxr->hn_trust_hcsum & 3550 HN_TRUST_HCSUM_TCP)) { 3551 rxr->hn_csum_trusted++; 3552 m_new->m_pkthdr.csum_flags |= 3553 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3554 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3555 m_new->m_pkthdr.csum_data = 0xffff; 3556 } 3557 do_lro = 1; 3558 } else if (l4proto == IPPROTO_UDP) { 3559 if (do_csum && 3560 (rxr->hn_trust_hcsum & 3561 HN_TRUST_HCSUM_UDP)) { 3562 rxr->hn_csum_trusted++; 3563 m_new->m_pkthdr.csum_flags |= 3564 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3565 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3566 m_new->m_pkthdr.csum_data = 0xffff; 3567 } 3568 } else if (l4proto != IPPROTO_DONE && do_csum && 3569 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3570 rxr->hn_csum_trusted++; 3571 m_new->m_pkthdr.csum_flags |= 3572 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3573 } 3574 } 3575 } 3576 3577 if (rxr->rsc.vlan_info != NULL) { 3578 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3579 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3580 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3581 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3582 m_new->m_flags |= M_VLANTAG; 3583 } 3584 3585 /* 3586 * If VF is activated (tranparent/non-transparent mode does not 3587 * matter here). 3588 * 3589 * - Disable LRO 3590 * 3591 * hn(4) will only receive broadcast packets, multicast packets, 3592 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3593 * packet types. 
3594 * 3595 * For non-transparent, we definitely _cannot_ enable LRO at 3596 * all, since the LRO flush will use hn(4) as the receiving 3597 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3598 */ 3599 if (is_vf) 3600 do_lro = 0; 3601 3602 /* 3603 * If VF is activated (tranparent/non-transparent mode does not 3604 * matter here), do _not_ mess with unsupported hash types or 3605 * functions. 3606 */ 3607 if (rxr->rsc.hash_info != NULL) { 3608 rxr->hn_rss_pkts++; 3609 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3610 if (!is_vf) 3611 hash_type = M_HASHTYPE_OPAQUE_HASH; 3612 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3613 NDIS_HASH_FUNCTION_TOEPLITZ) { 3614 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3615 rxr->hn_mbuf_hash); 3616 3617 /* 3618 * NOTE: 3619 * do_lro is resetted, if the hash types are not TCP 3620 * related. See the comment in the above csum_flags 3621 * setup section. 3622 */ 3623 switch (type) { 3624 case NDIS_HASH_IPV4: 3625 hash_type = M_HASHTYPE_RSS_IPV4; 3626 do_lro = 0; 3627 break; 3628 3629 case NDIS_HASH_TCP_IPV4: 3630 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3631 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3632 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3633 3634 if (is_vf) 3635 def_htype = M_HASHTYPE_NONE; 3636 3637 /* 3638 * UDP 4-tuple hash is delivered as 3639 * TCP 4-tuple hash. 3640 */ 3641 if (l3proto == ETHERTYPE_MAX) { 3642 hn_rxpkt_proto(m_new, 3643 &l3proto, &l4proto); 3644 } 3645 if (l3proto == ETHERTYPE_IP) { 3646 if (l4proto == IPPROTO_UDP && 3647 (rxr->hn_mbuf_hash & 3648 NDIS_HASH_UDP_IPV4_X)) { 3649 hash_type = 3650 M_HASHTYPE_RSS_UDP_IPV4; 3651 do_lro = 0; 3652 } else if (l4proto != 3653 IPPROTO_TCP) { 3654 hash_type = def_htype; 3655 do_lro = 0; 3656 } 3657 } else { 3658 hash_type = def_htype; 3659 do_lro = 0; 3660 } 3661 } 3662 break; 3663 3664 case NDIS_HASH_IPV6: 3665 hash_type = M_HASHTYPE_RSS_IPV6; 3666 do_lro = 0; 3667 break; 3668 3669 case NDIS_HASH_IPV6_EX: 3670 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3671 do_lro = 0; 3672 break; 3673 3674 case NDIS_HASH_TCP_IPV6: 3675 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3676 break; 3677 3678 case NDIS_HASH_TCP_IPV6_EX: 3679 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3680 break; 3681 } 3682 } 3683 } else if (!is_vf) { 3684 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3685 hash_type = M_HASHTYPE_OPAQUE; 3686 } 3687 M_HASHTYPE_SET(m_new, hash_type); 3688 3689 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3690 if (hn_ifp != ifp) { 3691 const struct ether_header *eh; 3692 3693 /* 3694 * Non-transparent mode VF is activated. 3695 */ 3696 3697 /* 3698 * Allow tapping on hn(4). 3699 */ 3700 ETHER_BPF_MTAP(hn_ifp, m_new); 3701 3702 /* 3703 * Update hn(4)'s stats. 3704 */ 3705 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3706 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3707 /* Checked at the beginning of this function. */ 3708 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3709 eh = mtod(m_new, struct ether_header *); 3710 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3711 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3712 } 3713 rxr->hn_pkts++; 3714 3715 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3716 #if defined(INET) || defined(INET6) 3717 struct lro_ctrl *lro = &rxr->hn_lro; 3718 3719 if (lro->lro_cnt) { 3720 rxr->hn_lro_tried++; 3721 if (hn_lro_rx(lro, m_new) == 0) { 3722 /* DONE! 
*/ 3723 return 0; 3724 } 3725 } 3726 #endif 3727 } 3728 ifp->if_input(ifp, m_new); 3729 3730 return (0); 3731 } 3732 3733 static int 3734 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3735 { 3736 struct hn_softc *sc = ifp->if_softc; 3737 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3738 struct ifnet *vf_ifp; 3739 int mask, error = 0; 3740 struct ifrsskey *ifrk; 3741 struct ifrsshash *ifrh; 3742 uint32_t mtu; 3743 3744 switch (cmd) { 3745 case SIOCSIFMTU: 3746 if (ifr->ifr_mtu > HN_MTU_MAX) { 3747 error = EINVAL; 3748 break; 3749 } 3750 3751 HN_LOCK(sc); 3752 3753 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3754 HN_UNLOCK(sc); 3755 break; 3756 } 3757 3758 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3759 /* Can't change MTU */ 3760 HN_UNLOCK(sc); 3761 error = EOPNOTSUPP; 3762 break; 3763 } 3764 3765 if (ifp->if_mtu == ifr->ifr_mtu) { 3766 HN_UNLOCK(sc); 3767 break; 3768 } 3769 3770 if (hn_xpnt_vf_isready(sc)) { 3771 vf_ifp = sc->hn_vf_ifp; 3772 ifr_vf = *ifr; 3773 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3774 sizeof(ifr_vf.ifr_name)); 3775 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3776 (caddr_t)&ifr_vf); 3777 if (error) { 3778 HN_UNLOCK(sc); 3779 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3780 vf_ifp->if_xname, ifr->ifr_mtu, error); 3781 break; 3782 } 3783 } 3784 3785 /* 3786 * Suspend this interface before the synthetic parts 3787 * are ripped. 3788 */ 3789 hn_suspend(sc); 3790 3791 /* 3792 * Detach the synthetics parts, i.e. NVS and RNDIS. 3793 */ 3794 hn_synth_detach(sc); 3795 3796 /* 3797 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3798 * with the new MTU setting. 3799 */ 3800 error = hn_synth_attach(sc, ifr->ifr_mtu); 3801 if (error) { 3802 HN_UNLOCK(sc); 3803 break; 3804 } 3805 3806 error = hn_rndis_get_mtu(sc, &mtu); 3807 if (error) 3808 mtu = ifr->ifr_mtu; 3809 else if (bootverbose) 3810 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3811 3812 /* 3813 * Commit the requested MTU, after the synthetic parts 3814 * have been successfully attached. 3815 */ 3816 if (mtu >= ifr->ifr_mtu) { 3817 mtu = ifr->ifr_mtu; 3818 } else { 3819 if_printf(ifp, "fixup mtu %d -> %u\n", 3820 ifr->ifr_mtu, mtu); 3821 } 3822 ifp->if_mtu = mtu; 3823 3824 /* 3825 * Synthetic parts' reattach may change the chimney 3826 * sending size; update it. 3827 */ 3828 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3829 hn_set_chim_size(sc, sc->hn_chim_szmax); 3830 3831 /* 3832 * Make sure that various parameters based on MTU are 3833 * still valid, after the MTU change. 3834 */ 3835 hn_mtu_change_fixup(sc); 3836 3837 /* 3838 * All done! Resume the interface now. 3839 */ 3840 hn_resume(sc); 3841 3842 if ((sc->hn_flags & HN_FLAG_RXVF) || 3843 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3844 /* 3845 * Since we have reattached the NVS part, 3846 * change the datapath to VF again; in case 3847 * that it is lost, after the NVS was detached. 3848 */ 3849 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3850 } 3851 3852 HN_UNLOCK(sc); 3853 break; 3854 3855 case SIOCSIFFLAGS: 3856 HN_LOCK(sc); 3857 3858 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3859 HN_UNLOCK(sc); 3860 break; 3861 } 3862 3863 if (hn_xpnt_vf_isready(sc)) 3864 hn_xpnt_vf_saveifflags(sc); 3865 3866 if (ifp->if_flags & IFF_UP) { 3867 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3868 /* 3869 * Caller meight hold mutex, e.g. 3870 * bpf; use busy-wait for the RNDIS 3871 * reply. 
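 * (HN_NO_SLEEPING() puts the softc into its no-sleeping mode, i.e. the
 * RNDIS control path busy-waits for the reply instead of sleeping,
 * which would not be safe under a non-sleepable lock)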
3872 */ 3873 HN_NO_SLEEPING(sc); 3874 hn_rxfilter_config(sc); 3875 HN_SLEEPING_OK(sc); 3876 3877 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3878 error = hn_xpnt_vf_iocsetflags(sc); 3879 } else { 3880 hn_init_locked(sc); 3881 } 3882 } else { 3883 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3884 hn_stop(sc, false); 3885 } 3886 sc->hn_if_flags = ifp->if_flags; 3887 3888 HN_UNLOCK(sc); 3889 break; 3890 3891 case SIOCSIFCAP: 3892 HN_LOCK(sc); 3893 3894 if (hn_xpnt_vf_isready(sc)) { 3895 ifr_vf = *ifr; 3896 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3897 sizeof(ifr_vf.ifr_name)); 3898 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3899 HN_UNLOCK(sc); 3900 break; 3901 } 3902 3903 /* 3904 * Fix up requested capabilities w/ supported capabilities, 3905 * since the supported capabilities could have been changed. 3906 */ 3907 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3908 ifp->if_capenable; 3909 3910 if (mask & IFCAP_TXCSUM) { 3911 ifp->if_capenable ^= IFCAP_TXCSUM; 3912 if (ifp->if_capenable & IFCAP_TXCSUM) 3913 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3914 else 3915 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3916 } 3917 if (mask & IFCAP_TXCSUM_IPV6) { 3918 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3919 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3920 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3921 else 3922 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3923 } 3924 3925 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3926 if (mask & IFCAP_RXCSUM) 3927 ifp->if_capenable ^= IFCAP_RXCSUM; 3928 #ifdef foo 3929 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3930 if (mask & IFCAP_RXCSUM_IPV6) 3931 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3932 #endif 3933 3934 if (mask & IFCAP_LRO) 3935 ifp->if_capenable ^= IFCAP_LRO; 3936 3937 if (mask & IFCAP_TSO4) { 3938 ifp->if_capenable ^= IFCAP_TSO4; 3939 if (ifp->if_capenable & IFCAP_TSO4) 3940 ifp->if_hwassist |= CSUM_IP_TSO; 3941 else 3942 ifp->if_hwassist &= ~CSUM_IP_TSO; 3943 } 3944 if (mask & IFCAP_TSO6) { 3945 ifp->if_capenable ^= IFCAP_TSO6; 3946 if (ifp->if_capenable & IFCAP_TSO6) 3947 ifp->if_hwassist |= CSUM_IP6_TSO; 3948 else 3949 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3950 } 3951 3952 HN_UNLOCK(sc); 3953 break; 3954 3955 case SIOCADDMULTI: 3956 case SIOCDELMULTI: 3957 HN_LOCK(sc); 3958 3959 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3960 HN_UNLOCK(sc); 3961 break; 3962 } 3963 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3964 /* 3965 * Multicast uses mutex; use busy-wait for 3966 * the RNDIS reply. 3967 */ 3968 HN_NO_SLEEPING(sc); 3969 hn_rxfilter_config(sc); 3970 HN_SLEEPING_OK(sc); 3971 } 3972 3973 /* XXX vlan(4) style mcast addr maintenance */ 3974 if (hn_xpnt_vf_isready(sc)) { 3975 int old_if_flags; 3976 3977 old_if_flags = sc->hn_vf_ifp->if_flags; 3978 hn_xpnt_vf_saveifflags(sc); 3979 3980 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3981 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3982 IFF_ALLMULTI)) 3983 error = hn_xpnt_vf_iocsetflags(sc); 3984 } 3985 3986 HN_UNLOCK(sc); 3987 break; 3988 3989 case SIOCSIFMEDIA: 3990 case SIOCGIFMEDIA: 3991 HN_LOCK(sc); 3992 if (hn_xpnt_vf_isready(sc)) { 3993 /* 3994 * SIOCGIFMEDIA expects ifmediareq, so don't 3995 * create and pass ifr_vf to the VF here; just 3996 * replace the ifr_name. 3997 */ 3998 vf_ifp = sc->hn_vf_ifp; 3999 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4000 sizeof(ifr->ifr_name)); 4001 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4002 /* Restore the ifr_name. 
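The request was passed to the VF under the VF's name; hand the caller back hn(4)'s own name.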
*/ 4003 strlcpy(ifr->ifr_name, ifp->if_xname, 4004 sizeof(ifr->ifr_name)); 4005 HN_UNLOCK(sc); 4006 break; 4007 } 4008 HN_UNLOCK(sc); 4009 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4010 break; 4011 4012 case SIOCGIFRSSHASH: 4013 ifrh = (struct ifrsshash *)data; 4014 HN_LOCK(sc); 4015 if (sc->hn_rx_ring_inuse == 1) { 4016 HN_UNLOCK(sc); 4017 ifrh->ifrh_func = RSS_FUNC_NONE; 4018 ifrh->ifrh_types = 0; 4019 break; 4020 } 4021 4022 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4023 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4024 else 4025 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4026 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4027 HN_UNLOCK(sc); 4028 break; 4029 4030 case SIOCGIFRSSKEY: 4031 ifrk = (struct ifrsskey *)data; 4032 HN_LOCK(sc); 4033 if (sc->hn_rx_ring_inuse == 1) { 4034 HN_UNLOCK(sc); 4035 ifrk->ifrk_func = RSS_FUNC_NONE; 4036 ifrk->ifrk_keylen = 0; 4037 break; 4038 } 4039 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4040 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4041 else 4042 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4043 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4044 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4045 NDIS_HASH_KEYSIZE_TOEPLITZ); 4046 HN_UNLOCK(sc); 4047 break; 4048 4049 default: 4050 error = ether_ioctl(ifp, cmd, data); 4051 break; 4052 } 4053 return (error); 4054 } 4055 4056 static void 4057 hn_stop(struct hn_softc *sc, bool detaching) 4058 { 4059 struct ifnet *ifp = sc->hn_ifp; 4060 int i; 4061 4062 HN_LOCK_ASSERT(sc); 4063 4064 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4065 ("synthetic parts were not attached")); 4066 4067 /* Clear RUNNING bit ASAP. */ 4068 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4069 4070 /* Disable polling. */ 4071 hn_polling(sc, 0); 4072 4073 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4074 KASSERT(sc->hn_vf_ifp != NULL, 4075 ("%s: VF is not attached", ifp->if_xname)); 4076 4077 /* Mark transparent mode VF as disabled. */ 4078 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4079 4080 /* 4081 * NOTE: 4082 * Datapath setting must happen _before_ bringing 4083 * the VF down. 4084 */ 4085 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4086 4087 /* 4088 * Bring the VF down. 4089 */ 4090 hn_xpnt_vf_saveifflags(sc); 4091 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4092 hn_xpnt_vf_iocsetflags(sc); 4093 } 4094 4095 /* Suspend data transfers. */ 4096 hn_suspend_data(sc); 4097 4098 /* Clear OACTIVE bit. */ 4099 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4100 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4101 sc->hn_tx_ring[i].hn_oactive = 0; 4102 4103 /* 4104 * If the non-transparent mode VF is active, make sure 4105 * that the RX filter still allows packet reception. 4106 */ 4107 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4108 hn_rxfilter_config(sc); 4109 } 4110 4111 static void 4112 hn_init_locked(struct hn_softc *sc) 4113 { 4114 struct ifnet *ifp = sc->hn_ifp; 4115 int i; 4116 4117 HN_LOCK_ASSERT(sc); 4118 4119 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4120 return; 4121 4122 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4123 return; 4124 4125 /* Configure RX filter */ 4126 hn_rxfilter_config(sc); 4127 4128 /* Clear OACTIVE bit. */ 4129 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4130 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4131 sc->hn_tx_ring[i].hn_oactive = 0; 4132 4133 /* Clear TX 'suspended' bit. */ 4134 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4135 4136 if (hn_xpnt_vf_isready(sc)) { 4137 /* Initialize transparent VF. 
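This switches the NVS datapath over to the VF and marks transparent VF mode as enabled.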
*/ 4138 hn_xpnt_vf_init(sc); 4139 } 4140 4141 /* Everything is ready; unleash! */ 4142 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4143 4144 /* Re-enable polling if requested. */ 4145 if (sc->hn_pollhz > 0) 4146 hn_polling(sc, sc->hn_pollhz); 4147 } 4148 4149 static void 4150 hn_init(void *xsc) 4151 { 4152 struct hn_softc *sc = xsc; 4153 4154 HN_LOCK(sc); 4155 hn_init_locked(sc); 4156 HN_UNLOCK(sc); 4157 } 4158 4159 #if __FreeBSD_version >= 1100099 4160 4161 static int 4162 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4163 { 4164 struct hn_softc *sc = arg1; 4165 unsigned int lenlim; 4166 int error; 4167 4168 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4169 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4170 if (error || req->newptr == NULL) 4171 return error; 4172 4173 HN_LOCK(sc); 4174 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4175 lenlim > TCP_LRO_LENGTH_MAX) { 4176 HN_UNLOCK(sc); 4177 return EINVAL; 4178 } 4179 hn_set_lro_lenlim(sc, lenlim); 4180 HN_UNLOCK(sc); 4181 4182 return 0; 4183 } 4184 4185 static int 4186 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4187 { 4188 struct hn_softc *sc = arg1; 4189 int ackcnt, error, i; 4190 4191 /* 4192 * lro_ackcnt_lim is append count limit, 4193 * +1 to turn it into aggregation limit. 4194 */ 4195 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4196 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4197 if (error || req->newptr == NULL) 4198 return error; 4199 4200 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4201 return EINVAL; 4202 4203 /* 4204 * Convert aggregation limit back to append 4205 * count limit. 4206 */ 4207 --ackcnt; 4208 HN_LOCK(sc); 4209 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4210 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4211 HN_UNLOCK(sc); 4212 return 0; 4213 } 4214 4215 #endif 4216 4217 static int 4218 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4219 { 4220 struct hn_softc *sc = arg1; 4221 int hcsum = arg2; 4222 int on, error, i; 4223 4224 on = 0; 4225 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4226 on = 1; 4227 4228 error = sysctl_handle_int(oidp, &on, 0, req); 4229 if (error || req->newptr == NULL) 4230 return error; 4231 4232 HN_LOCK(sc); 4233 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4234 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4235 4236 if (on) 4237 rxr->hn_trust_hcsum |= hcsum; 4238 else 4239 rxr->hn_trust_hcsum &= ~hcsum; 4240 } 4241 HN_UNLOCK(sc); 4242 return 0; 4243 } 4244 4245 static int 4246 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4247 { 4248 struct hn_softc *sc = arg1; 4249 int chim_size, error; 4250 4251 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4252 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4253 if (error || req->newptr == NULL) 4254 return error; 4255 4256 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4257 return EINVAL; 4258 4259 HN_LOCK(sc); 4260 hn_set_chim_size(sc, chim_size); 4261 HN_UNLOCK(sc); 4262 return 0; 4263 } 4264 4265 #if __FreeBSD_version < 1100095 4266 static int 4267 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4268 { 4269 struct hn_softc *sc = arg1; 4270 int ofs = arg2, i, error; 4271 struct hn_rx_ring *rxr; 4272 uint64_t stat; 4273 4274 stat = 0; 4275 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4276 rxr = &sc->hn_rx_ring[i]; 4277 stat += *((int *)((uint8_t *)rxr + ofs)); 4278 } 4279 4280 error = sysctl_handle_64(oidp, &stat, 0, req); 4281 if (error || req->newptr == NULL) 4282 return error; 4283 4284 /* Zero out this stat. 
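Writing any value through this sysctl resets the counter on all RX rings.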
*/ 4285 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4286 rxr = &sc->hn_rx_ring[i]; 4287 *((int *)((uint8_t *)rxr + ofs)) = 0; 4288 } 4289 return 0; 4290 } 4291 #else 4292 static int 4293 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4294 { 4295 struct hn_softc *sc = arg1; 4296 int ofs = arg2, i, error; 4297 struct hn_rx_ring *rxr; 4298 uint64_t stat; 4299 4300 stat = 0; 4301 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4302 rxr = &sc->hn_rx_ring[i]; 4303 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4304 } 4305 4306 error = sysctl_handle_64(oidp, &stat, 0, req); 4307 if (error || req->newptr == NULL) 4308 return error; 4309 4310 /* Zero out this stat. */ 4311 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4312 rxr = &sc->hn_rx_ring[i]; 4313 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4314 } 4315 return 0; 4316 } 4317 4318 #endif 4319 4320 static int 4321 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4322 { 4323 struct hn_softc *sc = arg1; 4324 int ofs = arg2, i, error; 4325 struct hn_rx_ring *rxr; 4326 u_long stat; 4327 4328 stat = 0; 4329 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4330 rxr = &sc->hn_rx_ring[i]; 4331 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4332 } 4333 4334 error = sysctl_handle_long(oidp, &stat, 0, req); 4335 if (error || req->newptr == NULL) 4336 return error; 4337 4338 /* Zero out this stat. */ 4339 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4340 rxr = &sc->hn_rx_ring[i]; 4341 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4342 } 4343 return 0; 4344 } 4345 4346 static int 4347 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4348 { 4349 struct hn_softc *sc = arg1; 4350 int ofs = arg2, i, error; 4351 struct hn_tx_ring *txr; 4352 u_long stat; 4353 4354 stat = 0; 4355 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4356 txr = &sc->hn_tx_ring[i]; 4357 stat += *((u_long *)((uint8_t *)txr + ofs)); 4358 } 4359 4360 error = sysctl_handle_long(oidp, &stat, 0, req); 4361 if (error || req->newptr == NULL) 4362 return error; 4363 4364 /* Zero out this stat. 
*/ 4365 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4366 txr = &sc->hn_tx_ring[i]; 4367 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4368 } 4369 return 0; 4370 } 4371 4372 static int 4373 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4374 { 4375 struct hn_softc *sc = arg1; 4376 int ofs = arg2, i, error, conf; 4377 struct hn_tx_ring *txr; 4378 4379 txr = &sc->hn_tx_ring[0]; 4380 conf = *((int *)((uint8_t *)txr + ofs)); 4381 4382 error = sysctl_handle_int(oidp, &conf, 0, req); 4383 if (error || req->newptr == NULL) 4384 return error; 4385 4386 HN_LOCK(sc); 4387 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4388 txr = &sc->hn_tx_ring[i]; 4389 *((int *)((uint8_t *)txr + ofs)) = conf; 4390 } 4391 HN_UNLOCK(sc); 4392 4393 return 0; 4394 } 4395 4396 static int 4397 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4398 { 4399 struct hn_softc *sc = arg1; 4400 int error, size; 4401 4402 size = sc->hn_agg_size; 4403 error = sysctl_handle_int(oidp, &size, 0, req); 4404 if (error || req->newptr == NULL) 4405 return (error); 4406 4407 HN_LOCK(sc); 4408 sc->hn_agg_size = size; 4409 hn_set_txagg(sc); 4410 HN_UNLOCK(sc); 4411 4412 return (0); 4413 } 4414 4415 static int 4416 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4417 { 4418 struct hn_softc *sc = arg1; 4419 int error, pkts; 4420 4421 pkts = sc->hn_agg_pkts; 4422 error = sysctl_handle_int(oidp, &pkts, 0, req); 4423 if (error || req->newptr == NULL) 4424 return (error); 4425 4426 HN_LOCK(sc); 4427 sc->hn_agg_pkts = pkts; 4428 hn_set_txagg(sc); 4429 HN_UNLOCK(sc); 4430 4431 return (0); 4432 } 4433 4434 static int 4435 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4436 { 4437 struct hn_softc *sc = arg1; 4438 int pkts; 4439 4440 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4441 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4442 } 4443 4444 static int 4445 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4446 { 4447 struct hn_softc *sc = arg1; 4448 int align; 4449 4450 align = sc->hn_tx_ring[0].hn_agg_align; 4451 return (sysctl_handle_int(oidp, &align, 0, req)); 4452 } 4453 4454 static void 4455 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4456 { 4457 if (pollhz == 0) 4458 vmbus_chan_poll_disable(chan); 4459 else 4460 vmbus_chan_poll_enable(chan, pollhz); 4461 } 4462 4463 static void 4464 hn_polling(struct hn_softc *sc, u_int pollhz) 4465 { 4466 int nsubch = sc->hn_rx_ring_inuse - 1; 4467 4468 HN_LOCK_ASSERT(sc); 4469 4470 if (nsubch > 0) { 4471 struct vmbus_channel **subch; 4472 int i; 4473 4474 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4475 for (i = 0; i < nsubch; ++i) 4476 hn_chan_polling(subch[i], pollhz); 4477 vmbus_subchan_rel(subch, nsubch); 4478 } 4479 hn_chan_polling(sc->hn_prichan, pollhz); 4480 } 4481 4482 static int 4483 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4484 { 4485 struct hn_softc *sc = arg1; 4486 int pollhz, error; 4487 4488 pollhz = sc->hn_pollhz; 4489 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4490 if (error || req->newptr == NULL) 4491 return (error); 4492 4493 if (pollhz != 0 && 4494 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4495 return (EINVAL); 4496 4497 HN_LOCK(sc); 4498 if (sc->hn_pollhz != pollhz) { 4499 sc->hn_pollhz = pollhz; 4500 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4501 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4502 hn_polling(sc, sc->hn_pollhz); 4503 } 4504 HN_UNLOCK(sc); 4505 4506 return (0); 4507 } 4508 4509 static int 4510 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4511 { 4512 struct hn_softc *sc = arg1; 4513 char verstr[16]; 4514 4515 snprintf(verstr, sizeof(verstr), "%u.%u", 4516 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4517 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4518 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4519 } 4520 4521 static int 4522 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4523 { 4524 struct hn_softc *sc = arg1; 4525 char caps_str[128]; 4526 uint32_t caps; 4527 4528 HN_LOCK(sc); 4529 caps = sc->hn_caps; 4530 HN_UNLOCK(sc); 4531 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4532 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4533 } 4534 4535 static int 4536 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4537 { 4538 struct hn_softc *sc = arg1; 4539 char assist_str[128]; 4540 uint32_t hwassist; 4541 4542 HN_LOCK(sc); 4543 hwassist = sc->hn_ifp->if_hwassist; 4544 HN_UNLOCK(sc); 4545 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4546 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4547 } 4548 4549 static int 4550 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4551 { 4552 struct hn_softc *sc = arg1; 4553 char filter_str[128]; 4554 uint32_t filter; 4555 4556 HN_LOCK(sc); 4557 filter = sc->hn_rx_filter; 4558 HN_UNLOCK(sc); 4559 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4560 NDIS_PACKET_TYPES); 4561 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4562 } 4563 4564 #ifndef RSS 4565 4566 static int 4567 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4568 { 4569 struct hn_softc *sc = arg1; 4570 int error; 4571 4572 HN_LOCK(sc); 4573 4574 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4575 if (error || req->newptr == NULL) 4576 goto back; 4577 4578 if ((sc->hn_flags & HN_FLAG_RXVF) || 4579 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4580 /* 4581 * RSS key is synchronized w/ VF's, don't allow users 4582 * to change it. 4583 */ 4584 error = EBUSY; 4585 goto back; 4586 } 4587 4588 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4589 if (error) 4590 goto back; 4591 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4592 4593 if (sc->hn_rx_ring_inuse > 1) { 4594 error = hn_rss_reconfig(sc); 4595 } else { 4596 /* Not RSS capable, at least for now; just save the RSS key. */ 4597 error = 0; 4598 } 4599 back: 4600 HN_UNLOCK(sc); 4601 return (error); 4602 } 4603 4604 static int 4605 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4606 { 4607 struct hn_softc *sc = arg1; 4608 int error; 4609 4610 HN_LOCK(sc); 4611 4612 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4613 if (error || req->newptr == NULL) 4614 goto back; 4615 4616 /* 4617 * Don't allow RSS indirect table change, if this interface is not 4618 * RSS capable currently. 
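With only one RX ring in use there is no RSS, so the indirect table has nothing to map.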
4619 */ 4620 if (sc->hn_rx_ring_inuse == 1) { 4621 error = EOPNOTSUPP; 4622 goto back; 4623 } 4624 4625 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4626 if (error) 4627 goto back; 4628 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4629 4630 hn_rss_ind_fixup(sc); 4631 error = hn_rss_reconfig(sc); 4632 back: 4633 HN_UNLOCK(sc); 4634 return (error); 4635 } 4636 4637 #endif /* !RSS */ 4638 4639 static int 4640 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4641 { 4642 struct hn_softc *sc = arg1; 4643 char hash_str[128]; 4644 uint32_t hash; 4645 4646 HN_LOCK(sc); 4647 hash = sc->hn_rss_hash; 4648 HN_UNLOCK(sc); 4649 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4650 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4651 } 4652 4653 static int 4654 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4655 { 4656 struct hn_softc *sc = arg1; 4657 char hash_str[128]; 4658 uint32_t hash; 4659 4660 HN_LOCK(sc); 4661 hash = sc->hn_rss_hcap; 4662 HN_UNLOCK(sc); 4663 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4664 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4665 } 4666 4667 static int 4668 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4669 { 4670 struct hn_softc *sc = arg1; 4671 char hash_str[128]; 4672 uint32_t hash; 4673 4674 HN_LOCK(sc); 4675 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4676 HN_UNLOCK(sc); 4677 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4678 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4679 } 4680 4681 static int 4682 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4683 { 4684 struct hn_softc *sc = arg1; 4685 char vf_name[IFNAMSIZ + 1]; 4686 struct ifnet *vf_ifp; 4687 4688 HN_LOCK(sc); 4689 vf_name[0] = '\0'; 4690 vf_ifp = sc->hn_vf_ifp; 4691 if (vf_ifp != NULL) 4692 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4693 HN_UNLOCK(sc); 4694 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4695 } 4696 4697 static int 4698 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4699 { 4700 struct hn_softc *sc = arg1; 4701 char vf_name[IFNAMSIZ + 1]; 4702 struct ifnet *vf_ifp; 4703 4704 HN_LOCK(sc); 4705 vf_name[0] = '\0'; 4706 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4707 if (vf_ifp != NULL) 4708 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4709 HN_UNLOCK(sc); 4710 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4711 } 4712 4713 static int 4714 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4715 { 4716 struct rm_priotracker pt; 4717 struct sbuf *sb; 4718 int error, i; 4719 bool first; 4720 4721 error = sysctl_wire_old_buffer(req, 0); 4722 if (error != 0) 4723 return (error); 4724 4725 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4726 if (sb == NULL) 4727 return (ENOMEM); 4728 4729 rm_rlock(&hn_vfmap_lock, &pt); 4730 4731 first = true; 4732 for (i = 0; i < hn_vfmap_size; ++i) { 4733 struct ifnet *ifp; 4734 4735 if (hn_vfmap[i] == NULL) 4736 continue; 4737 4738 ifp = ifnet_byindex(i); 4739 if (ifp != NULL) { 4740 if (first) 4741 sbuf_printf(sb, "%s", ifp->if_xname); 4742 else 4743 sbuf_printf(sb, " %s", ifp->if_xname); 4744 first = false; 4745 } 4746 } 4747 4748 rm_runlock(&hn_vfmap_lock, &pt); 4749 4750 error = sbuf_finish(sb); 4751 sbuf_delete(sb); 4752 return (error); 4753 } 4754 4755 static int 4756 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4757 { 4758 struct rm_priotracker pt; 4759 struct sbuf *sb; 4760 int error, i; 4761 bool first; 4762 4763 error = sysctl_wire_old_buffer(req, 0); 4764 if (error != 0) 4765 return (error); 4766 4767 sb = 
sbuf_new_for_sysctl(NULL, NULL, 128, req); 4768 if (sb == NULL) 4769 return (ENOMEM); 4770 4771 rm_rlock(&hn_vfmap_lock, &pt); 4772 4773 first = true; 4774 for (i = 0; i < hn_vfmap_size; ++i) { 4775 struct ifnet *ifp, *hn_ifp; 4776 4777 hn_ifp = hn_vfmap[i]; 4778 if (hn_ifp == NULL) 4779 continue; 4780 4781 ifp = ifnet_byindex(i); 4782 if (ifp != NULL) { 4783 if (first) { 4784 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4785 hn_ifp->if_xname); 4786 } else { 4787 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4788 hn_ifp->if_xname); 4789 } 4790 first = false; 4791 } 4792 } 4793 4794 rm_runlock(&hn_vfmap_lock, &pt); 4795 4796 error = sbuf_finish(sb); 4797 sbuf_delete(sb); 4798 return (error); 4799 } 4800 4801 static int 4802 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4803 { 4804 struct hn_softc *sc = arg1; 4805 int error, onoff = 0; 4806 4807 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4808 onoff = 1; 4809 error = sysctl_handle_int(oidp, &onoff, 0, req); 4810 if (error || req->newptr == NULL) 4811 return (error); 4812 4813 HN_LOCK(sc); 4814 /* NOTE: hn_vf_lock for hn_transmit() */ 4815 rm_wlock(&sc->hn_vf_lock); 4816 if (onoff) 4817 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4818 else 4819 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4820 rm_wunlock(&sc->hn_vf_lock); 4821 HN_UNLOCK(sc); 4822 4823 return (0); 4824 } 4825 4826 static int 4827 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4828 { 4829 struct hn_softc *sc = arg1; 4830 int enabled = 0; 4831 4832 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4833 enabled = 1; 4834 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4835 } 4836 4837 static int 4838 hn_check_iplen(const struct mbuf *m, int hoff) 4839 { 4840 const struct ip *ip; 4841 int len, iphlen, iplen; 4842 const struct tcphdr *th; 4843 int thoff; /* TCP data offset */ 4844 4845 len = hoff + sizeof(struct ip); 4846 4847 /* The packet must be at least the size of an IP header. */ 4848 if (m->m_pkthdr.len < len) 4849 return IPPROTO_DONE; 4850 4851 /* The fixed IP header must reside completely in the first mbuf. */ 4852 if (m->m_len < len) 4853 return IPPROTO_DONE; 4854 4855 ip = mtodo(m, hoff); 4856 4857 /* Bound check the packet's stated IP header length. */ 4858 iphlen = ip->ip_hl << 2; 4859 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4860 return IPPROTO_DONE; 4861 4862 /* The full IP header must reside completely in the one mbuf. */ 4863 if (m->m_len < hoff + iphlen) 4864 return IPPROTO_DONE; 4865 4866 iplen = ntohs(ip->ip_len); 4867 4868 /* 4869 * Check that the amount of data in the buffers is as 4870 * at least much as the IP header would have us expect. 4871 */ 4872 if (m->m_pkthdr.len < hoff + iplen) 4873 return IPPROTO_DONE; 4874 4875 /* 4876 * Ignore IP fragments. 4877 */ 4878 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4879 return IPPROTO_DONE; 4880 4881 /* 4882 * The TCP/IP or UDP/IP header must be entirely contained within 4883 * the first fragment of a packet. 
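Otherwise the header fields examined below could not be read directly from the first mbuf.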
4884 */ 4885 switch (ip->ip_p) { 4886 case IPPROTO_TCP: 4887 if (iplen < iphlen + sizeof(struct tcphdr)) 4888 return IPPROTO_DONE; 4889 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4890 return IPPROTO_DONE; 4891 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4892 thoff = th->th_off << 2; 4893 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4894 return IPPROTO_DONE; 4895 if (m->m_len < hoff + iphlen + thoff) 4896 return IPPROTO_DONE; 4897 break; 4898 case IPPROTO_UDP: 4899 if (iplen < iphlen + sizeof(struct udphdr)) 4900 return IPPROTO_DONE; 4901 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4902 return IPPROTO_DONE; 4903 break; 4904 default: 4905 if (iplen < iphlen) 4906 return IPPROTO_DONE; 4907 break; 4908 } 4909 return ip->ip_p; 4910 } 4911 4912 static void 4913 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4914 { 4915 const struct ether_header *eh; 4916 uint16_t etype; 4917 int hoff; 4918 4919 hoff = sizeof(*eh); 4920 /* Checked at the beginning of this function. */ 4921 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4922 4923 eh = mtod(m_new, const struct ether_header *); 4924 etype = ntohs(eh->ether_type); 4925 if (etype == ETHERTYPE_VLAN) { 4926 const struct ether_vlan_header *evl; 4927 4928 hoff = sizeof(*evl); 4929 if (m_new->m_len < hoff) 4930 return; 4931 evl = mtod(m_new, const struct ether_vlan_header *); 4932 etype = ntohs(evl->evl_proto); 4933 } 4934 *l3proto = etype; 4935 4936 if (etype == ETHERTYPE_IP) 4937 *l4proto = hn_check_iplen(m_new, hoff); 4938 else 4939 *l4proto = IPPROTO_DONE; 4940 } 4941 4942 static int 4943 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4944 { 4945 struct sysctl_oid_list *child; 4946 struct sysctl_ctx_list *ctx; 4947 device_t dev = sc->hn_dev; 4948 #if defined(INET) || defined(INET6) 4949 #if __FreeBSD_version >= 1100095 4950 int lroent_cnt; 4951 #endif 4952 #endif 4953 int i; 4954 4955 /* 4956 * Create RXBUF for reception. 4957 * 4958 * NOTE: 4959 * - It is shared by all channels. 4960 * - A large enough buffer is allocated, certain version of NVSes 4961 * may further limit the usable space. 
4962 */ 4963 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4964 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4965 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4966 if (sc->hn_rxbuf == NULL) { 4967 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4968 return (ENOMEM); 4969 } 4970 4971 sc->hn_rx_ring_cnt = ring_cnt; 4972 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4973 4974 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4975 M_DEVBUF, M_WAITOK | M_ZERO); 4976 4977 #if defined(INET) || defined(INET6) 4978 #if __FreeBSD_version >= 1100095 4979 lroent_cnt = hn_lro_entry_count; 4980 if (lroent_cnt < TCP_LRO_ENTRIES) 4981 lroent_cnt = TCP_LRO_ENTRIES; 4982 if (bootverbose) 4983 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4984 #endif 4985 #endif /* INET || INET6 */ 4986 4987 ctx = device_get_sysctl_ctx(dev); 4988 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4989 4990 /* Create dev.hn.UNIT.rx sysctl tree */ 4991 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4992 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4993 4994 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4995 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4996 4997 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4998 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4999 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5000 if (rxr->hn_br == NULL) { 5001 device_printf(dev, "allocate bufring failed\n"); 5002 return (ENOMEM); 5003 } 5004 5005 if (hn_trust_hosttcp) 5006 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5007 if (hn_trust_hostudp) 5008 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5009 if (hn_trust_hostip) 5010 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5011 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5012 rxr->hn_ifp = sc->hn_ifp; 5013 if (i < sc->hn_tx_ring_cnt) 5014 rxr->hn_txr = &sc->hn_tx_ring[i]; 5015 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5016 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5017 rxr->hn_rx_idx = i; 5018 rxr->hn_rxbuf = sc->hn_rxbuf; 5019 5020 /* 5021 * Initialize LRO. 
5022 */ 5023 #if defined(INET) || defined(INET6) 5024 #if __FreeBSD_version >= 1100095 5025 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5026 hn_lro_mbufq_depth); 5027 #else 5028 tcp_lro_init(&rxr->hn_lro); 5029 rxr->hn_lro.ifp = sc->hn_ifp; 5030 #endif 5031 #if __FreeBSD_version >= 1100099 5032 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5033 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5034 #endif 5035 #endif /* INET || INET6 */ 5036 5037 if (sc->hn_rx_sysctl_tree != NULL) { 5038 char name[16]; 5039 5040 /* 5041 * Create per RX ring sysctl tree: 5042 * dev.hn.UNIT.rx.RINGID 5043 */ 5044 snprintf(name, sizeof(name), "%d", i); 5045 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5046 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5047 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5048 5049 if (rxr->hn_rx_sysctl_tree != NULL) { 5050 SYSCTL_ADD_ULONG(ctx, 5051 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5052 OID_AUTO, "packets", CTLFLAG_RW, 5053 &rxr->hn_pkts, "# of packets received"); 5054 SYSCTL_ADD_ULONG(ctx, 5055 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5056 OID_AUTO, "rss_pkts", CTLFLAG_RW, 5057 &rxr->hn_rss_pkts, 5058 "# of packets w/ RSS info received"); 5059 SYSCTL_ADD_ULONG(ctx, 5060 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5061 OID_AUTO, "rsc_pkts", CTLFLAG_RW, 5062 &rxr->hn_rsc_pkts, 5063 "# of RSC packets received"); 5064 SYSCTL_ADD_ULONG(ctx, 5065 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5066 OID_AUTO, "rsc_drop", CTLFLAG_RW, 5067 &rxr->hn_rsc_drop, 5068 "# of RSC fragments dropped"); 5069 SYSCTL_ADD_INT(ctx, 5070 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5071 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5072 &rxr->hn_pktbuf_len, 0, 5073 "Temporary channel packet buffer length"); 5074 } 5075 } 5076 } 5077 5078 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5079 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5080 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5081 #if __FreeBSD_version < 1100095 5082 hn_rx_stat_int_sysctl, 5083 #else 5084 hn_rx_stat_u64_sysctl, 5085 #endif 5086 "LU", "LRO queued"); 5087 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5088 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5089 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5090 #if __FreeBSD_version < 1100095 5091 hn_rx_stat_int_sysctl, 5092 #else 5093 hn_rx_stat_u64_sysctl, 5094 #endif 5095 "LU", "LRO flushed"); 5096 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5097 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5098 __offsetof(struct hn_rx_ring, hn_lro_tried), 5099 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5100 #if __FreeBSD_version >= 1100099 5101 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5102 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5103 hn_lro_lenlim_sysctl, "IU", 5104 "Max # of data bytes to be aggregated by LRO"); 5105 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5106 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5107 hn_lro_ackcnt_sysctl, "I", 5108 "Max # of ACKs to be aggregated by LRO"); 5109 #endif 5110 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5111 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5112 hn_trust_hcsum_sysctl, "I", 5113 "Trust tcp segement verification on host side, " 5114 "when csum info is missing"); 5115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5116 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5117 hn_trust_hcsum_sysctl, "I", 5118 "Trust udp datagram verification on host side, " 5119 "when csum info is missing"); 5120 
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5121 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5122 hn_trust_hcsum_sysctl, "I", 5123 "Trust ip packet verification on host side, " 5124 "when csum info is missing"); 5125 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5126 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5127 __offsetof(struct hn_rx_ring, hn_csum_ip), 5128 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5129 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5130 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5131 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5132 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5133 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5134 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5135 __offsetof(struct hn_rx_ring, hn_csum_udp), 5136 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5137 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5138 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5139 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5140 hn_rx_stat_ulong_sysctl, "LU", 5141 "# of packets that we trust host's csum verification"); 5142 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5143 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5144 __offsetof(struct hn_rx_ring, hn_small_pkts), 5145 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5146 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5147 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5148 __offsetof(struct hn_rx_ring, hn_ack_failed), 5149 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5150 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5151 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5152 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5153 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5154 5155 return (0); 5156 } 5157 5158 static void 5159 hn_destroy_rx_data(struct hn_softc *sc) 5160 { 5161 int i; 5162 5163 if (sc->hn_rxbuf != NULL) { 5164 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5165 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5166 else 5167 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5168 sc->hn_rxbuf = NULL; 5169 } 5170 5171 if (sc->hn_rx_ring_cnt == 0) 5172 return; 5173 5174 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5175 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5176 5177 if (rxr->hn_br == NULL) 5178 continue; 5179 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5180 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5181 } else { 5182 device_printf(sc->hn_dev, 5183 "%dth channel bufring is referenced", i); 5184 } 5185 rxr->hn_br = NULL; 5186 5187 #if defined(INET) || defined(INET6) 5188 tcp_lro_free(&rxr->hn_lro); 5189 #endif 5190 free(rxr->hn_pktbuf, M_DEVBUF); 5191 } 5192 free(sc->hn_rx_ring, M_DEVBUF); 5193 sc->hn_rx_ring = NULL; 5194 5195 sc->hn_rx_ring_cnt = 0; 5196 sc->hn_rx_ring_inuse = 0; 5197 } 5198 5199 static int 5200 hn_tx_ring_create(struct hn_softc *sc, int id) 5201 { 5202 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5203 device_t dev = sc->hn_dev; 5204 bus_dma_tag_t parent_dtag; 5205 int error, i; 5206 5207 txr->hn_sc = sc; 5208 txr->hn_tx_idx = id; 5209 5210 #ifndef HN_USE_TXDESC_BUFRING 5211 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5212 #endif 5213 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5214 5215 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5216 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5217 M_DEVBUF, M_WAITOK | M_ZERO); 5218 #ifndef HN_USE_TXDESC_BUFRING 5219 
SLIST_INIT(&txr->hn_txlist); 5220 #else 5221 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5222 M_WAITOK, &txr->hn_tx_lock); 5223 #endif 5224 5225 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5226 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5227 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5228 } else { 5229 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5230 } 5231 5232 #ifdef HN_IFSTART_SUPPORT 5233 if (hn_use_if_start) { 5234 txr->hn_txeof = hn_start_txeof; 5235 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5236 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5237 } else 5238 #endif 5239 { 5240 int br_depth; 5241 5242 txr->hn_txeof = hn_xmit_txeof; 5243 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5244 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5245 5246 br_depth = hn_get_txswq_depth(txr); 5247 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5248 M_WAITOK, &txr->hn_tx_lock); 5249 } 5250 5251 txr->hn_direct_tx_size = hn_direct_tx_size; 5252 5253 /* 5254 * Always schedule transmission instead of trying to do direct 5255 * transmission. This one gives the best performance so far. 5256 */ 5257 txr->hn_sched_tx = 1; 5258 5259 parent_dtag = bus_get_dma_tag(dev); 5260 5261 /* DMA tag for RNDIS packet messages. */ 5262 error = bus_dma_tag_create(parent_dtag, /* parent */ 5263 HN_RNDIS_PKT_ALIGN, /* alignment */ 5264 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5265 BUS_SPACE_MAXADDR, /* lowaddr */ 5266 BUS_SPACE_MAXADDR, /* highaddr */ 5267 NULL, NULL, /* filter, filterarg */ 5268 HN_RNDIS_PKT_LEN, /* maxsize */ 5269 1, /* nsegments */ 5270 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5271 0, /* flags */ 5272 NULL, /* lockfunc */ 5273 NULL, /* lockfuncarg */ 5274 &txr->hn_tx_rndis_dtag); 5275 if (error) { 5276 device_printf(dev, "failed to create rndis dmatag\n"); 5277 return error; 5278 } 5279 5280 /* DMA tag for data. */ 5281 error = bus_dma_tag_create(parent_dtag, /* parent */ 5282 1, /* alignment */ 5283 HN_TX_DATA_BOUNDARY, /* boundary */ 5284 BUS_SPACE_MAXADDR, /* lowaddr */ 5285 BUS_SPACE_MAXADDR, /* highaddr */ 5286 NULL, NULL, /* filter, filterarg */ 5287 HN_TX_DATA_MAXSIZE, /* maxsize */ 5288 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5289 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5290 0, /* flags */ 5291 NULL, /* lockfunc */ 5292 NULL, /* lockfuncarg */ 5293 &txr->hn_tx_data_dtag); 5294 if (error) { 5295 device_printf(dev, "failed to create data dmatag\n"); 5296 return error; 5297 } 5298 5299 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5300 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5301 5302 txd->txr = txr; 5303 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5304 STAILQ_INIT(&txd->agg_list); 5305 5306 /* 5307 * Allocate and load RNDIS packet message. 5308 */ 5309 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5310 (void **)&txd->rndis_pkt, 5311 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5312 &txd->rndis_pkt_dmap); 5313 if (error) { 5314 device_printf(dev, 5315 "failed to allocate rndis_packet_msg, %d\n", i); 5316 return error; 5317 } 5318 5319 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5320 txd->rndis_pkt_dmap, 5321 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5322 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5323 BUS_DMA_NOWAIT); 5324 if (error) { 5325 device_printf(dev, 5326 "failed to load rndis_packet_msg, %d\n", i); 5327 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5328 txd->rndis_pkt, txd->rndis_pkt_dmap); 5329 return error; 5330 } 5331 5332 /* DMA map for TX data. 
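The map is created once here and loaded per packet at transmit time.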
*/ 5333 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5334 &txd->data_dmap); 5335 if (error) { 5336 device_printf(dev, 5337 "failed to allocate tx data dmamap\n"); 5338 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5339 txd->rndis_pkt_dmap); 5340 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5341 txd->rndis_pkt, txd->rndis_pkt_dmap); 5342 return error; 5343 } 5344 5345 /* All set, put it to list */ 5346 txd->flags |= HN_TXD_FLAG_ONLIST; 5347 #ifndef HN_USE_TXDESC_BUFRING 5348 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5349 #else 5350 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5351 #endif 5352 } 5353 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5354 5355 if (sc->hn_tx_sysctl_tree != NULL) { 5356 struct sysctl_oid_list *child; 5357 struct sysctl_ctx_list *ctx; 5358 char name[16]; 5359 5360 /* 5361 * Create per TX ring sysctl tree: 5362 * dev.hn.UNIT.tx.RINGID 5363 */ 5364 ctx = device_get_sysctl_ctx(dev); 5365 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5366 5367 snprintf(name, sizeof(name), "%d", id); 5368 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5369 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5370 5371 if (txr->hn_tx_sysctl_tree != NULL) { 5372 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5373 5374 #ifdef HN_DEBUG 5375 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5376 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5377 "# of available TX descs"); 5378 #endif 5379 #ifdef HN_IFSTART_SUPPORT 5380 if (!hn_use_if_start) 5381 #endif 5382 { 5383 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5384 CTLFLAG_RD, &txr->hn_oactive, 0, 5385 "over active"); 5386 } 5387 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5388 CTLFLAG_RW, &txr->hn_pkts, 5389 "# of packets transmitted"); 5390 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5391 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5392 } 5393 } 5394 5395 return 0; 5396 } 5397 5398 static void 5399 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5400 { 5401 struct hn_tx_ring *txr = txd->txr; 5402 5403 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5404 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5405 5406 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5407 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5408 txd->rndis_pkt_dmap); 5409 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5410 } 5411 5412 static void 5413 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5414 { 5415 5416 KASSERT(txd->refs == 0 || txd->refs == 1, 5417 ("invalid txd refs %d", txd->refs)); 5418 5419 /* Aggregated txds will be freed by their aggregating txd. */ 5420 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5421 int freed; 5422 5423 freed = hn_txdesc_put(txr, txd); 5424 KASSERT(freed, ("can't free txdesc")); 5425 } 5426 } 5427 5428 static void 5429 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5430 { 5431 int i; 5432 5433 if (txr->hn_txdesc == NULL) 5434 return; 5435 5436 /* 5437 * NOTE: 5438 * Because the freeing of aggregated txds will be deferred 5439 * to the aggregating txd, two passes are used here: 5440 * - The first pass GCes any pending txds. This GC is necessary, 5441 * since if the channels are revoked, hypervisor will not 5442 * deliver send-done for all pending txds. 5443 * - The second pass frees the busdma stuffs, i.e. after all txds 5444 * were freed. 
5445 */ 5446 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5447 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5448 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5449 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5450 5451 if (txr->hn_tx_data_dtag != NULL) 5452 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5453 if (txr->hn_tx_rndis_dtag != NULL) 5454 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5455 5456 #ifdef HN_USE_TXDESC_BUFRING 5457 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5458 #endif 5459 5460 free(txr->hn_txdesc, M_DEVBUF); 5461 txr->hn_txdesc = NULL; 5462 5463 if (txr->hn_mbuf_br != NULL) 5464 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5465 5466 #ifndef HN_USE_TXDESC_BUFRING 5467 mtx_destroy(&txr->hn_txlist_spin); 5468 #endif 5469 mtx_destroy(&txr->hn_tx_lock); 5470 } 5471 5472 static int 5473 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5474 { 5475 struct sysctl_oid_list *child; 5476 struct sysctl_ctx_list *ctx; 5477 int i; 5478 5479 /* 5480 * Create TXBUF for chimney sending. 5481 * 5482 * NOTE: It is shared by all channels. 5483 */ 5484 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5485 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5486 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5487 if (sc->hn_chim == NULL) { 5488 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5489 return (ENOMEM); 5490 } 5491 5492 sc->hn_tx_ring_cnt = ring_cnt; 5493 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5494 5495 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5496 M_DEVBUF, M_WAITOK | M_ZERO); 5497 5498 ctx = device_get_sysctl_ctx(sc->hn_dev); 5499 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5500 5501 /* Create dev.hn.UNIT.tx sysctl tree */ 5502 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5503 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5504 5505 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5506 int error; 5507 5508 error = hn_tx_ring_create(sc, i); 5509 if (error) 5510 return error; 5511 } 5512 5513 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5514 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5515 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5516 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5517 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5518 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5519 __offsetof(struct hn_tx_ring, hn_send_failed), 5520 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5521 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5522 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5523 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5524 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5525 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5526 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5527 __offsetof(struct hn_tx_ring, hn_flush_failed), 5528 hn_tx_stat_ulong_sysctl, "LU", 5529 "# of packet transmission aggregation flush failure"); 5530 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5531 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5532 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5533 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5534 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5535 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5536 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5537 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5538 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5539 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5540 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5541 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5542 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5543 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5544 "# of total TX descs"); 5545 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5546 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5547 "Chimney send packet size upper boundary"); 5548 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5549 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5550 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5551 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5552 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5553 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5554 hn_tx_conf_int_sysctl, "I", 5555 "Size of the packet for direct transmission"); 5556 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5557 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5558 __offsetof(struct hn_tx_ring, hn_sched_tx), 5559 hn_tx_conf_int_sysctl, "I", 5560 "Always schedule transmission " 5561 "instead of doing direct transmission"); 5562 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5563 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5564 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5565 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5566 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5567 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5568 "Applied packet transmission aggregation size"); 5569 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5570 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5571 hn_txagg_pktmax_sysctl, "I", 5572 "Applied packet transmission aggregation packets"); 5573 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5574 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5575 hn_txagg_align_sysctl, "I", 5576 "Applied packet transmission aggregation alignment"); 5577 5578 return 0; 5579 } 5580 5581 static void 5582 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5583 { 5584 int i; 5585 5586 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5587 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5588 } 5589 5590 static void 5591 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5592 { 5593 struct ifnet *ifp = sc->hn_ifp; 5594 u_int hw_tsomax; 5595 int tso_minlen; 5596 5597 HN_LOCK_ASSERT(sc); 5598 5599 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5600 return; 5601 5602 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5603 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5604 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5605 5606 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5607 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5608 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5609 5610 if (tso_maxlen < tso_minlen) 5611 tso_maxlen = tso_minlen; 5612 else if (tso_maxlen > IP_MAXPACKET) 5613 tso_maxlen = IP_MAXPACKET; 5614 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5615 tso_maxlen = sc->hn_ndis_tso_szmax; 5616 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5617 5618 if (hn_xpnt_vf_isready(sc)) { 5619 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5620 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5621 } 5622 ifp->if_hw_tsomax = hw_tsomax; 5623 if (bootverbose) 5624 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5625 } 5626 5627 static void 5628 hn_fixup_tx_data(struct hn_softc *sc) 5629 { 5630 uint64_t csum_assist; 5631 int i; 5632 5633 hn_set_chim_size(sc, sc->hn_chim_szmax); 5634 if (hn_tx_chimney_size > 0 && 5635 hn_tx_chimney_size < sc->hn_chim_szmax) 5636 hn_set_chim_size(sc, 
hn_tx_chimney_size); 5637 5638 csum_assist = 0; 5639 if (sc->hn_caps & HN_CAP_IPCS) 5640 csum_assist |= CSUM_IP; 5641 if (sc->hn_caps & HN_CAP_TCP4CS) 5642 csum_assist |= CSUM_IP_TCP; 5643 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5644 csum_assist |= CSUM_IP_UDP; 5645 if (sc->hn_caps & HN_CAP_TCP6CS) 5646 csum_assist |= CSUM_IP6_TCP; 5647 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5648 csum_assist |= CSUM_IP6_UDP; 5649 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5650 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5651 5652 if (sc->hn_caps & HN_CAP_HASHVAL) { 5653 /* 5654 * Support HASHVAL pktinfo on TX path. 5655 */ 5656 if (bootverbose) 5657 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5658 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5659 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5660 } 5661 } 5662 5663 static void 5664 hn_fixup_rx_data(struct hn_softc *sc) 5665 { 5666 5667 if (sc->hn_caps & HN_CAP_UDPHASH) { 5668 int i; 5669 5670 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5671 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5672 } 5673 } 5674 5675 static void 5676 hn_destroy_tx_data(struct hn_softc *sc) 5677 { 5678 int i; 5679 5680 if (sc->hn_chim != NULL) { 5681 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5682 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5683 } else { 5684 device_printf(sc->hn_dev, 5685 "chimney sending buffer is referenced"); 5686 } 5687 sc->hn_chim = NULL; 5688 } 5689 5690 if (sc->hn_tx_ring_cnt == 0) 5691 return; 5692 5693 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5694 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5695 5696 free(sc->hn_tx_ring, M_DEVBUF); 5697 sc->hn_tx_ring = NULL; 5698 5699 sc->hn_tx_ring_cnt = 0; 5700 sc->hn_tx_ring_inuse = 0; 5701 } 5702 5703 #ifdef HN_IFSTART_SUPPORT 5704 5705 static void 5706 hn_start_taskfunc(void *xtxr, int pending __unused) 5707 { 5708 struct hn_tx_ring *txr = xtxr; 5709 5710 mtx_lock(&txr->hn_tx_lock); 5711 hn_start_locked(txr, 0); 5712 mtx_unlock(&txr->hn_tx_lock); 5713 } 5714 5715 static int 5716 hn_start_locked(struct hn_tx_ring *txr, int len) 5717 { 5718 struct hn_softc *sc = txr->hn_sc; 5719 struct ifnet *ifp = sc->hn_ifp; 5720 int sched = 0; 5721 5722 KASSERT(hn_use_if_start, 5723 ("hn_start_locked is called, when if_start is disabled")); 5724 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5725 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5726 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5727 5728 if (__predict_false(txr->hn_suspended)) 5729 return (0); 5730 5731 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5732 IFF_DRV_RUNNING) 5733 return (0); 5734 5735 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5736 struct hn_txdesc *txd; 5737 struct mbuf *m_head; 5738 int error; 5739 5740 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5741 if (m_head == NULL) 5742 break; 5743 5744 if (len > 0 && m_head->m_pkthdr.len > len) { 5745 /* 5746 * This sending could be time consuming; let callers 5747 * dispatch this packet sending (and sending of any 5748 * following up packets) to tx taskqueue. 
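A non-zero return value tells the caller to reschedule us on the TX taskqueue.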
5749 */ 5750 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5751 sched = 1; 5752 break; 5753 } 5754 5755 #if defined(INET6) || defined(INET) 5756 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5757 m_head = hn_tso_fixup(m_head); 5758 if (__predict_false(m_head == NULL)) { 5759 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5760 continue; 5761 } 5762 } else if (m_head->m_pkthdr.csum_flags & 5763 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5764 m_head = hn_set_hlen(m_head); 5765 if (__predict_false(m_head == NULL)) { 5766 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5767 continue; 5768 } 5769 } 5770 #endif 5771 5772 txd = hn_txdesc_get(txr); 5773 if (txd == NULL) { 5774 txr->hn_no_txdescs++; 5775 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5776 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5777 break; 5778 } 5779 5780 error = hn_encap(ifp, txr, txd, &m_head); 5781 if (error) { 5782 /* Both txd and m_head are freed */ 5783 KASSERT(txr->hn_agg_txd == NULL, 5784 ("encap failed w/ pending aggregating txdesc")); 5785 continue; 5786 } 5787 5788 if (txr->hn_agg_pktleft == 0) { 5789 if (txr->hn_agg_txd != NULL) { 5790 KASSERT(m_head == NULL, 5791 ("pending mbuf for aggregating txdesc")); 5792 error = hn_flush_txagg(ifp, txr); 5793 if (__predict_false(error)) { 5794 atomic_set_int(&ifp->if_drv_flags, 5795 IFF_DRV_OACTIVE); 5796 break; 5797 } 5798 } else { 5799 KASSERT(m_head != NULL, ("mbuf was freed")); 5800 error = hn_txpkt(ifp, txr, txd); 5801 if (__predict_false(error)) { 5802 /* txd is freed, but m_head is not */ 5803 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5804 atomic_set_int(&ifp->if_drv_flags, 5805 IFF_DRV_OACTIVE); 5806 break; 5807 } 5808 } 5809 } 5810 #ifdef INVARIANTS 5811 else { 5812 KASSERT(txr->hn_agg_txd != NULL, 5813 ("no aggregating txdesc")); 5814 KASSERT(m_head == NULL, 5815 ("pending mbuf for aggregating txdesc")); 5816 } 5817 #endif 5818 } 5819 5820 /* Flush pending aggerated transmission. */ 5821 if (txr->hn_agg_txd != NULL) 5822 hn_flush_txagg(ifp, txr); 5823 return (sched); 5824 } 5825 5826 static void 5827 hn_start(struct ifnet *ifp) 5828 { 5829 struct hn_softc *sc = ifp->if_softc; 5830 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5831 5832 if (txr->hn_sched_tx) 5833 goto do_sched; 5834 5835 if (mtx_trylock(&txr->hn_tx_lock)) { 5836 int sched; 5837 5838 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5839 mtx_unlock(&txr->hn_tx_lock); 5840 if (!sched) 5841 return; 5842 } 5843 do_sched: 5844 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5845 } 5846 5847 static void 5848 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5849 { 5850 struct hn_tx_ring *txr = xtxr; 5851 5852 mtx_lock(&txr->hn_tx_lock); 5853 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5854 hn_start_locked(txr, 0); 5855 mtx_unlock(&txr->hn_tx_lock); 5856 } 5857 5858 static void 5859 hn_start_txeof(struct hn_tx_ring *txr) 5860 { 5861 struct hn_softc *sc = txr->hn_sc; 5862 struct ifnet *ifp = sc->hn_ifp; 5863 5864 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5865 5866 if (txr->hn_sched_tx) 5867 goto do_sched; 5868 5869 if (mtx_trylock(&txr->hn_tx_lock)) { 5870 int sched; 5871 5872 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5873 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5874 mtx_unlock(&txr->hn_tx_lock); 5875 if (sched) { 5876 taskqueue_enqueue(txr->hn_tx_taskq, 5877 &txr->hn_tx_task); 5878 } 5879 } else { 5880 do_sched: 5881 /* 5882 * Release the OACTIVE earlier, with the hope, that 5883 * others could catch up. 
The task will clear the 5884 * flag again with the hn_tx_lock to avoid possible 5885 * races. 5886 */ 5887 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5888 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5889 } 5890 } 5891 5892 #endif /* HN_IFSTART_SUPPORT */ 5893 5894 static int 5895 hn_xmit(struct hn_tx_ring *txr, int len) 5896 { 5897 struct hn_softc *sc = txr->hn_sc; 5898 struct ifnet *ifp = sc->hn_ifp; 5899 struct mbuf *m_head; 5900 int sched = 0; 5901 5902 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5903 #ifdef HN_IFSTART_SUPPORT 5904 KASSERT(hn_use_if_start == 0, 5905 ("hn_xmit is called, when if_start is enabled")); 5906 #endif 5907 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5908 5909 if (__predict_false(txr->hn_suspended)) 5910 return (0); 5911 5912 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5913 return (0); 5914 5915 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5916 struct hn_txdesc *txd; 5917 int error; 5918 5919 if (len > 0 && m_head->m_pkthdr.len > len) { 5920 /* 5921 * This sending could be time consuming; let callers 5922 * dispatch this packet sending (and sending of any 5923 * following up packets) to tx taskqueue. 5924 */ 5925 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5926 sched = 1; 5927 break; 5928 } 5929 5930 txd = hn_txdesc_get(txr); 5931 if (txd == NULL) { 5932 txr->hn_no_txdescs++; 5933 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5934 txr->hn_oactive = 1; 5935 break; 5936 } 5937 5938 error = hn_encap(ifp, txr, txd, &m_head); 5939 if (error) { 5940 /* Both txd and m_head are freed; discard */ 5941 KASSERT(txr->hn_agg_txd == NULL, 5942 ("encap failed w/ pending aggregating txdesc")); 5943 drbr_advance(ifp, txr->hn_mbuf_br); 5944 continue; 5945 } 5946 5947 if (txr->hn_agg_pktleft == 0) { 5948 if (txr->hn_agg_txd != NULL) { 5949 KASSERT(m_head == NULL, 5950 ("pending mbuf for aggregating txdesc")); 5951 error = hn_flush_txagg(ifp, txr); 5952 if (__predict_false(error)) { 5953 txr->hn_oactive = 1; 5954 break; 5955 } 5956 } else { 5957 KASSERT(m_head != NULL, ("mbuf was freed")); 5958 error = hn_txpkt(ifp, txr, txd); 5959 if (__predict_false(error)) { 5960 /* txd is freed, but m_head is not */ 5961 drbr_putback(ifp, txr->hn_mbuf_br, 5962 m_head); 5963 txr->hn_oactive = 1; 5964 break; 5965 } 5966 } 5967 } 5968 #ifdef INVARIANTS 5969 else { 5970 KASSERT(txr->hn_agg_txd != NULL, 5971 ("no aggregating txdesc")); 5972 KASSERT(m_head == NULL, 5973 ("pending mbuf for aggregating txdesc")); 5974 } 5975 #endif 5976 5977 /* Sent */ 5978 drbr_advance(ifp, txr->hn_mbuf_br); 5979 } 5980 5981 /* Flush pending aggerated transmission. */ 5982 if (txr->hn_agg_txd != NULL) 5983 hn_flush_txagg(ifp, txr); 5984 return (sched); 5985 } 5986 5987 static int 5988 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5989 { 5990 struct hn_softc *sc = ifp->if_softc; 5991 struct hn_tx_ring *txr; 5992 int error, idx = 0; 5993 5994 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5995 struct rm_priotracker pt; 5996 5997 rm_rlock(&sc->hn_vf_lock, &pt); 5998 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5999 struct mbuf *m_bpf = NULL; 6000 int obytes, omcast; 6001 6002 obytes = m->m_pkthdr.len; 6003 omcast = (m->m_flags & M_MCAST) != 0; 6004 6005 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6006 if (bpf_peers_present(ifp->if_bpf)) { 6007 m_bpf = m_copypacket(m, M_NOWAIT); 6008 if (m_bpf == NULL) { 6009 /* 6010 * Failed to grab a shallow 6011 * copy; tap now. 
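 * This is the last safe point to tap the original mbuf: it is
 * about to be handed to the VF's if_transmit() and may be freed
 * there.  With the shallow copy, tapping is deferred until the
 * VF transmit is known to have succeeded.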
6012 */ 6013 ETHER_BPF_MTAP(ifp, m); 6014 } 6015 } 6016 } else { 6017 ETHER_BPF_MTAP(ifp, m); 6018 } 6019 6020 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6021 rm_runlock(&sc->hn_vf_lock, &pt); 6022 6023 if (m_bpf != NULL) { 6024 if (!error) 6025 ETHER_BPF_MTAP(ifp, m_bpf); 6026 m_freem(m_bpf); 6027 } 6028 6029 if (error == ENOBUFS) { 6030 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6031 } else if (error) { 6032 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6033 } else { 6034 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6035 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6036 if (omcast) { 6037 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6038 omcast); 6039 } 6040 } 6041 return (error); 6042 } 6043 rm_runlock(&sc->hn_vf_lock, &pt); 6044 } 6045 6046 #if defined(INET6) || defined(INET) 6047 /* 6048 * Perform TSO packet header fixup or get l2/l3 header length now, 6049 * since packet headers should be cache-hot. 6050 */ 6051 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6052 m = hn_tso_fixup(m); 6053 if (__predict_false(m == NULL)) { 6054 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6055 return EIO; 6056 } 6057 } else if (m->m_pkthdr.csum_flags & 6058 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6059 m = hn_set_hlen(m); 6060 if (__predict_false(m == NULL)) { 6061 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6062 return EIO; 6063 } 6064 } 6065 #endif 6066 6067 /* 6068 * Select the TX ring based on flowid 6069 */ 6070 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6071 #ifdef RSS 6072 uint32_t bid; 6073 6074 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6075 &bid) == 0) 6076 idx = bid % sc->hn_tx_ring_inuse; 6077 else 6078 #endif 6079 { 6080 #if defined(INET6) || defined(INET) 6081 int tcpsyn = 0; 6082 6083 if (m->m_pkthdr.len < 128 && 6084 (m->m_pkthdr.csum_flags & 6085 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6086 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6087 m = hn_check_tcpsyn(m, &tcpsyn); 6088 if (__predict_false(m == NULL)) { 6089 if_inc_counter(ifp, 6090 IFCOUNTER_OERRORS, 1); 6091 return (EIO); 6092 } 6093 } 6094 #else 6095 const int tcpsyn = 0; 6096 #endif 6097 if (tcpsyn) 6098 idx = 0; 6099 else 6100 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6101 } 6102 } 6103 txr = &sc->hn_tx_ring[idx]; 6104 6105 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6106 if (error) { 6107 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6108 return error; 6109 } 6110 6111 if (txr->hn_oactive) 6112 return 0; 6113 6114 if (txr->hn_sched_tx) 6115 goto do_sched; 6116 6117 if (mtx_trylock(&txr->hn_tx_lock)) { 6118 int sched; 6119 6120 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6121 mtx_unlock(&txr->hn_tx_lock); 6122 if (!sched) 6123 return 0; 6124 } 6125 do_sched: 6126 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6127 return 0; 6128 } 6129 6130 static void 6131 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6132 { 6133 struct mbuf *m; 6134 6135 mtx_lock(&txr->hn_tx_lock); 6136 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6137 m_freem(m); 6138 mtx_unlock(&txr->hn_tx_lock); 6139 } 6140 6141 static void 6142 hn_xmit_qflush(struct ifnet *ifp) 6143 { 6144 struct hn_softc *sc = ifp->if_softc; 6145 struct rm_priotracker pt; 6146 int i; 6147 6148 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6149 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6150 if_qflush(ifp); 6151 6152 rm_rlock(&sc->hn_vf_lock, &pt); 6153 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6154 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6155 rm_runlock(&sc->hn_vf_lock, &pt); 6156 } 6157 6158 static void 6159 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6160 { 6161 6162 if (txr->hn_sched_tx) 6163 goto do_sched; 6164 6165 if (mtx_trylock(&txr->hn_tx_lock)) { 6166 int sched; 6167 6168 txr->hn_oactive = 0; 6169 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6170 mtx_unlock(&txr->hn_tx_lock); 6171 if (sched) { 6172 taskqueue_enqueue(txr->hn_tx_taskq, 6173 &txr->hn_tx_task); 6174 } 6175 } else { 6176 do_sched: 6177 /* 6178 * Release the oactive earlier, with the hope, that 6179 * others could catch up. The task will clear the 6180 * oactive again with the hn_tx_lock to avoid possible 6181 * races. 6182 */ 6183 txr->hn_oactive = 0; 6184 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6185 } 6186 } 6187 6188 static void 6189 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6190 { 6191 struct hn_tx_ring *txr = xtxr; 6192 6193 mtx_lock(&txr->hn_tx_lock); 6194 hn_xmit(txr, 0); 6195 mtx_unlock(&txr->hn_tx_lock); 6196 } 6197 6198 static void 6199 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6200 { 6201 struct hn_tx_ring *txr = xtxr; 6202 6203 mtx_lock(&txr->hn_tx_lock); 6204 txr->hn_oactive = 0; 6205 hn_xmit(txr, 0); 6206 mtx_unlock(&txr->hn_tx_lock); 6207 } 6208 6209 static int 6210 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6211 { 6212 struct vmbus_chan_br cbr; 6213 struct hn_rx_ring *rxr; 6214 struct hn_tx_ring *txr = NULL; 6215 int idx, error; 6216 6217 idx = vmbus_chan_subidx(chan); 6218 6219 /* 6220 * Link this channel to RX/TX ring. 6221 */ 6222 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6223 ("invalid channel index %d, should > 0 && < %d", 6224 idx, sc->hn_rx_ring_inuse)); 6225 rxr = &sc->hn_rx_ring[idx]; 6226 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6227 ("RX ring %d already attached", idx)); 6228 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6229 rxr->hn_chan = chan; 6230 6231 if (bootverbose) { 6232 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6233 idx, vmbus_chan_id(chan)); 6234 } 6235 6236 if (idx < sc->hn_tx_ring_inuse) { 6237 txr = &sc->hn_tx_ring[idx]; 6238 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6239 ("TX ring %d already attached", idx)); 6240 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6241 6242 txr->hn_chan = chan; 6243 if (bootverbose) { 6244 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6245 idx, vmbus_chan_id(chan)); 6246 } 6247 } 6248 6249 /* Bind this channel to a proper CPU. */ 6250 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6251 6252 /* 6253 * Open this channel 6254 */ 6255 cbr.cbr = rxr->hn_br; 6256 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6257 cbr.cbr_txsz = HN_TXBR_SIZE; 6258 cbr.cbr_rxsz = HN_RXBR_SIZE; 6259 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6260 if (error) { 6261 if (error == EISCONN) { 6262 if_printf(sc->hn_ifp, "bufring is connected after " 6263 "chan%u open failure\n", vmbus_chan_id(chan)); 6264 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6265 } else { 6266 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6267 vmbus_chan_id(chan), error); 6268 } 6269 } 6270 return (error); 6271 } 6272 6273 static void 6274 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6275 { 6276 struct hn_rx_ring *rxr; 6277 int idx, error; 6278 6279 idx = vmbus_chan_subidx(chan); 6280 6281 /* 6282 * Link this channel to RX/TX ring. 
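 * (On the detach path this means: look up the RX/TX ring this
 * channel was linked to, so the ATTACHED flags can be cleared
 * below.)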
6283 */ 6284 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6285 ("invalid channel index %d, should > 0 && < %d", 6286 idx, sc->hn_rx_ring_inuse)); 6287 rxr = &sc->hn_rx_ring[idx]; 6288 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6289 ("RX ring %d is not attached", idx)); 6290 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6291 6292 if (idx < sc->hn_tx_ring_inuse) { 6293 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6294 6295 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6296 ("TX ring %d is not attached attached", idx)); 6297 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6298 } 6299 6300 /* 6301 * Close this channel. 6302 * 6303 * NOTE: 6304 * Channel closing does _not_ destroy the target channel. 6305 */ 6306 error = vmbus_chan_close_direct(chan); 6307 if (error == EISCONN) { 6308 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6309 "after being closed\n", vmbus_chan_id(chan)); 6310 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6311 } else if (error) { 6312 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6313 vmbus_chan_id(chan), error); 6314 } 6315 } 6316 6317 static int 6318 hn_attach_subchans(struct hn_softc *sc) 6319 { 6320 struct vmbus_channel **subchans; 6321 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6322 int i, error = 0; 6323 6324 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6325 6326 /* Attach the sub-channels. */ 6327 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6328 for (i = 0; i < subchan_cnt; ++i) { 6329 int error1; 6330 6331 error1 = hn_chan_attach(sc, subchans[i]); 6332 if (error1) { 6333 error = error1; 6334 /* Move on; all channels will be detached later. */ 6335 } 6336 } 6337 vmbus_subchan_rel(subchans, subchan_cnt); 6338 6339 if (error) { 6340 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6341 } else { 6342 if (bootverbose) { 6343 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6344 subchan_cnt); 6345 } 6346 } 6347 return (error); 6348 } 6349 6350 static void 6351 hn_detach_allchans(struct hn_softc *sc) 6352 { 6353 struct vmbus_channel **subchans; 6354 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6355 int i; 6356 6357 if (subchan_cnt == 0) 6358 goto back; 6359 6360 /* Detach the sub-channels. */ 6361 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6362 for (i = 0; i < subchan_cnt; ++i) 6363 hn_chan_detach(sc, subchans[i]); 6364 vmbus_subchan_rel(subchans, subchan_cnt); 6365 6366 back: 6367 /* 6368 * Detach the primary channel, _after_ all sub-channels 6369 * are detached. 6370 */ 6371 hn_chan_detach(sc, sc->hn_prichan); 6372 6373 /* Wait for sub-channels to be destroyed, if any. */ 6374 vmbus_subchan_drain(sc->hn_prichan); 6375 6376 #ifdef INVARIANTS 6377 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6378 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6379 HN_RX_FLAG_ATTACHED) == 0, 6380 ("%dth RX ring is still attached", i)); 6381 } 6382 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6383 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6384 HN_TX_FLAG_ATTACHED) == 0, 6385 ("%dth TX ring is still attached", i)); 6386 } 6387 #endif 6388 } 6389 6390 static int 6391 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6392 { 6393 struct vmbus_channel **subchans; 6394 int nchan, rxr_cnt, error; 6395 6396 nchan = *nsubch + 1; 6397 if (nchan == 1) { 6398 /* 6399 * Multiple RX/TX rings are not requested. 6400 */ 6401 *nsubch = 0; 6402 return (0); 6403 } 6404 6405 /* 6406 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6407 * table entries. 
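 * The channel count requested from NVS below is capped by the
 * number of RX rings the host reports here.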
6408 */ 6409 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6410 if (error) { 6411 /* No RSS; this is benign. */ 6412 *nsubch = 0; 6413 return (0); 6414 } 6415 if (bootverbose) { 6416 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6417 rxr_cnt, nchan); 6418 } 6419 6420 if (nchan > rxr_cnt) 6421 nchan = rxr_cnt; 6422 if (nchan == 1) { 6423 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6424 *nsubch = 0; 6425 return (0); 6426 } 6427 6428 /* 6429 * Allocate sub-channels from NVS. 6430 */ 6431 *nsubch = nchan - 1; 6432 error = hn_nvs_alloc_subchans(sc, nsubch); 6433 if (error || *nsubch == 0) { 6434 /* Failed to allocate sub-channels. */ 6435 *nsubch = 0; 6436 return (0); 6437 } 6438 6439 /* 6440 * Wait for all sub-channels to become ready before moving on. 6441 */ 6442 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6443 vmbus_subchan_rel(subchans, *nsubch); 6444 return (0); 6445 } 6446 6447 static bool 6448 hn_synth_attachable(const struct hn_softc *sc) 6449 { 6450 int i; 6451 6452 if (sc->hn_flags & HN_FLAG_ERRORS) 6453 return (false); 6454 6455 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6456 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6457 6458 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6459 return (false); 6460 } 6461 return (true); 6462 } 6463 6464 /* 6465 * Make sure that the RX filter is zero after the successful 6466 * RNDIS initialization. 6467 * 6468 * NOTE: 6469 * Under certain conditions on certain versions of Hyper-V, 6470 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6471 * after the successful RNDIS initialization, which breaks 6472 * the assumption of any following code (well, it breaks the 6473 * RNDIS API contract actually). Clear the RNDIS rxfilter 6474 * explicitly, drain packets sneaking through, and drain the 6475 * interrupt taskqueues scheduled due to the stealth packets. 6476 */ 6477 static void 6478 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6479 { 6480 6481 hn_disable_rx(sc); 6482 hn_drain_rxtx(sc, nchan); 6483 } 6484 6485 static int 6486 hn_synth_attach(struct hn_softc *sc, int mtu) 6487 { 6488 #define ATTACHED_NVS 0x0002 6489 #define ATTACHED_RNDIS 0x0004 6490 6491 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6492 int error, nsubch, nchan = 1, i, rndis_inited; 6493 uint32_t old_caps, attached = 0; 6494 6495 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6496 ("synthetic parts were attached")); 6497 6498 if (!hn_synth_attachable(sc)) 6499 return (ENXIO); 6500 6501 /* Save capabilities for later verification. */ 6502 old_caps = sc->hn_caps; 6503 sc->hn_caps = 0; 6504 6505 /* Clear RSS stuffs. */ 6506 sc->hn_rss_ind_size = 0; 6507 sc->hn_rss_hash = 0; 6508 sc->hn_rss_hcap = 0; 6509 6510 /* 6511 * Attach the primary channel _before_ attaching NVS and RNDIS. 6512 */ 6513 error = hn_chan_attach(sc, sc->hn_prichan); 6514 if (error) 6515 goto failed; 6516 6517 /* 6518 * Attach NVS. 6519 */ 6520 error = hn_nvs_attach(sc, mtu); 6521 if (error) 6522 goto failed; 6523 attached |= ATTACHED_NVS; 6524 6525 /* 6526 * Attach RNDIS _after_ NVS is attached. 6527 */ 6528 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6529 if (rndis_inited) 6530 attached |= ATTACHED_RNDIS; 6531 if (error) 6532 goto failed; 6533 6534 /* 6535 * Make sure capabilities are not changed. 
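 * hn_rndis_attach() re-derives hn_caps; when the synthetic parts
 * are re-attached to an already attached device (e.g. after an MTU
 * change), the re-derived capabilities must match what was
 * advertised at initial attach time.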
6536 */ 6537 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6538 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6539 old_caps, sc->hn_caps); 6540 error = ENXIO; 6541 goto failed; 6542 } 6543 6544 /* 6545 * Allocate sub-channels for multi-TX/RX rings. 6546 * 6547 * NOTE: 6548 * The # of RX rings that can be used is equivalent to the # of 6549 * channels to be requested. 6550 */ 6551 nsubch = sc->hn_rx_ring_cnt - 1; 6552 error = hn_synth_alloc_subchans(sc, &nsubch); 6553 if (error) 6554 goto failed; 6555 /* NOTE: _Full_ synthetic parts detach is required now. */ 6556 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6557 6558 /* 6559 * Set the # of TX/RX rings that could be used according to 6560 * the # of channels that NVS offered. 6561 */ 6562 nchan = nsubch + 1; 6563 hn_set_ring_inuse(sc, nchan); 6564 if (nchan == 1) { 6565 /* Only the primary channel can be used; done */ 6566 goto back; 6567 } 6568 6569 /* 6570 * Attach the sub-channels. 6571 * 6572 * NOTE: hn_set_ring_inuse() _must_ have been called. 6573 */ 6574 error = hn_attach_subchans(sc); 6575 if (error) 6576 goto failed; 6577 6578 /* 6579 * Configure RSS key and indirect table _after_ all sub-channels 6580 * are attached. 6581 */ 6582 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6583 /* 6584 * RSS key is not set yet; set it to the default RSS key. 6585 */ 6586 if (bootverbose) 6587 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6588 #ifdef RSS 6589 rss_getkey(rss->rss_key); 6590 #else 6591 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6592 #endif 6593 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6594 } 6595 6596 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6597 /* 6598 * RSS indirect table is not set yet; set it up in round- 6599 * robin fashion. 6600 */ 6601 if (bootverbose) { 6602 if_printf(sc->hn_ifp, "setup default RSS indirect " 6603 "table\n"); 6604 } 6605 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6606 uint32_t subidx; 6607 6608 #ifdef RSS 6609 subidx = rss_get_indirection_to_bucket(i); 6610 #else 6611 subidx = i; 6612 #endif 6613 rss->rss_ind[i] = subidx % nchan; 6614 } 6615 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6616 } else { 6617 /* 6618 * # of usable channels may be changed, so we have to 6619 * make sure that all entries in RSS indirect table 6620 * are valid. 6621 * 6622 * NOTE: hn_set_ring_inuse() _must_ have been called. 6623 */ 6624 hn_rss_ind_fixup(sc); 6625 } 6626 6627 sc->hn_rss_hash = sc->hn_rss_hcap; 6628 if ((sc->hn_flags & HN_FLAG_RXVF) || 6629 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6630 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6631 hn_vf_rss_fixup(sc, false); 6632 } 6633 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6634 if (error) 6635 goto failed; 6636 back: 6637 /* 6638 * Fixup transmission aggregation setup. 6639 */ 6640 hn_set_txagg(sc); 6641 hn_rndis_init_fixat(sc, nchan); 6642 return (0); 6643 6644 failed: 6645 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6646 hn_rndis_init_fixat(sc, nchan); 6647 hn_synth_detach(sc); 6648 } else { 6649 if (attached & ATTACHED_RNDIS) { 6650 hn_rndis_init_fixat(sc, nchan); 6651 hn_rndis_detach(sc); 6652 } 6653 if (attached & ATTACHED_NVS) 6654 hn_nvs_detach(sc); 6655 hn_chan_detach(sc, sc->hn_prichan); 6656 /* Restore old capabilities. */ 6657 sc->hn_caps = old_caps; 6658 } 6659 return (error); 6660 6661 #undef ATTACHED_RNDIS 6662 #undef ATTACHED_NVS 6663 } 6664 6665 /* 6666 * NOTE: 6667 * The interface must have been suspended though hn_suspend(), before 6668 * this function get called. 
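 *
 * A rough sketch of the expected caller sequence, e.g. for an MTU
 * change ("new_mtu" is only a placeholder here):
 *
 *	HN_LOCK(sc);
 *	hn_suspend(sc);
 *	hn_synth_detach(sc);
 *	error = hn_synth_attach(sc, new_mtu);
 *	hn_resume(sc);
 *	HN_UNLOCK(sc);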
6669 */ 6670 static void 6671 hn_synth_detach(struct hn_softc *sc) 6672 { 6673 6674 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6675 ("synthetic parts were not attached")); 6676 6677 /* Detach the RNDIS first. */ 6678 hn_rndis_detach(sc); 6679 6680 /* Detach NVS. */ 6681 hn_nvs_detach(sc); 6682 6683 /* Detach all of the channels. */ 6684 hn_detach_allchans(sc); 6685 6686 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6687 /* 6688 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6689 */ 6690 int error; 6691 6692 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6693 sc->hn_rxbuf_gpadl); 6694 if (error) { 6695 if_printf(sc->hn_ifp, 6696 "rxbuf gpadl disconn failed: %d\n", error); 6697 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6698 } 6699 sc->hn_rxbuf_gpadl = 0; 6700 } 6701 6702 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6703 /* 6704 * Host is post-Win2016, disconnect chimney sending buffer from 6705 * primary channel here. 6706 */ 6707 int error; 6708 6709 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6710 sc->hn_chim_gpadl); 6711 if (error) { 6712 if_printf(sc->hn_ifp, 6713 "chim gpadl disconn failed: %d\n", error); 6714 sc->hn_flags |= HN_FLAG_CHIM_REF; 6715 } 6716 sc->hn_chim_gpadl = 0; 6717 } 6718 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6719 } 6720 6721 static void 6722 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6723 { 6724 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6725 ("invalid ring count %d", ring_cnt)); 6726 6727 if (sc->hn_tx_ring_cnt > ring_cnt) 6728 sc->hn_tx_ring_inuse = ring_cnt; 6729 else 6730 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6731 sc->hn_rx_ring_inuse = ring_cnt; 6732 6733 #ifdef RSS 6734 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6735 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6736 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6737 rss_getnumbuckets()); 6738 } 6739 #endif 6740 6741 if (bootverbose) { 6742 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6743 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6744 } 6745 } 6746 6747 static void 6748 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6749 { 6750 6751 /* 6752 * NOTE: 6753 * The TX bufring will not be drained by the hypervisor, 6754 * if the primary channel is revoked. 6755 */ 6756 while (!vmbus_chan_rx_empty(chan) || 6757 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6758 !vmbus_chan_tx_empty(chan))) 6759 pause("waitch", 1); 6760 vmbus_chan_intr_drain(chan); 6761 } 6762 6763 static void 6764 hn_disable_rx(struct hn_softc *sc) 6765 { 6766 6767 /* 6768 * Disable RX by clearing RX filter forcefully. 6769 */ 6770 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6771 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6772 6773 /* 6774 * Give RNDIS enough time to flush all pending data packets. 6775 */ 6776 pause("waitrx", (200 * hz) / 1000); 6777 } 6778 6779 /* 6780 * NOTE: 6781 * RX/TX _must_ have been suspended/disabled, before this function 6782 * is called. 6783 */ 6784 static void 6785 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6786 { 6787 struct vmbus_channel **subch = NULL; 6788 int nsubch; 6789 6790 /* 6791 * Drain RX/TX bufrings and interrupts. 
6792 */ 6793 nsubch = nchan - 1; 6794 if (nsubch > 0) 6795 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6796 6797 if (subch != NULL) { 6798 int i; 6799 6800 for (i = 0; i < nsubch; ++i) 6801 hn_chan_drain(sc, subch[i]); 6802 } 6803 hn_chan_drain(sc, sc->hn_prichan); 6804 6805 if (subch != NULL) 6806 vmbus_subchan_rel(subch, nsubch); 6807 } 6808 6809 static void 6810 hn_suspend_data(struct hn_softc *sc) 6811 { 6812 struct hn_tx_ring *txr; 6813 int i; 6814 6815 HN_LOCK_ASSERT(sc); 6816 6817 /* 6818 * Suspend TX. 6819 */ 6820 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6821 txr = &sc->hn_tx_ring[i]; 6822 6823 mtx_lock(&txr->hn_tx_lock); 6824 txr->hn_suspended = 1; 6825 mtx_unlock(&txr->hn_tx_lock); 6826 /* No one is able send more packets now. */ 6827 6828 /* 6829 * Wait for all pending sends to finish. 6830 * 6831 * NOTE: 6832 * We will _not_ receive all pending send-done, if the 6833 * primary channel is revoked. 6834 */ 6835 while (hn_tx_ring_pending(txr) && 6836 !vmbus_chan_is_revoked(sc->hn_prichan)) 6837 pause("hnwtx", 1 /* 1 tick */); 6838 } 6839 6840 /* 6841 * Disable RX. 6842 */ 6843 hn_disable_rx(sc); 6844 6845 /* 6846 * Drain RX/TX. 6847 */ 6848 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6849 6850 /* 6851 * Drain any pending TX tasks. 6852 * 6853 * NOTE: 6854 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6855 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6856 */ 6857 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6858 txr = &sc->hn_tx_ring[i]; 6859 6860 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6861 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6862 } 6863 } 6864 6865 static void 6866 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6867 { 6868 6869 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6870 } 6871 6872 static void 6873 hn_suspend_mgmt(struct hn_softc *sc) 6874 { 6875 struct task task; 6876 6877 HN_LOCK_ASSERT(sc); 6878 6879 /* 6880 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6881 * through hn_mgmt_taskq. 6882 */ 6883 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6884 vmbus_chan_run_task(sc->hn_prichan, &task); 6885 6886 /* 6887 * Make sure that all pending management tasks are completed. 6888 */ 6889 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6890 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6891 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6892 } 6893 6894 static void 6895 hn_suspend(struct hn_softc *sc) 6896 { 6897 6898 /* Disable polling. */ 6899 hn_polling(sc, 0); 6900 6901 /* 6902 * If the non-transparent mode VF is activated, the synthetic 6903 * device is receiving packets, so the data path of the 6904 * synthetic device must be suspended. 6905 */ 6906 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6907 (sc->hn_flags & HN_FLAG_RXVF)) 6908 hn_suspend_data(sc); 6909 hn_suspend_mgmt(sc); 6910 } 6911 6912 static void 6913 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6914 { 6915 int i; 6916 6917 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6918 ("invalid TX ring count %d", tx_ring_cnt)); 6919 6920 for (i = 0; i < tx_ring_cnt; ++i) { 6921 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6922 6923 mtx_lock(&txr->hn_tx_lock); 6924 txr->hn_suspended = 0; 6925 mtx_unlock(&txr->hn_tx_lock); 6926 } 6927 } 6928 6929 static void 6930 hn_resume_data(struct hn_softc *sc) 6931 { 6932 int i; 6933 6934 HN_LOCK_ASSERT(sc); 6935 6936 /* 6937 * Re-enable RX. 
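 * hn_disable_rx() cleared the RX filter (NDIS_PACKET_TYPE_NONE)
 * while the data path was suspended; hn_rxfilter_config() programs
 * a proper filter again.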
6938 */ 6939 hn_rxfilter_config(sc); 6940 6941 /* 6942 * Make sure to clear suspend status on "all" TX rings, 6943 * since hn_tx_ring_inuse can be changed after 6944 * hn_suspend_data(). 6945 */ 6946 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6947 6948 #ifdef HN_IFSTART_SUPPORT 6949 if (!hn_use_if_start) 6950 #endif 6951 { 6952 /* 6953 * Flush unused drbrs, since hn_tx_ring_inuse may be 6954 * reduced. 6955 */ 6956 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6957 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6958 } 6959 6960 /* 6961 * Kick start TX. 6962 */ 6963 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6964 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6965 6966 /* 6967 * Use txeof task, so that any pending oactive can be 6968 * cleared properly. 6969 */ 6970 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6971 } 6972 } 6973 6974 static void 6975 hn_resume_mgmt(struct hn_softc *sc) 6976 { 6977 6978 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6979 6980 /* 6981 * Kick off network change detection, if it was pending. 6982 * If no network change was pending, start link status 6983 * checks, which is more lightweight than network change 6984 * detection. 6985 */ 6986 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6987 hn_change_network(sc); 6988 else 6989 hn_update_link_status(sc); 6990 } 6991 6992 static void 6993 hn_resume(struct hn_softc *sc) 6994 { 6995 6996 /* 6997 * If the non-transparent mode VF is activated, the synthetic 6998 * device have to receive packets, so the data path of the 6999 * synthetic device must be resumed. 7000 */ 7001 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7002 (sc->hn_flags & HN_FLAG_RXVF)) 7003 hn_resume_data(sc); 7004 7005 /* 7006 * Don't resume link status change if VF is attached/activated. 7007 * - In the non-transparent VF mode, the synthetic device marks 7008 * link down until the VF is deactivated; i.e. VF is down. 7009 * - In transparent VF mode, VF's media status is used until 7010 * the VF is detached. 7011 */ 7012 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7013 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7014 hn_resume_mgmt(sc); 7015 7016 /* 7017 * Re-enable polling if this interface is running and 7018 * the polling is requested. 7019 */ 7020 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7021 hn_polling(sc, sc->hn_pollhz); 7022 } 7023 7024 static void 7025 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7026 { 7027 const struct rndis_status_msg *msg; 7028 int ofs; 7029 7030 if (dlen < sizeof(*msg)) { 7031 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7032 return; 7033 } 7034 msg = data; 7035 7036 switch (msg->rm_status) { 7037 case RNDIS_STATUS_MEDIA_CONNECT: 7038 case RNDIS_STATUS_MEDIA_DISCONNECT: 7039 hn_update_link_status(sc); 7040 break; 7041 7042 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7043 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7044 /* Not really useful; ignore. 
*/ 7045 break; 7046 7047 case RNDIS_STATUS_NETWORK_CHANGE: 7048 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7049 if (dlen < ofs + msg->rm_stbuflen || 7050 msg->rm_stbuflen < sizeof(uint32_t)) { 7051 if_printf(sc->hn_ifp, "network changed\n"); 7052 } else { 7053 uint32_t change; 7054 7055 memcpy(&change, ((const uint8_t *)msg) + ofs, 7056 sizeof(change)); 7057 if_printf(sc->hn_ifp, "network changed, change %u\n", 7058 change); 7059 } 7060 hn_change_network(sc); 7061 break; 7062 7063 default: 7064 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7065 msg->rm_status); 7066 break; 7067 } 7068 } 7069 7070 static int 7071 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7072 { 7073 const struct rndis_pktinfo *pi = info_data; 7074 uint32_t mask = 0; 7075 7076 while (info_dlen != 0) { 7077 const void *data; 7078 uint32_t dlen; 7079 7080 if (__predict_false(info_dlen < sizeof(*pi))) 7081 return (EINVAL); 7082 if (__predict_false(info_dlen < pi->rm_size)) 7083 return (EINVAL); 7084 info_dlen -= pi->rm_size; 7085 7086 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7087 return (EINVAL); 7088 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7089 return (EINVAL); 7090 dlen = pi->rm_size - pi->rm_pktinfooffset; 7091 data = pi->rm_data; 7092 7093 if (pi->rm_internal == 1) { 7094 switch (pi->rm_type) { 7095 case NDIS_PKTINFO_IT_PKTINFO_ID: 7096 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7097 return (EINVAL); 7098 info->pktinfo_id = 7099 (const struct packet_info_id *)data; 7100 mask |= HN_RXINFO_PKTINFO_ID; 7101 break; 7102 7103 default: 7104 goto next; 7105 } 7106 } else { 7107 switch (pi->rm_type) { 7108 case NDIS_PKTINFO_TYPE_VLAN: 7109 if (__predict_false(dlen 7110 < NDIS_VLAN_INFO_SIZE)) 7111 return (EINVAL); 7112 info->vlan_info = (const uint32_t *)data; 7113 mask |= HN_RXINFO_VLAN; 7114 break; 7115 7116 case NDIS_PKTINFO_TYPE_CSUM: 7117 if (__predict_false(dlen 7118 < NDIS_RXCSUM_INFO_SIZE)) 7119 return (EINVAL); 7120 info->csum_info = (const uint32_t *)data; 7121 mask |= HN_RXINFO_CSUM; 7122 break; 7123 7124 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7125 if (__predict_false(dlen 7126 < HN_NDIS_HASH_VALUE_SIZE)) 7127 return (EINVAL); 7128 info->hash_value = (const uint32_t *)data; 7129 mask |= HN_RXINFO_HASHVAL; 7130 break; 7131 7132 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7133 if (__predict_false(dlen 7134 < HN_NDIS_HASH_INFO_SIZE)) 7135 return (EINVAL); 7136 info->hash_info = (const uint32_t *)data; 7137 mask |= HN_RXINFO_HASHINF; 7138 break; 7139 7140 default: 7141 goto next; 7142 } 7143 } 7144 7145 if (mask == HN_RXINFO_ALL) { 7146 /* All found; done */ 7147 break; 7148 } 7149 next: 7150 pi = (const struct rndis_pktinfo *) 7151 ((const uint8_t *)pi + pi->rm_size); 7152 } 7153 7154 /* 7155 * Final fixup. 7156 * - If there is no hash value, invalidate the hash info. 
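 *   (The hash type alone cannot be applied to the received mbuf.)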
7157 */ 7158 if ((mask & HN_RXINFO_HASHVAL) == 0) 7159 info->hash_info = NULL; 7160 return (0); 7161 } 7162 7163 static __inline bool 7164 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7165 { 7166 7167 if (off < check_off) { 7168 if (__predict_true(off + len <= check_off)) 7169 return (false); 7170 } else if (off > check_off) { 7171 if (__predict_true(check_off + check_len <= off)) 7172 return (false); 7173 } 7174 return (true); 7175 } 7176 7177 static __inline void 7178 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7179 uint32_t len, struct hn_rxinfo *info) 7180 { 7181 uint32_t cnt = rxr->rsc.cnt; 7182 7183 if (cnt) { 7184 rxr->rsc.pktlen += len; 7185 } else { 7186 rxr->rsc.vlan_info = info->vlan_info; 7187 rxr->rsc.csum_info = info->csum_info; 7188 rxr->rsc.hash_info = info->hash_info; 7189 rxr->rsc.hash_value = info->hash_value; 7190 rxr->rsc.pktlen = len; 7191 } 7192 7193 rxr->rsc.frag_data[cnt] = data; 7194 rxr->rsc.frag_len[cnt] = len; 7195 rxr->rsc.cnt++; 7196 } 7197 7198 static void 7199 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7200 { 7201 const struct rndis_packet_msg *pkt; 7202 struct hn_rxinfo info; 7203 int data_off, pktinfo_off, data_len, pktinfo_len; 7204 bool rsc_more= false; 7205 7206 /* 7207 * Check length. 7208 */ 7209 if (__predict_false(dlen < sizeof(*pkt))) { 7210 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7211 return; 7212 } 7213 pkt = data; 7214 7215 if (__predict_false(dlen < pkt->rm_len)) { 7216 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7217 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7218 return; 7219 } 7220 if (__predict_false(pkt->rm_len < 7221 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7222 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7223 "msglen %u, data %u, oob %u, pktinfo %u\n", 7224 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7225 pkt->rm_pktinfolen); 7226 return; 7227 } 7228 if (__predict_false(pkt->rm_datalen == 0)) { 7229 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7230 return; 7231 } 7232 7233 /* 7234 * Check offests. 7235 */ 7236 #define IS_OFFSET_INVALID(ofs) \ 7237 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7238 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7239 7240 /* XXX Hyper-V does not meet data offset alignment requirement */ 7241 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7242 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7243 "data offset %u\n", pkt->rm_dataoffset); 7244 return; 7245 } 7246 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7247 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7248 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7249 "oob offset %u\n", pkt->rm_oobdataoffset); 7250 return; 7251 } 7252 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7253 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7254 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7255 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7256 return; 7257 } 7258 7259 #undef IS_OFFSET_INVALID 7260 7261 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7262 data_len = pkt->rm_datalen; 7263 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7264 pktinfo_len = pkt->rm_pktinfolen; 7265 7266 /* 7267 * Check OOB coverage. 
7268 */ 7269 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7270 int oob_off, oob_len; 7271 7272 if_printf(rxr->hn_ifp, "got oobdata\n"); 7273 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7274 oob_len = pkt->rm_oobdatalen; 7275 7276 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7277 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7278 "oob overflow, msglen %u, oob abs %d len %d\n", 7279 pkt->rm_len, oob_off, oob_len); 7280 return; 7281 } 7282 7283 /* 7284 * Check against data. 7285 */ 7286 if (hn_rndis_check_overlap(oob_off, oob_len, 7287 data_off, data_len)) { 7288 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7289 "oob overlaps data, oob abs %d len %d, " 7290 "data abs %d len %d\n", 7291 oob_off, oob_len, data_off, data_len); 7292 return; 7293 } 7294 7295 /* 7296 * Check against pktinfo. 7297 */ 7298 if (pktinfo_len != 0 && 7299 hn_rndis_check_overlap(oob_off, oob_len, 7300 pktinfo_off, pktinfo_len)) { 7301 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7302 "oob overlaps pktinfo, oob abs %d len %d, " 7303 "pktinfo abs %d len %d\n", 7304 oob_off, oob_len, pktinfo_off, pktinfo_len); 7305 return; 7306 } 7307 } 7308 7309 /* 7310 * Check per-packet-info coverage and find useful per-packet-info. 7311 */ 7312 info.vlan_info = NULL; 7313 info.csum_info = NULL; 7314 info.hash_info = NULL; 7315 info.pktinfo_id = NULL; 7316 7317 if (__predict_true(pktinfo_len != 0)) { 7318 bool overlap; 7319 int error; 7320 7321 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7322 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7323 "pktinfo overflow, msglen %u, " 7324 "pktinfo abs %d len %d\n", 7325 pkt->rm_len, pktinfo_off, pktinfo_len); 7326 return; 7327 } 7328 7329 /* 7330 * Check packet info coverage. 7331 */ 7332 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7333 data_off, data_len); 7334 if (__predict_false(overlap)) { 7335 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7336 "pktinfo overlap data, pktinfo abs %d len %d, " 7337 "data abs %d len %d\n", 7338 pktinfo_off, pktinfo_len, data_off, data_len); 7339 return; 7340 } 7341 7342 /* 7343 * Find useful per-packet-info. 
7344 */ 7345 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7346 pktinfo_len, &info); 7347 if (__predict_false(error)) { 7348 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7349 "pktinfo\n"); 7350 return; 7351 } 7352 } 7353 7354 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7355 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7356 "data overflow, msglen %u, data abs %d len %d\n", 7357 pkt->rm_len, data_off, data_len); 7358 return; 7359 } 7360 7361 /* Identify RSC fragments, drop invalid packets */ 7362 if ((info.pktinfo_id != NULL) && 7363 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7364 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7365 rxr->rsc.cnt = 0; 7366 rxr->hn_rsc_pkts++; 7367 } else if (rxr->rsc.cnt == 0) 7368 goto drop; 7369 7370 rsc_more = true; 7371 7372 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7373 rsc_more = false; 7374 7375 if (rsc_more && rxr->rsc.is_last) 7376 goto drop; 7377 } else { 7378 rxr->rsc.cnt = 0; 7379 } 7380 7381 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7382 goto drop; 7383 7384 /* Store data in per rx ring structure */ 7385 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7386 data_len, &info); 7387 7388 if (rsc_more) 7389 return; 7390 7391 hn_rxpkt(rxr); 7392 rxr->rsc.cnt = 0; 7393 return; 7394 drop: 7395 rxr->hn_rsc_drop++; 7396 return; 7397 } 7398 7399 static __inline void 7400 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7401 { 7402 const struct rndis_msghdr *hdr; 7403 7404 if (__predict_false(dlen < sizeof(*hdr))) { 7405 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7406 return; 7407 } 7408 hdr = data; 7409 7410 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7411 /* Hot data path. */ 7412 hn_rndis_rx_data(rxr, data, dlen); 7413 /* Done! */ 7414 return; 7415 } 7416 7417 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7418 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7419 else 7420 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7421 } 7422 7423 static void 7424 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7425 { 7426 const struct hn_nvs_hdr *hdr; 7427 7428 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7429 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7430 return; 7431 } 7432 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7433 7434 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7435 /* Useless; ignore */ 7436 return; 7437 } 7438 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7439 } 7440 7441 static void 7442 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7443 const struct vmbus_chanpkt_hdr *pkt) 7444 { 7445 struct hn_nvs_sendctx *sndc; 7446 7447 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7448 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7449 VMBUS_CHANPKT_DATALEN(pkt)); 7450 /* 7451 * NOTE: 7452 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7453 * its callback. 7454 */ 7455 } 7456 7457 static void 7458 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7459 const struct vmbus_chanpkt_hdr *pkthdr) 7460 { 7461 const struct vmbus_chanpkt_rxbuf *pkt; 7462 const struct hn_nvs_hdr *nvs_hdr; 7463 int count, i, hlen; 7464 7465 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7466 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7467 return; 7468 } 7469 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7470 7471 /* Make sure that this is a RNDIS message. 
*/ 7472 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7473 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7474 nvs_hdr->nvs_type); 7475 return; 7476 } 7477 7478 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7479 if (__predict_false(hlen < sizeof(*pkt))) { 7480 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7481 return; 7482 } 7483 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7484 7485 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7486 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7487 pkt->cp_rxbuf_id); 7488 return; 7489 } 7490 7491 count = pkt->cp_rxbuf_cnt; 7492 if (__predict_false(hlen < 7493 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7494 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7495 return; 7496 } 7497 7498 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7499 for (i = 0; i < count; ++i) { 7500 int ofs, len; 7501 7502 ofs = pkt->cp_rxbuf[i].rb_ofs; 7503 len = pkt->cp_rxbuf[i].rb_len; 7504 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7505 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7506 "ofs %d, len %d\n", i, ofs, len); 7507 continue; 7508 } 7509 7510 rxr->rsc.is_last = (i == (count - 1)); 7511 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7512 } 7513 7514 /* 7515 * Ack the consumed RXBUF associated w/ this channel packet, 7516 * so that this RXBUF can be recycled by the hypervisor. 7517 */ 7518 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7519 } 7520 7521 static void 7522 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7523 uint64_t tid) 7524 { 7525 struct hn_nvs_rndis_ack ack; 7526 int retries, error; 7527 7528 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7529 ack.nvs_status = HN_NVS_STATUS_OK; 7530 7531 retries = 0; 7532 again: 7533 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7534 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7535 if (__predict_false(error == EAGAIN)) { 7536 /* 7537 * NOTE: 7538 * This should _not_ happen in real world, since the 7539 * consumption of the TX bufring from the TX path is 7540 * controlled. 7541 */ 7542 if (rxr->hn_ack_failed == 0) 7543 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7544 rxr->hn_ack_failed++; 7545 retries++; 7546 if (retries < 10) { 7547 DELAY(100); 7548 goto again; 7549 } 7550 /* RXBUF leaks! */ 7551 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7552 } 7553 } 7554 7555 static void 7556 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7557 { 7558 struct hn_rx_ring *rxr = xrxr; 7559 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7560 7561 for (;;) { 7562 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7563 int error, pktlen; 7564 7565 pktlen = rxr->hn_pktbuf_len; 7566 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7567 if (__predict_false(error == ENOBUFS)) { 7568 void *nbuf; 7569 int nlen; 7570 7571 /* 7572 * Expand channel packet buffer. 7573 * 7574 * XXX 7575 * Use M_WAITOK here, since allocation failure 7576 * is fatal. 7577 */ 7578 nlen = rxr->hn_pktbuf_len * 2; 7579 while (nlen < pktlen) 7580 nlen *= 2; 7581 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7582 7583 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7584 rxr->hn_pktbuf_len, nlen); 7585 7586 free(rxr->hn_pktbuf, M_DEVBUF); 7587 rxr->hn_pktbuf = nbuf; 7588 rxr->hn_pktbuf_len = nlen; 7589 /* Retry! */ 7590 continue; 7591 } else if (__predict_false(error == EAGAIN)) { 7592 /* No more channel packets; done! 
*/ 7593 break; 7594 } 7595 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7596 7597 switch (pkt->cph_type) { 7598 case VMBUS_CHANPKT_TYPE_COMP: 7599 hn_nvs_handle_comp(sc, chan, pkt); 7600 break; 7601 7602 case VMBUS_CHANPKT_TYPE_RXBUF: 7603 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7604 break; 7605 7606 case VMBUS_CHANPKT_TYPE_INBAND: 7607 hn_nvs_handle_notify(sc, pkt); 7608 break; 7609 7610 default: 7611 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7612 pkt->cph_type); 7613 break; 7614 } 7615 } 7616 hn_chan_rollup(rxr, rxr->hn_txr); 7617 } 7618 7619 static void 7620 hn_sysinit(void *arg __unused) 7621 { 7622 int i; 7623 7624 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7625 7626 #ifdef HN_IFSTART_SUPPORT 7627 /* 7628 * Don't use ifnet.if_start if transparent VF mode is requested; 7629 * mainly due to the IFF_DRV_OACTIVE flag. 7630 */ 7631 if (hn_xpnt_vf && hn_use_if_start) { 7632 hn_use_if_start = 0; 7633 printf("hn: tranparent VF mode, if_transmit will be used, " 7634 "instead of if_start\n"); 7635 } 7636 #endif 7637 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7638 printf("hn: invalid transparent VF attach routing " 7639 "wait timeout %d, reset to %d\n", 7640 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7641 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7642 } 7643 7644 /* 7645 * Initialize VF map. 7646 */ 7647 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7648 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7649 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7650 M_WAITOK | M_ZERO); 7651 7652 /* 7653 * Fix the # of TX taskqueues. 7654 */ 7655 if (hn_tx_taskq_cnt <= 0) 7656 hn_tx_taskq_cnt = 1; 7657 else if (hn_tx_taskq_cnt > mp_ncpus) 7658 hn_tx_taskq_cnt = mp_ncpus; 7659 7660 /* 7661 * Fix the TX taskqueue mode. 7662 */ 7663 switch (hn_tx_taskq_mode) { 7664 case HN_TX_TASKQ_M_INDEP: 7665 case HN_TX_TASKQ_M_GLOBAL: 7666 case HN_TX_TASKQ_M_EVTTQ: 7667 break; 7668 default: 7669 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7670 break; 7671 } 7672 7673 if (vm_guest != VM_GUEST_HV) 7674 return; 7675 7676 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7677 return; 7678 7679 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7680 M_DEVBUF, M_WAITOK); 7681 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7682 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7683 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7684 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7685 "hn tx%d", i); 7686 } 7687 } 7688 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7689 7690 static void 7691 hn_sysuninit(void *arg __unused) 7692 { 7693 7694 if (hn_tx_taskque != NULL) { 7695 int i; 7696 7697 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7698 taskqueue_free(hn_tx_taskque[i]); 7699 free(hn_tx_taskque, M_DEVBUF); 7700 } 7701 7702 if (hn_vfmap != NULL) 7703 free(hn_vfmap, M_DEVBUF); 7704 rm_destroy(&hn_vfmap_lock); 7705 7706 counter_u64_free(hn_udpcs_fixup); 7707 } 7708 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7709
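
/*
 * The taskqueue knobs fixed up in hn_sysinit() are loader tunables.  As
 * an illustration (assuming the hw.hn.* tunable names registered by the
 * SYSCTL declarations earlier in this file), a loader.conf(5) entry such
 * as
 *
 *	hw.hn.tx_taskq_cnt="4"
 *
 * requests four TX taskqueues; hn_sysinit() clamps the count to the
 * range [1, mp_ncpus], and the shared taskqueues are only created when
 * hn_tx_taskq_mode selects HN_TX_TASKQ_M_GLOBAL and the system is
 * actually running on Hyper-V (vm_guest == VM_GUEST_HV).
 */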