1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 87 #include <machine/atomic.h> 88 #include <machine/in_cksum.h> 89 90 #include <net/bpf.h> 91 #include <net/ethernet.h> 92 #include <net/if.h> 93 #include <net/if_dl.h> 94 #include <net/if_media.h> 95 #include <net/if_types.h> 96 #include <net/if_var.h> 97 #include <net/rndis.h> 98 #ifdef RSS 99 #include <net/rss_config.h> 100 #endif 101 102 #include <netinet/in_systm.h> 103 #include <netinet/in.h> 104 #include <netinet/ip.h> 105 #include <netinet/ip6.h> 106 #include <netinet/tcp.h> 107 #include <netinet/tcp_lro.h> 108 #include <netinet/udp.h> 109 110 #include <dev/hyperv/include/hyperv.h> 111 #include <dev/hyperv/include/hyperv_busdma.h> 112 #include <dev/hyperv/include/vmbus.h> 113 #include <dev/hyperv/include/vmbus_xact.h> 114 115 #include <dev/hyperv/netvsc/ndis.h> 116 #include <dev/hyperv/netvsc/if_hnreg.h> 117 #include <dev/hyperv/netvsc/if_hnvar.h> 118 #include <dev/hyperv/netvsc/hn_nvs.h> 119 #include <dev/hyperv/netvsc/hn_rndis.h> 120 121 #include "vmbus_if.h" 122 123 #define HN_IFSTART_SUPPORT 124 125 #define HN_RING_CNT_DEF_MAX 8 126 127 #define HN_VFMAP_SIZE_DEF 8 128 129 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 130 131 /* YYY should get it from the underlying channel */ 132 #define HN_TX_DESC_CNT 512 133 134 #define HN_RNDIS_PKT_LEN \ 135 (sizeof(struct rndis_packet_msg) + \ 136 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 137 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 140 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 141 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 142 143 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 144 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 145 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 146 /* -1 for RNDIS packet message */ 147 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 148 149 #define HN_DIRECT_TX_SIZE_DEF 128 150 151 #define HN_EARLY_TXEOF_THRESH 8 152 153 #define HN_PKTBUF_LEN_DEF (16 * 1024) 154 155 #define HN_LROENT_CNT_DEF 128 156 157 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 158 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 159 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 160 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) 161 162 #define HN_LRO_ACKCNT_DEF 1 163 164 #define HN_LOCK_INIT(sc) \ 165 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 166 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 167 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 168 #define HN_LOCK(sc) \ 169 do { \ 170 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 171 /* Relinquish cpu to avoid deadlock */ \ 172 sched_relinquish(curthread); \ 173 DELAY(1000); \ 174 } \ 175 } while (0) 176 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 177 178 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 179 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 180 #define HN_CSUM_IP_HWASSIST(sc) \ 181 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 182 #define HN_CSUM_IP6_HWASSIST(sc) \ 183 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 184 185 #define HN_PKTSIZE_MIN(align) \ 186 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 187 HN_RNDIS_PKT_LEN, (align)) 188 #define HN_PKTSIZE(m, align) \ 189 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 190 191 #ifdef RSS 192 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 193 #else 194 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 195 #endif 196 197 struct hn_txdesc { 198 #ifndef HN_USE_TXDESC_BUFRING 199 SLIST_ENTRY(hn_txdesc) link; 200 #endif 201 STAILQ_ENTRY(hn_txdesc) agg_link; 202 203 /* Aggregated txdescs, in sending order. */ 204 STAILQ_HEAD(, hn_txdesc) agg_list; 205 206 /* The oldest packet, if transmission aggregation happens. */ 207 struct mbuf *m; 208 struct hn_tx_ring *txr; 209 int refs; 210 uint32_t flags; /* HN_TXD_FLAG_ */ 211 struct hn_nvs_sendctx send_ctx; 212 uint32_t chim_index; 213 int chim_size; 214 215 bus_dmamap_t data_dmap; 216 217 bus_addr_t rndis_pkt_paddr; 218 struct rndis_packet_msg *rndis_pkt; 219 bus_dmamap_t rndis_pkt_dmap; 220 }; 221 222 #define HN_TXD_FLAG_ONLIST 0x0001 223 #define HN_TXD_FLAG_DMAMAP 0x0002 224 #define HN_TXD_FLAG_ONAGG 0x0004 225 226 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 227 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 228 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 229 230 struct packet_info_id { 231 uint8_t ver; 232 uint8_t flag; 233 uint16_t pkt_id; 234 }; 235 236 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 237 238 239 struct hn_rxinfo { 240 const uint32_t *vlan_info; 241 const uint32_t *csum_info; 242 const uint32_t *hash_info; 243 const uint32_t *hash_value; 244 const struct packet_info_id *pktinfo_id; 245 }; 246 247 struct hn_rxvf_setarg { 248 struct hn_rx_ring *rxr; 249 struct ifnet *vf_ifp; 250 }; 251 252 #define HN_RXINFO_VLAN 0x0001 253 #define HN_RXINFO_CSUM 0x0002 254 #define HN_RXINFO_HASHINF 0x0004 255 #define HN_RXINFO_HASHVAL 0x0008 256 #define HN_RXINFO_PKTINFO_ID 0x0010 257 #define HN_RXINFO_ALL \ 258 (HN_RXINFO_VLAN | \ 259 HN_RXINFO_CSUM | \ 260 HN_RXINFO_HASHINF | \ 261 HN_RXINFO_HASHVAL | \ 262 HN_RXINFO_PKTINFO_ID) 263 264 static int hn_probe(device_t); 265 static int hn_attach(device_t); 266 static int hn_detach(device_t); 267 static int hn_shutdown(device_t); 268 static void hn_chan_callback(struct vmbus_channel *, 269 void *); 270 271 static void hn_init(void *); 272 static int hn_ioctl(struct ifnet *, u_long, caddr_t); 273 #ifdef HN_IFSTART_SUPPORT 274 static void hn_start(struct ifnet *); 275 #endif 276 static int hn_transmit(struct ifnet *, struct mbuf *); 277 static void hn_xmit_qflush(struct ifnet *); 278 static int hn_ifmedia_upd(struct 
ifnet *); 279 static void hn_ifmedia_sts(struct ifnet *, 280 struct ifmediareq *); 281 282 static void hn_ifnet_event(void *, struct ifnet *, int); 283 static void hn_ifaddr_event(void *, struct ifnet *); 284 static void hn_ifnet_attevent(void *, struct ifnet *); 285 static void hn_ifnet_detevent(void *, struct ifnet *); 286 static void hn_ifnet_lnkevent(void *, struct ifnet *, int); 287 288 static bool hn_ismyvf(const struct hn_softc *, 289 const struct ifnet *); 290 static void hn_rxvf_change(struct hn_softc *, 291 struct ifnet *, bool); 292 static void hn_rxvf_set(struct hn_softc *, struct ifnet *); 293 static void hn_rxvf_set_task(void *, int); 294 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); 295 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 296 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 297 struct ifreq *); 298 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 299 static bool hn_xpnt_vf_isready(struct hn_softc *); 300 static void hn_xpnt_vf_setready(struct hn_softc *); 301 static void hn_xpnt_vf_init_taskfunc(void *, int); 302 static void hn_xpnt_vf_init(struct hn_softc *); 303 static void hn_xpnt_vf_setenable(struct hn_softc *); 304 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 305 static void hn_vf_rss_fixup(struct hn_softc *, bool); 306 static void hn_vf_rss_restore(struct hn_softc *); 307 308 static int hn_rndis_rxinfo(const void *, int, 309 struct hn_rxinfo *); 310 static void hn_rndis_rx_data(struct hn_rx_ring *, 311 const void *, int); 312 static void hn_rndis_rx_status(struct hn_softc *, 313 const void *, int); 314 static void hn_rndis_init_fixat(struct hn_softc *, int); 315 316 static void hn_nvs_handle_notify(struct hn_softc *, 317 const struct vmbus_chanpkt_hdr *); 318 static void hn_nvs_handle_comp(struct hn_softc *, 319 struct vmbus_channel *, 320 const struct vmbus_chanpkt_hdr *); 321 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 322 struct vmbus_channel *, 323 const struct vmbus_chanpkt_hdr *); 324 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 325 struct vmbus_channel *, uint64_t); 326 327 #if __FreeBSD_version >= 1100099 328 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 329 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 330 #endif 331 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 333 #if __FreeBSD_version < 1100095 334 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); 335 #else 336 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 337 #endif 338 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 345 #ifndef RSS 346 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 348 #endif 349 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 356 static int 
hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 361 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 362 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 363 364 static void hn_stop(struct hn_softc *, bool); 365 static void hn_init_locked(struct hn_softc *); 366 static int hn_chan_attach(struct hn_softc *, 367 struct vmbus_channel *); 368 static void hn_chan_detach(struct hn_softc *, 369 struct vmbus_channel *); 370 static int hn_attach_subchans(struct hn_softc *); 371 static void hn_detach_allchans(struct hn_softc *); 372 static void hn_chan_rollup(struct hn_rx_ring *, 373 struct hn_tx_ring *); 374 static void hn_set_ring_inuse(struct hn_softc *, int); 375 static int hn_synth_attach(struct hn_softc *, int); 376 static void hn_synth_detach(struct hn_softc *); 377 static int hn_synth_alloc_subchans(struct hn_softc *, 378 int *); 379 static bool hn_synth_attachable(const struct hn_softc *); 380 static void hn_suspend(struct hn_softc *); 381 static void hn_suspend_data(struct hn_softc *); 382 static void hn_suspend_mgmt(struct hn_softc *); 383 static void hn_resume(struct hn_softc *); 384 static void hn_resume_data(struct hn_softc *); 385 static void hn_resume_mgmt(struct hn_softc *); 386 static void hn_suspend_mgmt_taskfunc(void *, int); 387 static void hn_chan_drain(struct hn_softc *, 388 struct vmbus_channel *); 389 static void hn_disable_rx(struct hn_softc *); 390 static void hn_drain_rxtx(struct hn_softc *, int); 391 static void hn_polling(struct hn_softc *, u_int); 392 static void hn_chan_polling(struct vmbus_channel *, u_int); 393 static void hn_mtu_change_fixup(struct hn_softc *); 394 395 static void hn_update_link_status(struct hn_softc *); 396 static void hn_change_network(struct hn_softc *); 397 static void hn_link_taskfunc(void *, int); 398 static void hn_netchg_init_taskfunc(void *, int); 399 static void hn_netchg_status_taskfunc(void *, int); 400 static void hn_link_status(struct hn_softc *); 401 402 static int hn_create_rx_data(struct hn_softc *, int); 403 static void hn_destroy_rx_data(struct hn_softc *); 404 static int hn_check_iplen(const struct mbuf *, int); 405 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 406 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 407 static int hn_rxfilter_config(struct hn_softc *); 408 static int hn_rss_reconfig(struct hn_softc *); 409 static void hn_rss_ind_fixup(struct hn_softc *); 410 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 411 static int hn_rxpkt(struct hn_rx_ring *); 412 static uint32_t hn_rss_type_fromndis(uint32_t); 413 static uint32_t hn_rss_type_tondis(uint32_t); 414 415 static int hn_tx_ring_create(struct hn_softc *, int); 416 static void hn_tx_ring_destroy(struct hn_tx_ring *); 417 static int hn_create_tx_data(struct hn_softc *, int); 418 static void hn_fixup_tx_data(struct hn_softc *); 419 static void hn_fixup_rx_data(struct hn_softc *); 420 static void hn_destroy_tx_data(struct hn_softc *); 421 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 422 static void hn_txdesc_gc(struct hn_tx_ring *, 423 struct hn_txdesc *); 424 static int hn_encap(struct ifnet *, struct hn_tx_ring *, 425 struct hn_txdesc *, struct mbuf **); 426 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, 427 struct hn_txdesc *); 428 static void hn_set_chim_size(struct hn_softc *, int); 429 
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
static void hn_set_txagg(struct hn_softc *);
static void *hn_try_txagg(struct ifnet *,
			struct hn_tx_ring *, struct hn_txdesc *,
			int);
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *,
			struct hn_softc *, struct vmbus_channel *,
			const void *, int);
static int hn_txpkt_sglist(struct hn_tx_ring *,
			struct hn_txdesc *);
static int hn_txpkt_chim(struct hn_tx_ring *,
			struct hn_txdesc *);
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int hn_start_locked(struct hn_tx_ring *, int);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof(struct hn_tx_ring *);
static void hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
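 *
 * For example, the fixup can be turned off by raising the tunable
 * defined below past that limit, e.g. hw.hn.udpcs_fixup_mtu="65537"
 * in loader.conf (the value is only illustrative; the knob is RWTUN,
 * so it can also be changed at runtime with sysctl).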
504 */ 505 static int hn_udpcs_fixup_mtu = 1420; 506 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 507 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 508 509 /* Limit TSO burst size */ 510 static int hn_tso_maxlen = IP_MAXPACKET; 511 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 512 &hn_tso_maxlen, 0, "TSO burst limit"); 513 514 /* Limit chimney send size */ 515 static int hn_tx_chimney_size = 0; 516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 517 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 518 519 /* Limit the size of packet for direct transmission */ 520 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 521 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 522 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 523 524 /* # of LRO entries per RX ring */ 525 #if defined(INET) || defined(INET6) 526 #if __FreeBSD_version >= 1100095 527 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 528 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 529 &hn_lro_entry_count, 0, "LRO entry count"); 530 #endif 531 #endif 532 533 static int hn_tx_taskq_cnt = 1; 534 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 535 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 536 537 #define HN_TX_TASKQ_M_INDEP 0 538 #define HN_TX_TASKQ_M_GLOBAL 1 539 #define HN_TX_TASKQ_M_EVTTQ 2 540 541 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 542 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 543 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 544 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 545 546 #ifndef HN_USE_TXDESC_BUFRING 547 static int hn_use_txdesc_bufring = 0; 548 #else 549 static int hn_use_txdesc_bufring = 1; 550 #endif 551 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 552 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 553 554 #ifdef HN_IFSTART_SUPPORT 555 /* Use ifnet.if_start instead of ifnet.if_transmit */ 556 static int hn_use_if_start = 0; 557 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 558 &hn_use_if_start, 0, "Use if_start TX method"); 559 #endif 560 561 /* # of channels to use */ 562 static int hn_chan_cnt = 0; 563 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 564 &hn_chan_cnt, 0, 565 "# of channels to use; each channel has one RX ring and one TX ring"); 566 567 /* # of transmit rings to use */ 568 static int hn_tx_ring_cnt = 0; 569 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 570 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 571 572 /* Software TX ring deptch */ 573 static int hn_tx_swq_depth = 0; 574 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 575 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 576 577 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 578 #if __FreeBSD_version >= 1100095 579 static u_int hn_lro_mbufq_depth = 0; 580 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 581 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 582 #endif 583 584 /* Packet transmission aggregation size limit */ 585 static int hn_tx_agg_size = -1; 586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 587 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 588 589 /* Packet transmission aggregation count limit */ 590 static int hn_tx_agg_pkts = -1; 591 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 592 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 593 594 /* VF list */ 595 SYSCTL_PROC(_hw_hn, 
OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for the transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for the transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt =
sc->hn_chim_bmap_cnt; 709 u_long *bmap = sc->hn_chim_bmap; 710 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 711 712 for (i = 0; i < bmap_cnt; ++i) { 713 int idx; 714 715 idx = ffsl(~bmap[i]); 716 if (idx == 0) 717 continue; 718 719 --idx; /* ffsl is 1-based */ 720 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 721 ("invalid i %d and idx %d", i, idx)); 722 723 if (atomic_testandset_long(&bmap[i], idx)) 724 continue; 725 726 ret = i * LONG_BIT + idx; 727 break; 728 } 729 return (ret); 730 } 731 732 static __inline void 733 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 734 { 735 u_long mask; 736 uint32_t idx; 737 738 idx = chim_idx / LONG_BIT; 739 KASSERT(idx < sc->hn_chim_bmap_cnt, 740 ("invalid chimney index 0x%x", chim_idx)); 741 742 mask = 1UL << (chim_idx % LONG_BIT); 743 KASSERT(sc->hn_chim_bmap[idx] & mask, 744 ("index bitmap 0x%lx, chimney index %u, " 745 "bitmap idx %d, bitmask 0x%lx", 746 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 747 748 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 749 } 750 751 #if defined(INET6) || defined(INET) 752 753 #define PULLUP_HDR(m, len) \ 754 do { \ 755 if (__predict_false((m)->m_len < (len))) { \ 756 (m) = m_pullup((m), (len)); \ 757 if ((m) == NULL) \ 758 return (NULL); \ 759 } \ 760 } while (0) 761 762 /* 763 * NOTE: If this function failed, the m_head would be freed. 764 */ 765 static __inline struct mbuf * 766 hn_tso_fixup(struct mbuf *m_head) 767 { 768 struct ether_vlan_header *evl; 769 struct tcphdr *th; 770 int ehlen; 771 772 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 773 774 PULLUP_HDR(m_head, sizeof(*evl)); 775 evl = mtod(m_head, struct ether_vlan_header *); 776 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 777 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 778 else 779 ehlen = ETHER_HDR_LEN; 780 m_head->m_pkthdr.l2hlen = ehlen; 781 782 #ifdef INET 783 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 784 struct ip *ip; 785 int iphlen; 786 787 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 788 ip = mtodo(m_head, ehlen); 789 iphlen = ip->ip_hl << 2; 790 m_head->m_pkthdr.l3hlen = iphlen; 791 792 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 793 th = mtodo(m_head, ehlen + iphlen); 794 795 ip->ip_len = 0; 796 ip->ip_sum = 0; 797 th->th_sum = in_pseudo(ip->ip_src.s_addr, 798 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 799 } 800 #endif 801 #if defined(INET6) && defined(INET) 802 else 803 #endif 804 #ifdef INET6 805 { 806 struct ip6_hdr *ip6; 807 808 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 809 ip6 = mtodo(m_head, ehlen); 810 if (ip6->ip6_nxt != IPPROTO_TCP) { 811 m_freem(m_head); 812 return (NULL); 813 } 814 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 815 816 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 817 th = mtodo(m_head, ehlen + sizeof(*ip6)); 818 819 ip6->ip6_plen = 0; 820 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 821 } 822 #endif 823 return (m_head); 824 } 825 826 /* 827 * NOTE: If this function failed, the m_head would be freed. 
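 *
 * hn_set_hlen() records the Ethernet and IP header lengths in the
 * mbuf packet header (l2hlen/l3hlen) for the TX path, and falls back
 * to a software UDP checksum when a datagram would hit the Azure UDP
 * checksum offload limitation described in the function body below.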
828 */ 829 static __inline struct mbuf * 830 hn_set_hlen(struct mbuf *m_head) 831 { 832 const struct ether_vlan_header *evl; 833 int ehlen; 834 835 PULLUP_HDR(m_head, sizeof(*evl)); 836 evl = mtod(m_head, const struct ether_vlan_header *); 837 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 838 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 839 else 840 ehlen = ETHER_HDR_LEN; 841 m_head->m_pkthdr.l2hlen = ehlen; 842 843 #ifdef INET 844 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 845 const struct ip *ip; 846 int iphlen; 847 848 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 849 ip = mtodo(m_head, ehlen); 850 iphlen = ip->ip_hl << 2; 851 m_head->m_pkthdr.l3hlen = iphlen; 852 853 /* 854 * UDP checksum offload does not work in Azure, if the 855 * following conditions meet: 856 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 857 * - IP_DF is not set in the IP hdr. 858 * 859 * Fallback to software checksum for these UDP datagrams. 860 */ 861 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 862 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 863 (ntohs(ip->ip_off) & IP_DF) == 0) { 864 uint16_t off = ehlen + iphlen; 865 866 counter_u64_add(hn_udpcs_fixup, 1); 867 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 868 *(uint16_t *)(m_head->m_data + off + 869 m_head->m_pkthdr.csum_data) = in_cksum_skip( 870 m_head, m_head->m_pkthdr.len, off); 871 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 872 } 873 } 874 #endif 875 #if defined(INET6) && defined(INET) 876 else 877 #endif 878 #ifdef INET6 879 { 880 const struct ip6_hdr *ip6; 881 882 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 883 ip6 = mtodo(m_head, ehlen); 884 if (ip6->ip6_nxt != IPPROTO_TCP && 885 ip6->ip6_nxt != IPPROTO_UDP) { 886 m_freem(m_head); 887 return (NULL); 888 } 889 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 890 } 891 #endif 892 return (m_head); 893 } 894 895 /* 896 * NOTE: If this function failed, the m_head would be freed. 897 */ 898 static __inline struct mbuf * 899 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 900 { 901 const struct tcphdr *th; 902 int ehlen, iphlen; 903 904 *tcpsyn = 0; 905 ehlen = m_head->m_pkthdr.l2hlen; 906 iphlen = m_head->m_pkthdr.l3hlen; 907 908 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 909 th = mtodo(m_head, ehlen + iphlen); 910 if (th->th_flags & TH_SYN) 911 *tcpsyn = 1; 912 return (m_head); 913 } 914 915 #undef PULLUP_HDR 916 917 #endif /* INET6 || INET */ 918 919 static int 920 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 921 { 922 int error = 0; 923 924 HN_LOCK_ASSERT(sc); 925 926 if (sc->hn_rx_filter != filter) { 927 error = hn_rndis_set_rxfilter(sc, filter); 928 if (!error) 929 sc->hn_rx_filter = filter; 930 } 931 return (error); 932 } 933 934 static int 935 hn_rxfilter_config(struct hn_softc *sc) 936 { 937 struct ifnet *ifp = sc->hn_ifp; 938 uint32_t filter; 939 940 HN_LOCK_ASSERT(sc); 941 942 /* 943 * If the non-transparent mode VF is activated, we don't know how 944 * its RX filter is configured, so stick the synthetic device in 945 * the promiscous mode. 
946 */ 947 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 948 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 949 } else { 950 filter = NDIS_PACKET_TYPE_DIRECTED; 951 if (ifp->if_flags & IFF_BROADCAST) 952 filter |= NDIS_PACKET_TYPE_BROADCAST; 953 /* TODO: support multicast list */ 954 if ((ifp->if_flags & IFF_ALLMULTI) || 955 !CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 956 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 957 } 958 return (hn_set_rxfilter(sc, filter)); 959 } 960 961 static void 962 hn_set_txagg(struct hn_softc *sc) 963 { 964 uint32_t size, pkts; 965 int i; 966 967 /* 968 * Setup aggregation size. 969 */ 970 if (sc->hn_agg_size < 0) 971 size = UINT32_MAX; 972 else 973 size = sc->hn_agg_size; 974 975 if (sc->hn_rndis_agg_size < size) 976 size = sc->hn_rndis_agg_size; 977 978 /* NOTE: We only aggregate packets using chimney sending buffers. */ 979 if (size > (uint32_t)sc->hn_chim_szmax) 980 size = sc->hn_chim_szmax; 981 982 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 983 /* Disable */ 984 size = 0; 985 pkts = 0; 986 goto done; 987 } 988 989 /* NOTE: Type of the per TX ring setting is 'int'. */ 990 if (size > INT_MAX) 991 size = INT_MAX; 992 993 /* 994 * Setup aggregation packet count. 995 */ 996 if (sc->hn_agg_pkts < 0) 997 pkts = UINT32_MAX; 998 else 999 pkts = sc->hn_agg_pkts; 1000 1001 if (sc->hn_rndis_agg_pkts < pkts) 1002 pkts = sc->hn_rndis_agg_pkts; 1003 1004 if (pkts <= 1) { 1005 /* Disable */ 1006 size = 0; 1007 pkts = 0; 1008 goto done; 1009 } 1010 1011 /* NOTE: Type of the per TX ring setting is 'short'. */ 1012 if (pkts > SHRT_MAX) 1013 pkts = SHRT_MAX; 1014 1015 done: 1016 /* NOTE: Type of the per TX ring setting is 'short'. */ 1017 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1018 /* Disable */ 1019 size = 0; 1020 pkts = 0; 1021 } 1022 1023 if (bootverbose) { 1024 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1025 size, pkts, sc->hn_rndis_agg_align); 1026 } 1027 1028 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1029 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1030 1031 mtx_lock(&txr->hn_tx_lock); 1032 txr->hn_agg_szmax = size; 1033 txr->hn_agg_pktmax = pkts; 1034 txr->hn_agg_align = sc->hn_rndis_agg_align; 1035 mtx_unlock(&txr->hn_tx_lock); 1036 } 1037 } 1038 1039 static int 1040 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1041 { 1042 1043 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1044 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1045 return txr->hn_txdesc_cnt; 1046 return hn_tx_swq_depth; 1047 } 1048 1049 static int 1050 hn_rss_reconfig(struct hn_softc *sc) 1051 { 1052 int error; 1053 1054 HN_LOCK_ASSERT(sc); 1055 1056 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1057 return (ENXIO); 1058 1059 /* 1060 * Disable RSS first. 1061 * 1062 * NOTE: 1063 * Direct reconfiguration by setting the UNCHG flags does 1064 * _not_ work properly. 1065 */ 1066 if (bootverbose) 1067 if_printf(sc->hn_ifp, "disable RSS\n"); 1068 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1069 if (error) { 1070 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1071 return (error); 1072 } 1073 1074 /* 1075 * Reenable the RSS w/ the updated RSS key or indirect 1076 * table. 
1077 */ 1078 if (bootverbose) 1079 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1080 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1081 if (error) { 1082 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1083 return (error); 1084 } 1085 return (0); 1086 } 1087 1088 static void 1089 hn_rss_ind_fixup(struct hn_softc *sc) 1090 { 1091 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1092 int i, nchan; 1093 1094 nchan = sc->hn_rx_ring_inuse; 1095 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1096 1097 /* 1098 * Check indirect table to make sure that all channels in it 1099 * can be used. 1100 */ 1101 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1102 if (rss->rss_ind[i] >= nchan) { 1103 if_printf(sc->hn_ifp, 1104 "RSS indirect table %d fixup: %u -> %d\n", 1105 i, rss->rss_ind[i], nchan - 1); 1106 rss->rss_ind[i] = nchan - 1; 1107 } 1108 } 1109 } 1110 1111 static int 1112 hn_ifmedia_upd(struct ifnet *ifp __unused) 1113 { 1114 1115 return EOPNOTSUPP; 1116 } 1117 1118 static void 1119 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) 1120 { 1121 struct hn_softc *sc = ifp->if_softc; 1122 1123 ifmr->ifm_status = IFM_AVALID; 1124 ifmr->ifm_active = IFM_ETHER; 1125 1126 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1127 ifmr->ifm_active |= IFM_NONE; 1128 return; 1129 } 1130 ifmr->ifm_status |= IFM_ACTIVE; 1131 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1132 } 1133 1134 static void 1135 hn_rxvf_set_task(void *xarg, int pending __unused) 1136 { 1137 struct hn_rxvf_setarg *arg = xarg; 1138 1139 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1140 } 1141 1142 static void 1143 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) 1144 { 1145 struct hn_rx_ring *rxr; 1146 struct hn_rxvf_setarg arg; 1147 struct task task; 1148 int i; 1149 1150 HN_LOCK_ASSERT(sc); 1151 1152 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1153 1154 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1155 rxr = &sc->hn_rx_ring[i]; 1156 1157 if (i < sc->hn_rx_ring_inuse) { 1158 arg.rxr = rxr; 1159 arg.vf_ifp = vf_ifp; 1160 vmbus_chan_run_task(rxr->hn_chan, &task); 1161 } else { 1162 rxr->hn_rxvf_ifp = vf_ifp; 1163 } 1164 } 1165 } 1166 1167 static bool 1168 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) 1169 { 1170 const struct ifnet *hn_ifp; 1171 1172 hn_ifp = sc->hn_ifp; 1173 1174 if (ifp == hn_ifp) 1175 return (false); 1176 1177 if (ifp->if_alloctype != IFT_ETHER) 1178 return (false); 1179 1180 /* Ignore lagg/vlan interfaces */ 1181 if (strcmp(ifp->if_dname, "lagg") == 0 || 1182 strcmp(ifp->if_dname, "vlan") == 0) 1183 return (false); 1184 1185 /* 1186 * During detach events ifp->if_addr might be NULL. 
1187 * Make sure the bcmp() below doesn't panic on that: 1188 */ 1189 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL) 1190 return (false); 1191 1192 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) 1193 return (false); 1194 1195 return (true); 1196 } 1197 1198 static void 1199 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) 1200 { 1201 struct ifnet *hn_ifp; 1202 1203 HN_LOCK(sc); 1204 1205 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1206 goto out; 1207 1208 if (!hn_ismyvf(sc, ifp)) 1209 goto out; 1210 hn_ifp = sc->hn_ifp; 1211 1212 if (rxvf) { 1213 if (sc->hn_flags & HN_FLAG_RXVF) 1214 goto out; 1215 1216 sc->hn_flags |= HN_FLAG_RXVF; 1217 hn_rxfilter_config(sc); 1218 } else { 1219 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1220 goto out; 1221 1222 sc->hn_flags &= ~HN_FLAG_RXVF; 1223 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) 1224 hn_rxfilter_config(sc); 1225 else 1226 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1227 } 1228 1229 hn_nvs_set_datapath(sc, 1230 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1231 1232 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1233 1234 if (rxvf) { 1235 hn_vf_rss_fixup(sc, true); 1236 hn_suspend_mgmt(sc); 1237 sc->hn_link_flags &= 1238 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1239 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1240 } else { 1241 hn_vf_rss_restore(sc); 1242 hn_resume_mgmt(sc); 1243 } 1244 1245 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, 1246 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1247 1248 if (bootverbose) { 1249 if_printf(hn_ifp, "datapath is switched %s %s\n", 1250 rxvf ? "to" : "from", ifp->if_xname); 1251 } 1252 out: 1253 HN_UNLOCK(sc); 1254 } 1255 1256 static void 1257 hn_ifnet_event(void *arg, struct ifnet *ifp, int event) 1258 { 1259 1260 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1261 return; 1262 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1263 } 1264 1265 static void 1266 hn_ifaddr_event(void *arg, struct ifnet *ifp) 1267 { 1268 1269 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); 1270 } 1271 1272 static int 1273 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1274 { 1275 struct ifnet *ifp, *vf_ifp; 1276 uint64_t tmp; 1277 int error; 1278 1279 HN_LOCK_ASSERT(sc); 1280 ifp = sc->hn_ifp; 1281 vf_ifp = sc->hn_vf_ifp; 1282 1283 /* 1284 * Fix up requested capabilities w/ supported capabilities, 1285 * since the supported capabilities could have been changed. 1286 */ 1287 ifr->ifr_reqcap &= ifp->if_capabilities; 1288 /* Pass SIOCSIFCAP to VF. */ 1289 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); 1290 1291 /* 1292 * NOTE: 1293 * The error will be propagated to the callers, however, it 1294 * is _not_ useful here. 1295 */ 1296 1297 /* 1298 * Merge VF's enabled capabilities. 
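	 *
	 * The if_hwassist updates below mirror this merge: each
	 * checksum/TSO assist bit is copied from the VF only while the
	 * corresponding capability (IFCAP_TXCSUM, IFCAP_TXCSUM_IPV6,
	 * IFCAP_TSO4, IFCAP_TSO6) remains enabled; otherwise the bit
	 * is cleared.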
1299 */ 1300 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; 1301 1302 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); 1303 if (ifp->if_capenable & IFCAP_TXCSUM) 1304 ifp->if_hwassist |= tmp; 1305 else 1306 ifp->if_hwassist &= ~tmp; 1307 1308 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); 1309 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 1310 ifp->if_hwassist |= tmp; 1311 else 1312 ifp->if_hwassist &= ~tmp; 1313 1314 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; 1315 if (ifp->if_capenable & IFCAP_TSO4) 1316 ifp->if_hwassist |= tmp; 1317 else 1318 ifp->if_hwassist &= ~tmp; 1319 1320 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; 1321 if (ifp->if_capenable & IFCAP_TSO6) 1322 ifp->if_hwassist |= tmp; 1323 else 1324 ifp->if_hwassist &= ~tmp; 1325 1326 return (error); 1327 } 1328 1329 static int 1330 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1331 { 1332 struct ifnet *vf_ifp; 1333 struct ifreq ifr; 1334 1335 HN_LOCK_ASSERT(sc); 1336 vf_ifp = sc->hn_vf_ifp; 1337 1338 memset(&ifr, 0, sizeof(ifr)); 1339 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); 1340 ifr.ifr_flags = vf_ifp->if_flags & 0xffff; 1341 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; 1342 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); 1343 } 1344 1345 static void 1346 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1347 { 1348 struct ifnet *ifp = sc->hn_ifp; 1349 int allmulti = 0; 1350 1351 HN_LOCK_ASSERT(sc); 1352 1353 /* XXX vlan(4) style mcast addr maintenance */ 1354 if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) 1355 allmulti = IFF_ALLMULTI; 1356 1357 /* Always set the VF's if_flags */ 1358 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; 1359 } 1360 1361 static void 1362 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) 1363 { 1364 struct rm_priotracker pt; 1365 struct ifnet *hn_ifp = NULL; 1366 struct mbuf *mn; 1367 1368 /* 1369 * XXX racy, if hn(4) ever detached. 1370 */ 1371 rm_rlock(&hn_vfmap_lock, &pt); 1372 if (vf_ifp->if_index < hn_vfmap_size) 1373 hn_ifp = hn_vfmap[vf_ifp->if_index]; 1374 rm_runlock(&hn_vfmap_lock, &pt); 1375 1376 if (hn_ifp != NULL) { 1377 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1378 /* 1379 * Allow tapping on the VF. 1380 */ 1381 ETHER_BPF_MTAP(vf_ifp, mn); 1382 1383 /* 1384 * Update VF stats. 1385 */ 1386 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { 1387 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1388 mn->m_pkthdr.len); 1389 } 1390 /* 1391 * XXX IFCOUNTER_IMCAST 1392 * This stat updating is kinda invasive, since it 1393 * requires two checks on the mbuf: the length check 1394 * and the ethernet header check. As of this write, 1395 * all multicast packets go directly to hn(4), which 1396 * makes imcast stat updating in the VF a try in vian. 1397 */ 1398 1399 /* 1400 * Fix up rcvif and increase hn(4)'s ipackets. 1401 */ 1402 mn->m_pkthdr.rcvif = hn_ifp; 1403 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1404 } 1405 /* 1406 * Go through hn(4)'s if_input. 1407 */ 1408 hn_ifp->if_input(hn_ifp, m); 1409 } else { 1410 /* 1411 * In the middle of the transition; free this 1412 * mbuf chain. 
1413 */ 1414 while (m != NULL) { 1415 mn = m->m_nextpkt; 1416 m->m_nextpkt = NULL; 1417 m_freem(m); 1418 m = mn; 1419 } 1420 } 1421 } 1422 1423 static void 1424 hn_mtu_change_fixup(struct hn_softc *sc) 1425 { 1426 struct ifnet *ifp; 1427 1428 HN_LOCK_ASSERT(sc); 1429 ifp = sc->hn_ifp; 1430 1431 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); 1432 #if __FreeBSD_version >= 1100099 1433 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1434 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1435 #endif 1436 } 1437 1438 static uint32_t 1439 hn_rss_type_fromndis(uint32_t rss_hash) 1440 { 1441 uint32_t types = 0; 1442 1443 if (rss_hash & NDIS_HASH_IPV4) 1444 types |= RSS_TYPE_IPV4; 1445 if (rss_hash & NDIS_HASH_TCP_IPV4) 1446 types |= RSS_TYPE_TCP_IPV4; 1447 if (rss_hash & NDIS_HASH_IPV6) 1448 types |= RSS_TYPE_IPV6; 1449 if (rss_hash & NDIS_HASH_IPV6_EX) 1450 types |= RSS_TYPE_IPV6_EX; 1451 if (rss_hash & NDIS_HASH_TCP_IPV6) 1452 types |= RSS_TYPE_TCP_IPV6; 1453 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1454 types |= RSS_TYPE_TCP_IPV6_EX; 1455 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1456 types |= RSS_TYPE_UDP_IPV4; 1457 return (types); 1458 } 1459 1460 static uint32_t 1461 hn_rss_type_tondis(uint32_t types) 1462 { 1463 uint32_t rss_hash = 0; 1464 1465 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1466 ("UDP6 and UDP6EX are not supported")); 1467 1468 if (types & RSS_TYPE_IPV4) 1469 rss_hash |= NDIS_HASH_IPV4; 1470 if (types & RSS_TYPE_TCP_IPV4) 1471 rss_hash |= NDIS_HASH_TCP_IPV4; 1472 if (types & RSS_TYPE_IPV6) 1473 rss_hash |= NDIS_HASH_IPV6; 1474 if (types & RSS_TYPE_IPV6_EX) 1475 rss_hash |= NDIS_HASH_IPV6_EX; 1476 if (types & RSS_TYPE_TCP_IPV6) 1477 rss_hash |= NDIS_HASH_TCP_IPV6; 1478 if (types & RSS_TYPE_TCP_IPV6_EX) 1479 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1480 if (types & RSS_TYPE_UDP_IPV4) 1481 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1482 return (rss_hash); 1483 } 1484 1485 static void 1486 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1487 { 1488 int i; 1489 1490 HN_LOCK_ASSERT(sc); 1491 1492 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1493 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1494 } 1495 1496 static void 1497 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1498 { 1499 struct ifnet *ifp, *vf_ifp; 1500 struct ifrsshash ifrh; 1501 struct ifrsskey ifrk; 1502 int error; 1503 uint32_t my_types, diff_types, mbuf_types = 0; 1504 1505 HN_LOCK_ASSERT(sc); 1506 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1507 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1508 1509 if (sc->hn_rx_ring_inuse == 1) { 1510 /* No RSS on synthetic parts; done. */ 1511 return; 1512 } 1513 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1514 /* Synthetic parts do not support Toeplitz; done. */ 1515 return; 1516 } 1517 1518 ifp = sc->hn_ifp; 1519 vf_ifp = sc->hn_vf_ifp; 1520 1521 /* 1522 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1523 * supported. 
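	 *
	 * The key is queried from the VF with SIOCGIFRSSKEY; anything
	 * other than a Toeplitz key of NDIS_HASH_KEYSIZE_TOEPLITZ bytes
	 * makes us skip the fixup.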
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1616 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1617 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1618 } 1619 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1620 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1621 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1622 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1623 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1624 } 1625 if ((my_types & RSS_TYPE_UDP_IPV6) && 1626 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1627 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1628 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1629 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1630 } 1631 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1632 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1633 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1634 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1635 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1636 } 1637 1638 /* 1639 * Indirect table does not matter. 1640 */ 1641 1642 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1643 hn_rss_type_tondis(my_types); 1644 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1645 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1646 1647 if (reconf) { 1648 error = hn_rss_reconfig(sc); 1649 if (error) { 1650 /* XXX roll-back? */ 1651 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1652 /* XXX keep going. */ 1653 } 1654 } 1655 done: 1656 /* Hash deliverability for mbufs. */ 1657 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1658 } 1659 1660 static void 1661 hn_vf_rss_restore(struct hn_softc *sc) 1662 { 1663 1664 HN_LOCK_ASSERT(sc); 1665 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1666 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname)); 1667 1668 if (sc->hn_rx_ring_inuse == 1) 1669 goto done; 1670 1671 /* 1672 * Restore hash types. Key does _not_ matter. 1673 */ 1674 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1675 int error; 1676 1677 sc->hn_rss_hash = sc->hn_rss_hcap; 1678 error = hn_rss_reconfig(sc); 1679 if (error) { 1680 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1681 error); 1682 /* XXX keep going. */ 1683 } 1684 } 1685 done: 1686 /* Hash deliverability for mbufs. */ 1687 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1688 } 1689 1690 static void 1691 hn_xpnt_vf_setready(struct hn_softc *sc) 1692 { 1693 struct ifnet *ifp, *vf_ifp; 1694 struct ifreq ifr; 1695 1696 HN_LOCK_ASSERT(sc); 1697 ifp = sc->hn_ifp; 1698 vf_ifp = sc->hn_vf_ifp; 1699 1700 /* 1701 * Mark the VF ready. 1702 */ 1703 sc->hn_vf_rdytick = 0; 1704 1705 /* 1706 * Save information for restoration. 1707 */ 1708 sc->hn_saved_caps = ifp->if_capabilities; 1709 sc->hn_saved_tsomax = ifp->if_hw_tsomax; 1710 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; 1711 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; 1712 1713 /* 1714 * Intersect supported/enabled capabilities. 1715 * 1716 * NOTE: 1717 * if_hwassist is not changed here. 1718 */ 1719 ifp->if_capabilities &= vf_ifp->if_capabilities; 1720 ifp->if_capenable &= ifp->if_capabilities; 1721 1722 /* 1723 * Fix TSO settings. 1724 */ 1725 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) 1726 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; 1727 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) 1728 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; 1729 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) 1730 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; 1731 1732 /* 1733 * Change VF's enabled capabilities. 
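	 *
	 * The merged if_capenable computed above is pushed down to the
	 * VF through hn_xpnt_vf_iocsetcaps(), i.e. a SIOCSIFCAP issued
	 * directly on the VF's if_ioctl.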
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}

static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate the RSS key during their initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled.
*/ 1864 hn_xpnt_vf_setenable(sc); 1865 } 1866 1867 static void 1868 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1869 { 1870 struct hn_softc *sc = xsc; 1871 1872 HN_LOCK(sc); 1873 1874 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1875 goto done; 1876 if (sc->hn_vf_ifp == NULL) 1877 goto done; 1878 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1879 goto done; 1880 1881 if (sc->hn_vf_rdytick != 0) { 1882 /* Mark VF as ready. */ 1883 hn_xpnt_vf_setready(sc); 1884 } 1885 1886 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { 1887 /* 1888 * Delayed VF initialization. 1889 */ 1890 if (bootverbose) { 1891 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1892 sc->hn_vf_ifp->if_xname); 1893 } 1894 hn_xpnt_vf_init(sc); 1895 } 1896 done: 1897 HN_UNLOCK(sc); 1898 } 1899 1900 static void 1901 hn_ifnet_attevent(void *xsc, struct ifnet *ifp) 1902 { 1903 struct hn_softc *sc = xsc; 1904 1905 HN_LOCK(sc); 1906 1907 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1908 goto done; 1909 1910 if (!hn_ismyvf(sc, ifp)) 1911 goto done; 1912 1913 if (sc->hn_vf_ifp != NULL) { 1914 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1915 sc->hn_vf_ifp->if_xname); 1916 goto done; 1917 } 1918 1919 if (hn_xpnt_vf && ifp->if_start != NULL) { 1920 /* 1921 * ifnet.if_start is _not_ supported by transparent 1922 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1923 */ 1924 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1925 "in transparent VF mode.\n", ifp->if_xname); 1926 goto done; 1927 } 1928 1929 rm_wlock(&hn_vfmap_lock); 1930 1931 if (ifp->if_index >= hn_vfmap_size) { 1932 struct ifnet **newmap; 1933 int newsize; 1934 1935 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; 1936 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, 1937 M_WAITOK | M_ZERO); 1938 1939 memcpy(newmap, hn_vfmap, 1940 sizeof(struct ifnet *) * hn_vfmap_size); 1941 free(hn_vfmap, M_DEVBUF); 1942 hn_vfmap = newmap; 1943 hn_vfmap_size = newsize; 1944 } 1945 KASSERT(hn_vfmap[ifp->if_index] == NULL, 1946 ("%s: ifindex %d was mapped to %s", 1947 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); 1948 hn_vfmap[ifp->if_index] = sc->hn_ifp; 1949 1950 rm_wunlock(&hn_vfmap_lock); 1951 1952 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1953 rm_wlock(&sc->hn_vf_lock); 1954 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1955 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); 1956 sc->hn_vf_ifp = ifp; 1957 rm_wunlock(&sc->hn_vf_lock); 1958 1959 if (hn_xpnt_vf) { 1960 int wait_ticks; 1961 1962 /* 1963 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1964 * Save vf_ifp's current if_input for later restoration. 1965 */ 1966 sc->hn_vf_input = ifp->if_input; 1967 ifp->if_input = hn_xpnt_vf_input; 1968 1969 /* 1970 * Stop link status management; use the VF's. 1971 */ 1972 hn_suspend_mgmt(sc); 1973 1974 /* 1975 * Give VF sometime to complete its attach routing. 1976 */ 1977 wait_ticks = hn_xpnt_vf_attwait * hz; 1978 sc->hn_vf_rdytick = ticks + wait_ticks; 1979 1980 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1981 wait_ticks); 1982 } 1983 done: 1984 HN_UNLOCK(sc); 1985 } 1986 1987 static void 1988 hn_ifnet_detevent(void *xsc, struct ifnet *ifp) 1989 { 1990 struct hn_softc *sc = xsc; 1991 1992 HN_LOCK(sc); 1993 1994 if (sc->hn_vf_ifp == NULL) 1995 goto done; 1996 1997 if (!hn_ismyvf(sc, ifp)) 1998 goto done; 1999 2000 if (hn_xpnt_vf) { 2001 /* 2002 * Make sure that the delayed initialization is not running. 
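 * The delayed initialization is the hn_vf_init timeout task that
 * hn_ifnet_attevent() schedules on hn_vf_taskq; it is drained below.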
2003 * 2004 * NOTE: 2005 * - This lock _must_ be released, since the hn_vf_init task 2006 * will try holding this lock. 2007 * - It is safe to release this lock here, since the 2008 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 2009 * 2010 * XXX racy, if hn(4) ever detached. 2011 */ 2012 HN_UNLOCK(sc); 2013 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2014 HN_LOCK(sc); 2015 2016 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2017 sc->hn_ifp->if_xname)); 2018 ifp->if_input = sc->hn_vf_input; 2019 sc->hn_vf_input = NULL; 2020 2021 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2022 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2023 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2024 2025 if (sc->hn_vf_rdytick == 0) { 2026 /* 2027 * The VF was ready; restore some settings. 2028 */ 2029 sc->hn_ifp->if_capabilities = sc->hn_saved_caps; 2030 /* 2031 * NOTE: 2032 * There is _no_ need to fixup if_capenable and 2033 * if_hwassist, since the if_capabilities before 2034 * restoration was an intersection of the VF's 2035 * if_capabilites and the synthetic device's 2036 * if_capabilites. 2037 */ 2038 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; 2039 sc->hn_ifp->if_hw_tsomaxsegcount = 2040 sc->hn_saved_tsosegcnt; 2041 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; 2042 } 2043 2044 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2045 /* 2046 * Restore RSS settings. 2047 */ 2048 hn_vf_rss_restore(sc); 2049 2050 /* 2051 * Resume link status management, which was suspended 2052 * by hn_ifnet_attevent(). 2053 */ 2054 hn_resume_mgmt(sc); 2055 } 2056 } 2057 2058 /* Mark transparent mode VF as disabled. */ 2059 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2060 2061 rm_wlock(&hn_vfmap_lock); 2062 2063 KASSERT(ifp->if_index < hn_vfmap_size, 2064 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); 2065 if (hn_vfmap[ifp->if_index] != NULL) { 2066 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, 2067 ("%s: ifindex %d was mapped to %s", 2068 ifp->if_xname, ifp->if_index, 2069 hn_vfmap[ifp->if_index]->if_xname)); 2070 hn_vfmap[ifp->if_index] = NULL; 2071 } 2072 2073 rm_wunlock(&hn_vfmap_lock); 2074 done: 2075 HN_UNLOCK(sc); 2076 } 2077 2078 static void 2079 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) 2080 { 2081 struct hn_softc *sc = xsc; 2082 2083 if (sc->hn_vf_ifp == ifp) 2084 if_link_state_change(sc->hn_ifp, link_state); 2085 } 2086 2087 static int 2088 hn_probe(device_t dev) 2089 { 2090 2091 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2092 device_set_desc(dev, "Hyper-V Network Interface"); 2093 return BUS_PROBE_DEFAULT; 2094 } 2095 return ENXIO; 2096 } 2097 2098 static int 2099 hn_attach(device_t dev) 2100 { 2101 struct hn_softc *sc = device_get_softc(dev); 2102 struct sysctl_oid_list *child; 2103 struct sysctl_ctx_list *ctx; 2104 uint8_t eaddr[ETHER_ADDR_LEN]; 2105 struct ifnet *ifp = NULL; 2106 int error, ring_cnt, tx_ring_cnt; 2107 uint32_t mtu; 2108 2109 sc->hn_dev = dev; 2110 sc->hn_prichan = vmbus_get_channel(dev); 2111 HN_LOCK_INIT(sc); 2112 rm_init(&sc->hn_vf_lock, "hnvf"); 2113 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2114 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2115 2116 /* 2117 * Initialize these tunables once. 2118 */ 2119 sc->hn_agg_size = hn_tx_agg_size; 2120 sc->hn_agg_pkts = hn_tx_agg_pkts; 2121 2122 /* 2123 * Setup taskqueue for transmission. 
2124 */ 2125 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2126 int i; 2127 2128 sc->hn_tx_taskqs = 2129 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2130 M_DEVBUF, M_WAITOK); 2131 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2132 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2133 M_WAITOK, taskqueue_thread_enqueue, 2134 &sc->hn_tx_taskqs[i]); 2135 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2136 "%s tx%d", device_get_nameunit(dev), i); 2137 } 2138 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2139 sc->hn_tx_taskqs = hn_tx_taskque; 2140 } 2141 2142 /* 2143 * Setup taskqueue for mangement tasks, e.g. link status. 2144 */ 2145 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2146 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2147 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2148 device_get_nameunit(dev)); 2149 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2150 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2151 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2152 hn_netchg_status_taskfunc, sc); 2153 2154 if (hn_xpnt_vf) { 2155 /* 2156 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2157 */ 2158 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2159 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2160 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2161 device_get_nameunit(dev)); 2162 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2163 hn_xpnt_vf_init_taskfunc, sc); 2164 } 2165 2166 /* 2167 * Allocate ifnet and setup its name earlier, so that if_printf 2168 * can be used by functions, which will be called after 2169 * ether_ifattach(). 2170 */ 2171 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2172 ifp->if_softc = sc; 2173 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2174 2175 /* 2176 * Initialize ifmedia earlier so that it can be unconditionally 2177 * destroyed, if error happened later on. 2178 */ 2179 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2180 2181 /* 2182 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2183 * to use (tx_ring_cnt). 2184 * 2185 * NOTE: 2186 * The # of RX rings to use is same as the # of channels to use. 2187 */ 2188 ring_cnt = hn_chan_cnt; 2189 if (ring_cnt <= 0) { 2190 /* Default */ 2191 ring_cnt = mp_ncpus; 2192 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2193 ring_cnt = HN_RING_CNT_DEF_MAX; 2194 } else if (ring_cnt > mp_ncpus) { 2195 ring_cnt = mp_ncpus; 2196 } 2197 #ifdef RSS 2198 if (ring_cnt > rss_getnumbuckets()) 2199 ring_cnt = rss_getnumbuckets(); 2200 #endif 2201 2202 tx_ring_cnt = hn_tx_ring_cnt; 2203 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2204 tx_ring_cnt = ring_cnt; 2205 #ifdef HN_IFSTART_SUPPORT 2206 if (hn_use_if_start) { 2207 /* ifnet.if_start only needs one TX ring. */ 2208 tx_ring_cnt = 1; 2209 } 2210 #endif 2211 2212 /* 2213 * Set the leader CPU for channels. 2214 */ 2215 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2216 2217 /* 2218 * Create enough TX/RX rings, even if only limited number of 2219 * channels can be allocated. 2220 */ 2221 error = hn_create_tx_data(sc, tx_ring_cnt); 2222 if (error) 2223 goto failed; 2224 error = hn_create_rx_data(sc, ring_cnt); 2225 if (error) 2226 goto failed; 2227 2228 /* 2229 * Create transaction context for NVS and RNDIS transactions. 
2230 */ 2231 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2232 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2233 if (sc->hn_xact == NULL) { 2234 error = ENXIO; 2235 goto failed; 2236 } 2237 2238 /* 2239 * Install orphan handler for the revocation of this device's 2240 * primary channel. 2241 * 2242 * NOTE: 2243 * The processing order is critical here: 2244 * Install the orphan handler, _before_ testing whether this 2245 * device's primary channel has been revoked or not. 2246 */ 2247 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2248 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2249 error = ENXIO; 2250 goto failed; 2251 } 2252 2253 /* 2254 * Attach the synthetic parts, i.e. NVS and RNDIS. 2255 */ 2256 error = hn_synth_attach(sc, ETHERMTU); 2257 if (error) 2258 goto failed; 2259 2260 error = hn_rndis_get_eaddr(sc, eaddr); 2261 if (error) 2262 goto failed; 2263 2264 error = hn_rndis_get_mtu(sc, &mtu); 2265 if (error) 2266 mtu = ETHERMTU; 2267 else if (bootverbose) 2268 device_printf(dev, "RNDIS mtu %u\n", mtu); 2269 2270 #if __FreeBSD_version >= 1100099 2271 if (sc->hn_rx_ring_inuse > 1) { 2272 /* 2273 * Reduce TCP segment aggregation limit for multiple 2274 * RX rings to increase ACK timeliness. 2275 */ 2276 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2277 } 2278 #endif 2279 2280 /* 2281 * Fixup TX/RX stuffs after synthetic parts are attached. 2282 */ 2283 hn_fixup_tx_data(sc); 2284 hn_fixup_rx_data(sc); 2285 2286 ctx = device_get_sysctl_ctx(dev); 2287 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2288 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2289 &sc->hn_nvs_ver, 0, "NVS version"); 2290 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2291 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2292 hn_ndis_version_sysctl, "A", "NDIS version"); 2293 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2294 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2295 hn_caps_sysctl, "A", "capabilities"); 2296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2297 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2298 hn_hwassist_sysctl, "A", "hwassist"); 2299 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2300 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2301 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2302 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2303 "max # of TSO segments"); 2304 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2305 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2306 "max size of TSO segment"); 2307 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2308 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2309 hn_rxfilter_sysctl, "A", "rxfilter"); 2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2311 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2312 hn_rss_hash_sysctl, "A", "RSS hash"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2314 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2315 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2317 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2318 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2319 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2320 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2321 #ifndef RSS 2322 /* 2323 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2324 */ 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2326 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2327 hn_rss_key_sysctl, "IU", "RSS key"); 2328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2329 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2330 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2331 #endif 2332 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2333 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2334 "RNDIS offered packet transmission aggregation size limit"); 2335 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2336 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2337 "RNDIS offered packet transmission aggregation count limit"); 2338 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2339 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2340 "RNDIS packet transmission aggregation alignment"); 2341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2342 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2343 hn_txagg_size_sysctl, "I", 2344 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2345 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2346 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2347 hn_txagg_pkts_sysctl, "I", 2348 "Packet transmission aggregation packets, " 2349 "0 -- disable, -1 -- auto"); 2350 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2351 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2352 hn_polling_sysctl, "I", 2353 "Polling frequency: [100,1000000], 0 disable polling"); 2354 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2355 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2356 hn_vf_sysctl, "A", "Virtual Function's name"); 2357 if (!hn_xpnt_vf) { 2358 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2359 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2360 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2361 } else { 2362 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2363 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2364 hn_xpnt_vf_enabled_sysctl, "I", 2365 "Transparent VF enabled"); 2366 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2367 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2368 hn_xpnt_vf_accbpf_sysctl, "I", 2369 "Accurate BPF for transparent VF"); 2370 } 2371 2372 /* 2373 * Setup the ifmedia, which has been initialized earlier. 2374 */ 2375 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2376 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2377 /* XXX ifmedia_set really should do this for us */ 2378 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2379 2380 /* 2381 * Setup the ifnet for this interface. 2382 */ 2383 2384 ifp->if_baudrate = IF_Gbps(10); 2385 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2386 ifp->if_ioctl = hn_ioctl; 2387 ifp->if_init = hn_init; 2388 #ifdef HN_IFSTART_SUPPORT 2389 if (hn_use_if_start) { 2390 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2391 2392 ifp->if_start = hn_start; 2393 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2394 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2395 IFQ_SET_READY(&ifp->if_snd); 2396 } else 2397 #endif 2398 { 2399 ifp->if_transmit = hn_transmit; 2400 ifp->if_qflush = hn_xmit_qflush; 2401 } 2402 2403 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2404 #ifdef foo 2405 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2406 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2407 #endif 2408 if (sc->hn_caps & HN_CAP_VLAN) { 2409 /* XXX not sure about VLAN_MTU. 
*/ 2410 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2411 } 2412 2413 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2414 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2415 ifp->if_capabilities |= IFCAP_TXCSUM; 2416 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2417 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2418 if (sc->hn_caps & HN_CAP_TSO4) { 2419 ifp->if_capabilities |= IFCAP_TSO4; 2420 ifp->if_hwassist |= CSUM_IP_TSO; 2421 } 2422 if (sc->hn_caps & HN_CAP_TSO6) { 2423 ifp->if_capabilities |= IFCAP_TSO6; 2424 ifp->if_hwassist |= CSUM_IP6_TSO; 2425 } 2426 2427 /* Enable all available capabilities by default. */ 2428 ifp->if_capenable = ifp->if_capabilities; 2429 2430 /* 2431 * Disable IPv6 TSO and TXCSUM by default, they still can 2432 * be enabled through SIOCSIFCAP. 2433 */ 2434 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2435 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2436 2437 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2438 /* 2439 * Lock hn_set_tso_maxsize() to simplify its 2440 * internal logic. 2441 */ 2442 HN_LOCK(sc); 2443 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2444 HN_UNLOCK(sc); 2445 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2446 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2447 } 2448 2449 ether_ifattach(ifp, eaddr); 2450 2451 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2452 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2453 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2454 } 2455 if (mtu < ETHERMTU) { 2456 if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu); 2457 ifp->if_mtu = mtu; 2458 } 2459 2460 /* Inform the upper layer about the long frame support. */ 2461 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2462 2463 /* 2464 * Kick off link status check. 2465 */ 2466 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2467 hn_update_link_status(sc); 2468 2469 if (!hn_xpnt_vf) { 2470 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2471 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2472 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2473 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2474 } else { 2475 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2476 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2477 } 2478 2479 /* 2480 * NOTE: 2481 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2482 * since interface's LLADDR is needed; interface LLADDR is not 2483 * available when ifnet_arrival event is triggered. 2484 */ 2485 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2486 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2487 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2488 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2489 2490 return (0); 2491 failed: 2492 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2493 hn_synth_detach(sc); 2494 hn_detach(dev); 2495 return (error); 2496 } 2497 2498 static int 2499 hn_detach(device_t dev) 2500 { 2501 struct hn_softc *sc = device_get_softc(dev); 2502 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2503 2504 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2505 /* 2506 * In case that the vmbus missed the orphan handler 2507 * installation. 
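 * Orphaning the transaction context here should wake up any
 * transaction still waiting on the revoked primary channel.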
2508 */ 2509 vmbus_xact_ctx_orphan(sc->hn_xact); 2510 } 2511 2512 if (sc->hn_ifaddr_evthand != NULL) 2513 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2514 if (sc->hn_ifnet_evthand != NULL) 2515 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2516 if (sc->hn_ifnet_atthand != NULL) { 2517 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2518 sc->hn_ifnet_atthand); 2519 } 2520 if (sc->hn_ifnet_dethand != NULL) { 2521 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2522 sc->hn_ifnet_dethand); 2523 } 2524 if (sc->hn_ifnet_lnkhand != NULL) 2525 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2526 2527 vf_ifp = sc->hn_vf_ifp; 2528 __compiler_membar(); 2529 if (vf_ifp != NULL) 2530 hn_ifnet_detevent(sc, vf_ifp); 2531 2532 if (device_is_attached(dev)) { 2533 HN_LOCK(sc); 2534 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2535 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 2536 hn_stop(sc, true); 2537 /* 2538 * NOTE: 2539 * hn_stop() only suspends data, so managment 2540 * stuffs have to be suspended manually here. 2541 */ 2542 hn_suspend_mgmt(sc); 2543 hn_synth_detach(sc); 2544 } 2545 HN_UNLOCK(sc); 2546 ether_ifdetach(ifp); 2547 } 2548 2549 ifmedia_removeall(&sc->hn_media); 2550 hn_destroy_rx_data(sc); 2551 hn_destroy_tx_data(sc); 2552 2553 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2554 int i; 2555 2556 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2557 taskqueue_free(sc->hn_tx_taskqs[i]); 2558 free(sc->hn_tx_taskqs, M_DEVBUF); 2559 } 2560 taskqueue_free(sc->hn_mgmt_taskq0); 2561 if (sc->hn_vf_taskq != NULL) 2562 taskqueue_free(sc->hn_vf_taskq); 2563 2564 if (sc->hn_xact != NULL) { 2565 /* 2566 * Uninstall the orphan handler _before_ the xact is 2567 * destructed. 2568 */ 2569 vmbus_chan_unset_orphan(sc->hn_prichan); 2570 vmbus_xact_ctx_destroy(sc->hn_xact); 2571 } 2572 2573 if_free(ifp); 2574 2575 HN_LOCK_DESTROY(sc); 2576 rm_destroy(&sc->hn_vf_lock); 2577 return (0); 2578 } 2579 2580 static int 2581 hn_shutdown(device_t dev) 2582 { 2583 2584 return (0); 2585 } 2586 2587 static void 2588 hn_link_status(struct hn_softc *sc) 2589 { 2590 uint32_t link_status; 2591 int error; 2592 2593 error = hn_rndis_get_linkstatus(sc, &link_status); 2594 if (error) { 2595 /* XXX what to do? */ 2596 return; 2597 } 2598 2599 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2600 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2601 else 2602 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2603 if_link_state_change(sc->hn_ifp, 2604 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2605 LINK_STATE_UP : LINK_STATE_DOWN); 2606 } 2607 2608 static void 2609 hn_link_taskfunc(void *xsc, int pending __unused) 2610 { 2611 struct hn_softc *sc = xsc; 2612 2613 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2614 return; 2615 hn_link_status(sc); 2616 } 2617 2618 static void 2619 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2620 { 2621 struct hn_softc *sc = xsc; 2622 2623 /* Prevent any link status checks from running. */ 2624 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2625 2626 /* 2627 * Fake up a [link down --> link up] state change; 5 seconds 2628 * delay is used, which closely simulates miibus reaction 2629 * upon link down event. 
2630 */ 2631 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2632 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2633 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2634 &sc->hn_netchg_status, 5 * hz); 2635 } 2636 2637 static void 2638 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2639 { 2640 struct hn_softc *sc = xsc; 2641 2642 /* Re-allow link status checks. */ 2643 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2644 hn_link_status(sc); 2645 } 2646 2647 static void 2648 hn_update_link_status(struct hn_softc *sc) 2649 { 2650 2651 if (sc->hn_mgmt_taskq != NULL) 2652 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2653 } 2654 2655 static void 2656 hn_change_network(struct hn_softc *sc) 2657 { 2658 2659 if (sc->hn_mgmt_taskq != NULL) 2660 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2661 } 2662 2663 static __inline int 2664 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2665 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2666 { 2667 struct mbuf *m = *m_head; 2668 int error; 2669 2670 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2671 2672 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2673 m, segs, nsegs, BUS_DMA_NOWAIT); 2674 if (error == EFBIG) { 2675 struct mbuf *m_new; 2676 2677 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2678 if (m_new == NULL) 2679 return ENOBUFS; 2680 else 2681 *m_head = m = m_new; 2682 txr->hn_tx_collapsed++; 2683 2684 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2685 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2686 } 2687 if (!error) { 2688 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2689 BUS_DMASYNC_PREWRITE); 2690 txd->flags |= HN_TXD_FLAG_DMAMAP; 2691 } 2692 return error; 2693 } 2694 2695 static __inline int 2696 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2697 { 2698 2699 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2700 ("put an onlist txd %#x", txd->flags)); 2701 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2702 ("put an onagg txd %#x", txd->flags)); 2703 2704 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2705 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2706 return 0; 2707 2708 if (!STAILQ_EMPTY(&txd->agg_list)) { 2709 struct hn_txdesc *tmp_txd; 2710 2711 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2712 int freed; 2713 2714 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2715 ("resursive aggregation on aggregated txdesc")); 2716 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2717 ("not aggregated txdesc")); 2718 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2719 ("aggregated txdesc uses dmamap")); 2720 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2721 ("aggregated txdesc consumes " 2722 "chimney sending buffer")); 2723 KASSERT(tmp_txd->chim_size == 0, 2724 ("aggregated txdesc has non-zero " 2725 "chimney sending size")); 2726 2727 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2728 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2729 freed = hn_txdesc_put(txr, tmp_txd); 2730 KASSERT(freed, ("failed to free aggregated txdesc")); 2731 } 2732 } 2733 2734 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2735 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2736 ("chim txd uses dmamap")); 2737 hn_chim_free(txr->hn_sc, txd->chim_index); 2738 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2739 txd->chim_size = 0; 2740 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2741 bus_dmamap_sync(txr->hn_tx_data_dtag, 2742 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2743 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2744 txd->data_dmap); 2745 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2746 } 2747 2748 if (txd->m != NULL) { 2749 m_freem(txd->m); 2750 txd->m = NULL; 2751 } 2752 2753 txd->flags |= HN_TXD_FLAG_ONLIST; 2754 #ifndef HN_USE_TXDESC_BUFRING 2755 mtx_lock_spin(&txr->hn_txlist_spin); 2756 KASSERT(txr->hn_txdesc_avail >= 0 && 2757 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2758 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2759 txr->hn_txdesc_avail++; 2760 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2761 mtx_unlock_spin(&txr->hn_txlist_spin); 2762 #else /* HN_USE_TXDESC_BUFRING */ 2763 #ifdef HN_DEBUG 2764 atomic_add_int(&txr->hn_txdesc_avail, 1); 2765 #endif 2766 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2767 #endif /* !HN_USE_TXDESC_BUFRING */ 2768 2769 return 1; 2770 } 2771 2772 static __inline struct hn_txdesc * 2773 hn_txdesc_get(struct hn_tx_ring *txr) 2774 { 2775 struct hn_txdesc *txd; 2776 2777 #ifndef HN_USE_TXDESC_BUFRING 2778 mtx_lock_spin(&txr->hn_txlist_spin); 2779 txd = SLIST_FIRST(&txr->hn_txlist); 2780 if (txd != NULL) { 2781 KASSERT(txr->hn_txdesc_avail > 0, 2782 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2783 txr->hn_txdesc_avail--; 2784 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2785 } 2786 mtx_unlock_spin(&txr->hn_txlist_spin); 2787 #else 2788 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2789 #endif 2790 2791 if (txd != NULL) { 2792 #ifdef HN_USE_TXDESC_BUFRING 2793 #ifdef HN_DEBUG 2794 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2795 #endif 2796 #endif /* HN_USE_TXDESC_BUFRING */ 2797 KASSERT(txd->m == NULL && txd->refs == 0 && 2798 STAILQ_EMPTY(&txd->agg_list) && 2799 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2800 txd->chim_size == 0 && 2801 (txd->flags & HN_TXD_FLAG_ONLIST) && 2802 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2803 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2804 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2805 txd->refs = 1; 2806 } 2807 return txd; 2808 } 2809 2810 static __inline void 2811 hn_txdesc_hold(struct hn_txdesc *txd) 2812 { 2813 2814 /* 0->1 transition will never work */ 2815 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2816 atomic_add_int(&txd->refs, 1); 2817 } 2818 2819 static __inline void 2820 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2821 { 2822 2823 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2824 ("recursive aggregation on aggregating txdesc")); 2825 2826 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2827 ("already aggregated")); 2828 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2829 ("recursive aggregation on to-be-aggregated txdesc")); 2830 2831 txd->flags |= HN_TXD_FLAG_ONAGG; 2832 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2833 } 2834 2835 static bool 2836 hn_tx_ring_pending(struct hn_tx_ring *txr) 2837 { 2838 bool pending = false; 2839 2840 #ifndef HN_USE_TXDESC_BUFRING 2841 mtx_lock_spin(&txr->hn_txlist_spin); 2842 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2843 pending = true; 2844 mtx_unlock_spin(&txr->hn_txlist_spin); 2845 #else 2846 if (!buf_ring_full(txr->hn_txdesc_br)) 2847 pending = true; 2848 #endif 2849 return (pending); 2850 } 2851 2852 static __inline void 2853 hn_txeof(struct hn_tx_ring *txr) 2854 { 2855 txr->hn_has_txeof = 0; 2856 txr->hn_txeof(txr); 2857 } 2858 2859 static void 2860 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2861 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2862 { 2863 struct hn_txdesc *txd = sndc->hn_cbarg; 2864 struct 
hn_tx_ring *txr; 2865 2866 txr = txd->txr; 2867 KASSERT(txr->hn_chan == chan, 2868 ("channel mismatch, on chan%u, should be chan%u", 2869 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2870 2871 txr->hn_has_txeof = 1; 2872 hn_txdesc_put(txr, txd); 2873 2874 ++txr->hn_txdone_cnt; 2875 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2876 txr->hn_txdone_cnt = 0; 2877 if (txr->hn_oactive) 2878 hn_txeof(txr); 2879 } 2880 } 2881 2882 static void 2883 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2884 { 2885 #if defined(INET) || defined(INET6) 2886 tcp_lro_flush_all(&rxr->hn_lro); 2887 #endif 2888 2889 /* 2890 * NOTE: 2891 * 'txr' could be NULL, if multiple channels and 2892 * ifnet.if_start method are enabled. 2893 */ 2894 if (txr == NULL || !txr->hn_has_txeof) 2895 return; 2896 2897 txr->hn_txdone_cnt = 0; 2898 hn_txeof(txr); 2899 } 2900 2901 static __inline uint32_t 2902 hn_rndis_pktmsg_offset(uint32_t ofs) 2903 { 2904 2905 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2906 ("invalid RNDIS packet msg offset %u", ofs)); 2907 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2908 } 2909 2910 static __inline void * 2911 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2912 size_t pi_dlen, uint32_t pi_type) 2913 { 2914 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2915 struct rndis_pktinfo *pi; 2916 2917 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2918 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2919 2920 /* 2921 * Per-packet-info does not move; it only grows. 2922 * 2923 * NOTE: 2924 * rm_pktinfooffset in this phase counts from the beginning 2925 * of rndis_packet_msg. 2926 */ 2927 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2928 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2929 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2930 pkt->rm_pktinfolen); 2931 pkt->rm_pktinfolen += pi_size; 2932 2933 pi->rm_size = pi_size; 2934 pi->rm_type = pi_type; 2935 pi->rm_internal = 0; 2936 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2937 2938 return (pi->rm_data); 2939 } 2940 2941 static __inline int 2942 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2943 { 2944 struct hn_txdesc *txd; 2945 struct mbuf *m; 2946 int error, pkts; 2947 2948 txd = txr->hn_agg_txd; 2949 KASSERT(txd != NULL, ("no aggregate txdesc")); 2950 2951 /* 2952 * Since hn_txpkt() will reset this temporary stat, save 2953 * it now, so that oerrors can be updated properly, if 2954 * hn_txpkt() ever fails. 2955 */ 2956 pkts = txr->hn_stat_pkts; 2957 2958 /* 2959 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2960 * failure, save it for later freeing, if hn_txpkt() ever 2961 * fails. 2962 */ 2963 m = txd->m; 2964 error = hn_txpkt(ifp, txr, txd); 2965 if (__predict_false(error)) { 2966 /* txd is freed, but m is not. */ 2967 m_freem(m); 2968 2969 txr->hn_flush_failed++; 2970 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2971 } 2972 2973 /* Reset all aggregation states. 
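 * (i.e. hn_agg_txd, hn_agg_szleft, hn_agg_pktleft and hn_agg_prevpkt)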
*/ 2974 txr->hn_agg_txd = NULL; 2975 txr->hn_agg_szleft = 0; 2976 txr->hn_agg_pktleft = 0; 2977 txr->hn_agg_prevpkt = NULL; 2978 2979 return (error); 2980 } 2981 2982 static void * 2983 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2984 int pktsize) 2985 { 2986 void *chim; 2987 2988 if (txr->hn_agg_txd != NULL) { 2989 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2990 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2991 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2992 int olen; 2993 2994 /* 2995 * Update the previous RNDIS packet's total length, 2996 * it can be increased due to the mandatory alignment 2997 * padding for this RNDIS packet. And update the 2998 * aggregating txdesc's chimney sending buffer size 2999 * accordingly. 3000 * 3001 * XXX 3002 * Zero-out the padding, as required by the RNDIS spec. 3003 */ 3004 olen = pkt->rm_len; 3005 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3006 agg_txd->chim_size += pkt->rm_len - olen; 3007 3008 /* Link this txdesc to the parent. */ 3009 hn_txdesc_agg(agg_txd, txd); 3010 3011 chim = (uint8_t *)pkt + pkt->rm_len; 3012 /* Save the current packet for later fixup. */ 3013 txr->hn_agg_prevpkt = chim; 3014 3015 txr->hn_agg_pktleft--; 3016 txr->hn_agg_szleft -= pktsize; 3017 if (txr->hn_agg_szleft <= 3018 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3019 /* 3020 * Probably can't aggregate more packets, 3021 * flush this aggregating txdesc proactively. 3022 */ 3023 txr->hn_agg_pktleft = 0; 3024 } 3025 /* Done! */ 3026 return (chim); 3027 } 3028 hn_flush_txagg(ifp, txr); 3029 } 3030 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3031 3032 txr->hn_tx_chimney_tried++; 3033 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3034 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3035 return (NULL); 3036 txr->hn_tx_chimney++; 3037 3038 chim = txr->hn_sc->hn_chim + 3039 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3040 3041 if (txr->hn_agg_pktmax > 1 && 3042 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3043 txr->hn_agg_txd = txd; 3044 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3045 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3046 txr->hn_agg_prevpkt = chim; 3047 } 3048 return (chim); 3049 } 3050 3051 /* 3052 * NOTE: 3053 * If this function fails, then both txd and m_head0 will be freed. 3054 */ 3055 static int 3056 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3057 struct mbuf **m_head0) 3058 { 3059 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3060 int error, nsegs, i; 3061 struct mbuf *m_head = *m_head0; 3062 struct rndis_packet_msg *pkt; 3063 uint32_t *pi_data; 3064 void *chim = NULL; 3065 int pkt_hlen, pkt_size; 3066 3067 pkt = txd->rndis_pkt; 3068 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3069 if (pkt_size < txr->hn_chim_size) { 3070 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3071 if (chim != NULL) 3072 pkt = chim; 3073 } else { 3074 if (txr->hn_agg_txd != NULL) 3075 hn_flush_txagg(ifp, txr); 3076 } 3077 3078 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3079 pkt->rm_len = m_head->m_pkthdr.len; 3080 pkt->rm_dataoffset = 0; 3081 pkt->rm_datalen = m_head->m_pkthdr.len; 3082 pkt->rm_oobdataoffset = 0; 3083 pkt->rm_oobdatalen = 0; 3084 pkt->rm_oobdataelements = 0; 3085 pkt->rm_pktinfooffset = sizeof(*pkt); 3086 pkt->rm_pktinfolen = 0; 3087 pkt->rm_vchandle = 0; 3088 pkt->rm_reserved = 0; 3089 3090 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3091 /* 3092 * Set the hash value for this packet. 
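 * The value is carried in an HN_NDIS_PKTINFO_TYPE_HASHVAL
 * per-packet-info appended to the RNDIS packet message.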
3093 */ 3094 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3095 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3096 3097 if (M_HASHTYPE_ISHASH(m_head)) 3098 /* 3099 * The flowid field contains the hash value host 3100 * set in the rx queue if it is a ip forwarding pkt. 3101 * Set the same hash value so host can send on the 3102 * cpu it was received. 3103 */ 3104 *pi_data = m_head->m_pkthdr.flowid; 3105 else 3106 /* 3107 * Otherwise just put the tx queue index. 3108 */ 3109 *pi_data = txr->hn_tx_idx; 3110 } 3111 3112 if (m_head->m_flags & M_VLANTAG) { 3113 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3114 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3115 *pi_data = NDIS_VLAN_INFO_MAKE( 3116 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3117 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3118 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3119 } 3120 3121 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3122 #if defined(INET6) || defined(INET) 3123 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3124 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3125 #ifdef INET 3126 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3127 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3128 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3129 m_head->m_pkthdr.tso_segsz); 3130 } 3131 #endif 3132 #if defined(INET6) && defined(INET) 3133 else 3134 #endif 3135 #ifdef INET6 3136 { 3137 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3138 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3139 m_head->m_pkthdr.tso_segsz); 3140 } 3141 #endif 3142 #endif /* INET6 || INET */ 3143 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3144 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3145 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3146 if (m_head->m_pkthdr.csum_flags & 3147 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3148 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3149 } else { 3150 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3151 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3152 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3153 } 3154 3155 if (m_head->m_pkthdr.csum_flags & 3156 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3157 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3158 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3159 } else if (m_head->m_pkthdr.csum_flags & 3160 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3161 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3162 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3163 } 3164 } 3165 3166 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3167 /* Fixup RNDIS packet message total length */ 3168 pkt->rm_len += pkt_hlen; 3169 /* Convert RNDIS packet message offsets */ 3170 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3171 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3172 3173 /* 3174 * Fast path: Chimney sending. 
3175 */ 3176 if (chim != NULL) { 3177 struct hn_txdesc *tgt_txd = txd; 3178 3179 if (txr->hn_agg_txd != NULL) { 3180 tgt_txd = txr->hn_agg_txd; 3181 #ifdef INVARIANTS 3182 *m_head0 = NULL; 3183 #endif 3184 } 3185 3186 KASSERT(pkt == chim, 3187 ("RNDIS pkt not in chimney sending buffer")); 3188 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3189 ("chimney sending buffer is not used")); 3190 tgt_txd->chim_size += pkt->rm_len; 3191 3192 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3193 ((uint8_t *)chim) + pkt_hlen); 3194 3195 txr->hn_gpa_cnt = 0; 3196 txr->hn_sendpkt = hn_txpkt_chim; 3197 goto done; 3198 } 3199 3200 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3201 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3202 ("chimney buffer is used")); 3203 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3204 3205 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3206 if (__predict_false(error)) { 3207 int freed; 3208 3209 /* 3210 * This mbuf is not linked w/ the txd yet, so free it now. 3211 */ 3212 m_freem(m_head); 3213 *m_head0 = NULL; 3214 3215 freed = hn_txdesc_put(txr, txd); 3216 KASSERT(freed != 0, 3217 ("fail to free txd upon txdma error")); 3218 3219 txr->hn_txdma_failed++; 3220 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3221 return error; 3222 } 3223 *m_head0 = m_head; 3224 3225 /* +1 RNDIS packet message */ 3226 txr->hn_gpa_cnt = nsegs + 1; 3227 3228 /* send packet with page buffer */ 3229 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3230 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3231 txr->hn_gpa[0].gpa_len = pkt_hlen; 3232 3233 /* 3234 * Fill the page buffers with mbuf info after the page 3235 * buffer for RNDIS packet message. 3236 */ 3237 for (i = 0; i < nsegs; ++i) { 3238 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3239 3240 gpa->gpa_page = atop(segs[i].ds_addr); 3241 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3242 gpa->gpa_len = segs[i].ds_len; 3243 } 3244 3245 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3246 txd->chim_size = 0; 3247 txr->hn_sendpkt = hn_txpkt_sglist; 3248 done: 3249 txd->m = m_head; 3250 3251 /* Set the completion routine */ 3252 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3253 3254 /* Update temporary stats for later use. */ 3255 txr->hn_stat_pkts++; 3256 txr->hn_stat_size += m_head->m_pkthdr.len; 3257 if (m_head->m_flags & M_MCAST) 3258 txr->hn_stat_mcasts++; 3259 3260 return 0; 3261 } 3262 3263 /* 3264 * NOTE: 3265 * If this function fails, then txd will be freed, but the mbuf 3266 * associated w/ the txd will _not_ be freed. 3267 */ 3268 static int 3269 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3270 { 3271 int error, send_failed = 0, has_bpf; 3272 3273 again: 3274 has_bpf = bpf_peers_present(ifp->if_bpf); 3275 if (has_bpf) { 3276 /* 3277 * Make sure that this txd and any aggregated txds are not 3278 * freed before ETHER_BPF_MTAP. 
3279 */ 3280 hn_txdesc_hold(txd); 3281 } 3282 error = txr->hn_sendpkt(txr, txd); 3283 if (!error) { 3284 if (has_bpf) { 3285 const struct hn_txdesc *tmp_txd; 3286 3287 ETHER_BPF_MTAP(ifp, txd->m); 3288 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3289 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3290 } 3291 3292 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3293 #ifdef HN_IFSTART_SUPPORT 3294 if (!hn_use_if_start) 3295 #endif 3296 { 3297 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3298 txr->hn_stat_size); 3299 if (txr->hn_stat_mcasts != 0) { 3300 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3301 txr->hn_stat_mcasts); 3302 } 3303 } 3304 txr->hn_pkts += txr->hn_stat_pkts; 3305 txr->hn_sends++; 3306 } 3307 if (has_bpf) 3308 hn_txdesc_put(txr, txd); 3309 3310 if (__predict_false(error)) { 3311 int freed; 3312 3313 /* 3314 * This should "really rarely" happen. 3315 * 3316 * XXX Too many RX to be acked or too many sideband 3317 * commands to run? Ask netvsc_channel_rollup() 3318 * to kick start later. 3319 */ 3320 txr->hn_has_txeof = 1; 3321 if (!send_failed) { 3322 txr->hn_send_failed++; 3323 send_failed = 1; 3324 /* 3325 * Try sending again after set hn_has_txeof; 3326 * in case that we missed the last 3327 * netvsc_channel_rollup(). 3328 */ 3329 goto again; 3330 } 3331 if_printf(ifp, "send failed\n"); 3332 3333 /* 3334 * Caller will perform further processing on the 3335 * associated mbuf, so don't free it in hn_txdesc_put(); 3336 * only unload it from the DMA map in hn_txdesc_put(), 3337 * if it was loaded. 3338 */ 3339 txd->m = NULL; 3340 freed = hn_txdesc_put(txr, txd); 3341 KASSERT(freed != 0, 3342 ("fail to free txd upon send error")); 3343 3344 txr->hn_send_failed++; 3345 } 3346 3347 /* Reset temporary stats, after this sending is done. */ 3348 txr->hn_stat_size = 0; 3349 txr->hn_stat_pkts = 0; 3350 txr->hn_stat_mcasts = 0; 3351 3352 return (error); 3353 } 3354 3355 /* 3356 * Append the specified data to the indicated mbuf chain, 3357 * Extend the mbuf chain if the new data does not fit in 3358 * existing space. 3359 * 3360 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3361 * There should be an equivalent in the kernel mbuf code, 3362 * but there does not appear to be one yet. 3363 * 3364 * Differs from m_append() in that additional mbufs are 3365 * allocated with cluster size MJUMPAGESIZE, and filled 3366 * accordingly. 3367 * 3368 * Return the last mbuf in the chain or NULL if failed to 3369 * allocate new mbuf. 3370 */ 3371 static struct mbuf * 3372 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3373 { 3374 struct mbuf *m, *n; 3375 int remainder, space; 3376 3377 for (m = m0; m->m_next != NULL; m = m->m_next) 3378 ; 3379 remainder = len; 3380 space = M_TRAILINGSPACE(m); 3381 if (space > 0) { 3382 /* 3383 * Copy into available space. 3384 */ 3385 if (space > remainder) 3386 space = remainder; 3387 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3388 m->m_len += space; 3389 cp += space; 3390 remainder -= space; 3391 } 3392 while (remainder > 0) { 3393 /* 3394 * Allocate a new mbuf; could check space 3395 * and allocate a cluster instead. 
3396 */ 3397 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3398 if (n == NULL) 3399 return NULL; 3400 n->m_len = min(MJUMPAGESIZE, remainder); 3401 bcopy(cp, mtod(n, caddr_t), n->m_len); 3402 cp += n->m_len; 3403 remainder -= n->m_len; 3404 m->m_next = n; 3405 m = n; 3406 } 3407 3408 return m; 3409 } 3410 3411 #if defined(INET) || defined(INET6) 3412 static __inline int 3413 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3414 { 3415 #if __FreeBSD_version >= 1100095 3416 if (hn_lro_mbufq_depth) { 3417 tcp_lro_queue_mbuf(lc, m); 3418 return 0; 3419 } 3420 #endif 3421 return tcp_lro_rx(lc, m, 0); 3422 } 3423 #endif 3424 3425 static int 3426 hn_rxpkt(struct hn_rx_ring *rxr) 3427 { 3428 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; 3429 struct mbuf *m_new, *n; 3430 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3431 int hash_type = M_HASHTYPE_NONE; 3432 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3433 int i; 3434 3435 ifp = hn_ifp; 3436 if (rxr->hn_rxvf_ifp != NULL) { 3437 /* 3438 * Non-transparent mode VF; pretend this packet is from 3439 * the VF. 3440 */ 3441 ifp = rxr->hn_rxvf_ifp; 3442 is_vf = 1; 3443 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3444 /* Transparent mode VF. */ 3445 is_vf = 1; 3446 } 3447 3448 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { 3449 /* 3450 * NOTE: 3451 * See the NOTE of hn_rndis_init_fixat(). This 3452 * function can be reached, immediately after the 3453 * RNDIS is initialized but before the ifnet is 3454 * setup on the hn_attach() path; drop the unexpected 3455 * packets. 3456 */ 3457 return (0); 3458 } 3459 3460 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3461 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3462 return (0); 3463 } 3464 3465 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3466 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3467 if (m_new == NULL) { 3468 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3469 return (0); 3470 } 3471 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3472 rxr->rsc.frag_len[0]); 3473 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3474 } else { 3475 /* 3476 * Get an mbuf with a cluster. For packets 2K or less, 3477 * get a standard 2K cluster. For anything larger, get a 3478 * 4K cluster. Any buffers larger than 4K can cause problems 3479 * if looped around to the Hyper-V TX channel, so avoid them. 
3480 */ 3481 size = MCLBYTES; 3482 if (rxr->rsc.pktlen > MCLBYTES) { 3483 /* 4096 */ 3484 size = MJUMPAGESIZE; 3485 } 3486 3487 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3488 if (m_new == NULL) { 3489 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3490 return (0); 3491 } 3492 3493 n = m_new; 3494 for (i = 0; i < rxr->rsc.cnt; i++) { 3495 n = hv_m_append(n, rxr->rsc.frag_len[i], 3496 rxr->rsc.frag_data[i]); 3497 if (n == NULL) { 3498 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3499 return (0); 3500 } else { 3501 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3502 } 3503 } 3504 } 3505 if (rxr->rsc.pktlen <= MHLEN) 3506 rxr->hn_small_pkts++; 3507 3508 m_new->m_pkthdr.rcvif = ifp; 3509 3510 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) 3511 do_csum = 0; 3512 3513 /* receive side checksum offload */ 3514 if (rxr->rsc.csum_info != NULL) { 3515 /* IP csum offload */ 3516 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3517 m_new->m_pkthdr.csum_flags |= 3518 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3519 rxr->hn_csum_ip++; 3520 } 3521 3522 /* TCP/UDP csum offload */ 3523 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3524 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3525 m_new->m_pkthdr.csum_flags |= 3526 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3527 m_new->m_pkthdr.csum_data = 0xffff; 3528 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3529 rxr->hn_csum_tcp++; 3530 else 3531 rxr->hn_csum_udp++; 3532 } 3533 3534 /* 3535 * XXX 3536 * As of this write (Oct 28th, 2016), host side will turn 3537 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3538 * the do_lro setting here is actually _not_ accurate. We 3539 * depend on the RSS hash type check to reset do_lro. 3540 */ 3541 if ((*(rxr->rsc.csum_info) & 3542 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3543 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3544 do_lro = 1; 3545 } else { 3546 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3547 if (l3proto == ETHERTYPE_IP) { 3548 if (l4proto == IPPROTO_TCP) { 3549 if (do_csum && 3550 (rxr->hn_trust_hcsum & 3551 HN_TRUST_HCSUM_TCP)) { 3552 rxr->hn_csum_trusted++; 3553 m_new->m_pkthdr.csum_flags |= 3554 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3555 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3556 m_new->m_pkthdr.csum_data = 0xffff; 3557 } 3558 do_lro = 1; 3559 } else if (l4proto == IPPROTO_UDP) { 3560 if (do_csum && 3561 (rxr->hn_trust_hcsum & 3562 HN_TRUST_HCSUM_UDP)) { 3563 rxr->hn_csum_trusted++; 3564 m_new->m_pkthdr.csum_flags |= 3565 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3566 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3567 m_new->m_pkthdr.csum_data = 0xffff; 3568 } 3569 } else if (l4proto != IPPROTO_DONE && do_csum && 3570 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3571 rxr->hn_csum_trusted++; 3572 m_new->m_pkthdr.csum_flags |= 3573 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3574 } 3575 } 3576 } 3577 3578 if (rxr->rsc.vlan_info != NULL) { 3579 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3580 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3581 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3582 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3583 m_new->m_flags |= M_VLANTAG; 3584 } 3585 3586 /* 3587 * If VF is activated (tranparent/non-transparent mode does not 3588 * matter here). 3589 * 3590 * - Disable LRO 3591 * 3592 * hn(4) will only receive broadcast packets, multicast packets, 3593 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3594 * packet types. 
3595 * 3596 * For non-transparent, we definitely _cannot_ enable LRO at 3597 * all, since the LRO flush will use hn(4) as the receiving 3598 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3599 */ 3600 if (is_vf) 3601 do_lro = 0; 3602 3603 /* 3604 * If VF is activated (tranparent/non-transparent mode does not 3605 * matter here), do _not_ mess with unsupported hash types or 3606 * functions. 3607 */ 3608 if (rxr->rsc.hash_info != NULL) { 3609 rxr->hn_rss_pkts++; 3610 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3611 if (!is_vf) 3612 hash_type = M_HASHTYPE_OPAQUE_HASH; 3613 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3614 NDIS_HASH_FUNCTION_TOEPLITZ) { 3615 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3616 rxr->hn_mbuf_hash); 3617 3618 /* 3619 * NOTE: 3620 * do_lro is resetted, if the hash types are not TCP 3621 * related. See the comment in the above csum_flags 3622 * setup section. 3623 */ 3624 switch (type) { 3625 case NDIS_HASH_IPV4: 3626 hash_type = M_HASHTYPE_RSS_IPV4; 3627 do_lro = 0; 3628 break; 3629 3630 case NDIS_HASH_TCP_IPV4: 3631 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3632 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3633 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3634 3635 if (is_vf) 3636 def_htype = M_HASHTYPE_NONE; 3637 3638 /* 3639 * UDP 4-tuple hash is delivered as 3640 * TCP 4-tuple hash. 3641 */ 3642 if (l3proto == ETHERTYPE_MAX) { 3643 hn_rxpkt_proto(m_new, 3644 &l3proto, &l4proto); 3645 } 3646 if (l3proto == ETHERTYPE_IP) { 3647 if (l4proto == IPPROTO_UDP && 3648 (rxr->hn_mbuf_hash & 3649 NDIS_HASH_UDP_IPV4_X)) { 3650 hash_type = 3651 M_HASHTYPE_RSS_UDP_IPV4; 3652 do_lro = 0; 3653 } else if (l4proto != 3654 IPPROTO_TCP) { 3655 hash_type = def_htype; 3656 do_lro = 0; 3657 } 3658 } else { 3659 hash_type = def_htype; 3660 do_lro = 0; 3661 } 3662 } 3663 break; 3664 3665 case NDIS_HASH_IPV6: 3666 hash_type = M_HASHTYPE_RSS_IPV6; 3667 do_lro = 0; 3668 break; 3669 3670 case NDIS_HASH_IPV6_EX: 3671 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3672 do_lro = 0; 3673 break; 3674 3675 case NDIS_HASH_TCP_IPV6: 3676 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3677 break; 3678 3679 case NDIS_HASH_TCP_IPV6_EX: 3680 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3681 break; 3682 } 3683 } 3684 } else if (!is_vf) { 3685 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3686 hash_type = M_HASHTYPE_OPAQUE; 3687 } 3688 M_HASHTYPE_SET(m_new, hash_type); 3689 3690 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3691 if (hn_ifp != ifp) { 3692 const struct ether_header *eh; 3693 3694 /* 3695 * Non-transparent mode VF is activated. 3696 */ 3697 3698 /* 3699 * Allow tapping on hn(4). 3700 */ 3701 ETHER_BPF_MTAP(hn_ifp, m_new); 3702 3703 /* 3704 * Update hn(4)'s stats. 3705 */ 3706 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3707 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3708 /* Checked at the beginning of this function. */ 3709 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3710 eh = mtod(m_new, struct ether_header *); 3711 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3712 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3713 } 3714 rxr->hn_pkts++; 3715 3716 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { 3717 #if defined(INET) || defined(INET6) 3718 struct lro_ctrl *lro = &rxr->hn_lro; 3719 3720 if (lro->lro_cnt) { 3721 rxr->hn_lro_tried++; 3722 if (hn_lro_rx(lro, m_new) == 0) { 3723 /* DONE! 
*/ 3724 return 0; 3725 } 3726 } 3727 #endif 3728 } 3729 ifp->if_input(ifp, m_new); 3730 3731 return (0); 3732 } 3733 3734 static int 3735 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) 3736 { 3737 struct hn_softc *sc = ifp->if_softc; 3738 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3739 struct ifnet *vf_ifp; 3740 int mask, error = 0; 3741 struct ifrsskey *ifrk; 3742 struct ifrsshash *ifrh; 3743 uint32_t mtu; 3744 3745 switch (cmd) { 3746 case SIOCSIFMTU: 3747 if (ifr->ifr_mtu > HN_MTU_MAX) { 3748 error = EINVAL; 3749 break; 3750 } 3751 3752 HN_LOCK(sc); 3753 3754 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3755 HN_UNLOCK(sc); 3756 break; 3757 } 3758 3759 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3760 /* Can't change MTU */ 3761 HN_UNLOCK(sc); 3762 error = EOPNOTSUPP; 3763 break; 3764 } 3765 3766 if (ifp->if_mtu == ifr->ifr_mtu) { 3767 HN_UNLOCK(sc); 3768 break; 3769 } 3770 3771 if (hn_xpnt_vf_isready(sc)) { 3772 vf_ifp = sc->hn_vf_ifp; 3773 ifr_vf = *ifr; 3774 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, 3775 sizeof(ifr_vf.ifr_name)); 3776 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, 3777 (caddr_t)&ifr_vf); 3778 if (error) { 3779 HN_UNLOCK(sc); 3780 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3781 vf_ifp->if_xname, ifr->ifr_mtu, error); 3782 break; 3783 } 3784 } 3785 3786 /* 3787 * Suspend this interface before the synthetic parts 3788 * are ripped. 3789 */ 3790 hn_suspend(sc); 3791 3792 /* 3793 * Detach the synthetics parts, i.e. NVS and RNDIS. 3794 */ 3795 hn_synth_detach(sc); 3796 3797 /* 3798 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3799 * with the new MTU setting. 3800 */ 3801 error = hn_synth_attach(sc, ifr->ifr_mtu); 3802 if (error) { 3803 HN_UNLOCK(sc); 3804 break; 3805 } 3806 3807 error = hn_rndis_get_mtu(sc, &mtu); 3808 if (error) 3809 mtu = ifr->ifr_mtu; 3810 else if (bootverbose) 3811 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3812 3813 /* 3814 * Commit the requested MTU, after the synthetic parts 3815 * have been successfully attached. 3816 */ 3817 if (mtu >= ifr->ifr_mtu) { 3818 mtu = ifr->ifr_mtu; 3819 } else { 3820 if_printf(ifp, "fixup mtu %d -> %u\n", 3821 ifr->ifr_mtu, mtu); 3822 } 3823 ifp->if_mtu = mtu; 3824 3825 /* 3826 * Synthetic parts' reattach may change the chimney 3827 * sending size; update it. 3828 */ 3829 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3830 hn_set_chim_size(sc, sc->hn_chim_szmax); 3831 3832 /* 3833 * Make sure that various parameters based on MTU are 3834 * still valid, after the MTU change. 3835 */ 3836 hn_mtu_change_fixup(sc); 3837 3838 /* 3839 * All done! Resume the interface now. 3840 */ 3841 hn_resume(sc); 3842 3843 if ((sc->hn_flags & HN_FLAG_RXVF) || 3844 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3845 /* 3846 * Since we have reattached the NVS part, 3847 * change the datapath to VF again; in case 3848 * that it is lost, after the NVS was detached. 3849 */ 3850 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3851 } 3852 3853 HN_UNLOCK(sc); 3854 break; 3855 3856 case SIOCSIFFLAGS: 3857 HN_LOCK(sc); 3858 3859 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3860 HN_UNLOCK(sc); 3861 break; 3862 } 3863 3864 if (hn_xpnt_vf_isready(sc)) 3865 hn_xpnt_vf_saveifflags(sc); 3866 3867 if (ifp->if_flags & IFF_UP) { 3868 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3869 /* 3870 * Caller meight hold mutex, e.g. 3871 * bpf; use busy-wait for the RNDIS 3872 * reply. 
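 * HN_NO_SLEEPING() below makes the RNDIS request issued by
 * hn_rxfilter_config() poll for its completion instead of sleeping.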
3873 */ 3874 HN_NO_SLEEPING(sc); 3875 hn_rxfilter_config(sc); 3876 HN_SLEEPING_OK(sc); 3877 3878 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3879 error = hn_xpnt_vf_iocsetflags(sc); 3880 } else { 3881 hn_init_locked(sc); 3882 } 3883 } else { 3884 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 3885 hn_stop(sc, false); 3886 } 3887 sc->hn_if_flags = ifp->if_flags; 3888 3889 HN_UNLOCK(sc); 3890 break; 3891 3892 case SIOCSIFCAP: 3893 HN_LOCK(sc); 3894 3895 if (hn_xpnt_vf_isready(sc)) { 3896 ifr_vf = *ifr; 3897 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, 3898 sizeof(ifr_vf.ifr_name)); 3899 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3900 HN_UNLOCK(sc); 3901 break; 3902 } 3903 3904 /* 3905 * Fix up requested capabilities w/ supported capabilities, 3906 * since the supported capabilities could have been changed. 3907 */ 3908 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ 3909 ifp->if_capenable; 3910 3911 if (mask & IFCAP_TXCSUM) { 3912 ifp->if_capenable ^= IFCAP_TXCSUM; 3913 if (ifp->if_capenable & IFCAP_TXCSUM) 3914 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); 3915 else 3916 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); 3917 } 3918 if (mask & IFCAP_TXCSUM_IPV6) { 3919 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; 3920 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) 3921 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); 3922 else 3923 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); 3924 } 3925 3926 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3927 if (mask & IFCAP_RXCSUM) 3928 ifp->if_capenable ^= IFCAP_RXCSUM; 3929 #ifdef foo 3930 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3931 if (mask & IFCAP_RXCSUM_IPV6) 3932 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3933 #endif 3934 3935 if (mask & IFCAP_LRO) 3936 ifp->if_capenable ^= IFCAP_LRO; 3937 3938 if (mask & IFCAP_TSO4) { 3939 ifp->if_capenable ^= IFCAP_TSO4; 3940 if (ifp->if_capenable & IFCAP_TSO4) 3941 ifp->if_hwassist |= CSUM_IP_TSO; 3942 else 3943 ifp->if_hwassist &= ~CSUM_IP_TSO; 3944 } 3945 if (mask & IFCAP_TSO6) { 3946 ifp->if_capenable ^= IFCAP_TSO6; 3947 if (ifp->if_capenable & IFCAP_TSO6) 3948 ifp->if_hwassist |= CSUM_IP6_TSO; 3949 else 3950 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3951 } 3952 3953 HN_UNLOCK(sc); 3954 break; 3955 3956 case SIOCADDMULTI: 3957 case SIOCDELMULTI: 3958 HN_LOCK(sc); 3959 3960 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3961 HN_UNLOCK(sc); 3962 break; 3963 } 3964 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3965 /* 3966 * Multicast uses mutex; use busy-wait for 3967 * the RNDIS reply. 3968 */ 3969 HN_NO_SLEEPING(sc); 3970 hn_rxfilter_config(sc); 3971 HN_SLEEPING_OK(sc); 3972 } 3973 3974 /* XXX vlan(4) style mcast addr maintenance */ 3975 if (hn_xpnt_vf_isready(sc)) { 3976 int old_if_flags; 3977 3978 old_if_flags = sc->hn_vf_ifp->if_flags; 3979 hn_xpnt_vf_saveifflags(sc); 3980 3981 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3982 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3983 IFF_ALLMULTI)) 3984 error = hn_xpnt_vf_iocsetflags(sc); 3985 } 3986 3987 HN_UNLOCK(sc); 3988 break; 3989 3990 case SIOCSIFMEDIA: 3991 case SIOCGIFMEDIA: 3992 HN_LOCK(sc); 3993 if (hn_xpnt_vf_isready(sc)) { 3994 /* 3995 * SIOCGIFMEDIA expects ifmediareq, so don't 3996 * create and pass ifr_vf to the VF here; just 3997 * replace the ifr_name. 3998 */ 3999 vf_ifp = sc->hn_vf_ifp; 4000 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 4001 sizeof(ifr->ifr_name)); 4002 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 4003 /* Restore the ifr_name. 
*/ 4004 strlcpy(ifr->ifr_name, ifp->if_xname, 4005 sizeof(ifr->ifr_name)); 4006 HN_UNLOCK(sc); 4007 break; 4008 } 4009 HN_UNLOCK(sc); 4010 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4011 break; 4012 4013 case SIOCGIFRSSHASH: 4014 ifrh = (struct ifrsshash *)data; 4015 HN_LOCK(sc); 4016 if (sc->hn_rx_ring_inuse == 1) { 4017 HN_UNLOCK(sc); 4018 ifrh->ifrh_func = RSS_FUNC_NONE; 4019 ifrh->ifrh_types = 0; 4020 break; 4021 } 4022 4023 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4024 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4025 else 4026 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4027 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4028 HN_UNLOCK(sc); 4029 break; 4030 4031 case SIOCGIFRSSKEY: 4032 ifrk = (struct ifrsskey *)data; 4033 HN_LOCK(sc); 4034 if (sc->hn_rx_ring_inuse == 1) { 4035 HN_UNLOCK(sc); 4036 ifrk->ifrk_func = RSS_FUNC_NONE; 4037 ifrk->ifrk_keylen = 0; 4038 break; 4039 } 4040 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4041 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4042 else 4043 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4044 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4045 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4046 NDIS_HASH_KEYSIZE_TOEPLITZ); 4047 HN_UNLOCK(sc); 4048 break; 4049 4050 default: 4051 error = ether_ioctl(ifp, cmd, data); 4052 break; 4053 } 4054 return (error); 4055 } 4056 4057 static void 4058 hn_stop(struct hn_softc *sc, bool detaching) 4059 { 4060 struct ifnet *ifp = sc->hn_ifp; 4061 int i; 4062 4063 HN_LOCK_ASSERT(sc); 4064 4065 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4066 ("synthetic parts were not attached")); 4067 4068 /* Clear RUNNING bit ASAP. */ 4069 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4070 4071 /* Disable polling. */ 4072 hn_polling(sc, 0); 4073 4074 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4075 KASSERT(sc->hn_vf_ifp != NULL, 4076 ("%s: VF is not attached", ifp->if_xname)); 4077 4078 /* Mark transparent mode VF as disabled. */ 4079 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4080 4081 /* 4082 * NOTE: 4083 * Datapath setting must happen _before_ bringing 4084 * the VF down. 4085 */ 4086 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4087 4088 /* 4089 * Bring the VF down. 4090 */ 4091 hn_xpnt_vf_saveifflags(sc); 4092 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4093 hn_xpnt_vf_iocsetflags(sc); 4094 } 4095 4096 /* Suspend data transfers. */ 4097 hn_suspend_data(sc); 4098 4099 /* Clear OACTIVE bit. */ 4100 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4101 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4102 sc->hn_tx_ring[i].hn_oactive = 0; 4103 4104 /* 4105 * If the non-transparent mode VF is active, make sure 4106 * that the RX filter still allows packet reception. 4107 */ 4108 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4109 hn_rxfilter_config(sc); 4110 } 4111 4112 static void 4113 hn_init_locked(struct hn_softc *sc) 4114 { 4115 struct ifnet *ifp = sc->hn_ifp; 4116 int i; 4117 4118 HN_LOCK_ASSERT(sc); 4119 4120 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4121 return; 4122 4123 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4124 return; 4125 4126 /* Configure RX filter */ 4127 hn_rxfilter_config(sc); 4128 4129 /* Clear OACTIVE bit. */ 4130 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4131 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4132 sc->hn_tx_ring[i].hn_oactive = 0; 4133 4134 /* Clear TX 'suspended' bit. */ 4135 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4136 4137 if (hn_xpnt_vf_isready(sc)) { 4138 /* Initialize transparent VF. 
*/ 4139 hn_xpnt_vf_init(sc); 4140 } 4141 4142 /* Everything is ready; unleash! */ 4143 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4144 4145 /* Re-enable polling if requested. */ 4146 if (sc->hn_pollhz > 0) 4147 hn_polling(sc, sc->hn_pollhz); 4148 } 4149 4150 static void 4151 hn_init(void *xsc) 4152 { 4153 struct hn_softc *sc = xsc; 4154 4155 HN_LOCK(sc); 4156 hn_init_locked(sc); 4157 HN_UNLOCK(sc); 4158 } 4159 4160 #if __FreeBSD_version >= 1100099 4161 4162 static int 4163 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4164 { 4165 struct hn_softc *sc = arg1; 4166 unsigned int lenlim; 4167 int error; 4168 4169 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4170 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4171 if (error || req->newptr == NULL) 4172 return error; 4173 4174 HN_LOCK(sc); 4175 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4176 lenlim > TCP_LRO_LENGTH_MAX) { 4177 HN_UNLOCK(sc); 4178 return EINVAL; 4179 } 4180 hn_set_lro_lenlim(sc, lenlim); 4181 HN_UNLOCK(sc); 4182 4183 return 0; 4184 } 4185 4186 static int 4187 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4188 { 4189 struct hn_softc *sc = arg1; 4190 int ackcnt, error, i; 4191 4192 /* 4193 * lro_ackcnt_lim is append count limit, 4194 * +1 to turn it into aggregation limit. 4195 */ 4196 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4197 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4198 if (error || req->newptr == NULL) 4199 return error; 4200 4201 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4202 return EINVAL; 4203 4204 /* 4205 * Convert aggregation limit back to append 4206 * count limit. 4207 */ 4208 --ackcnt; 4209 HN_LOCK(sc); 4210 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4211 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4212 HN_UNLOCK(sc); 4213 return 0; 4214 } 4215 4216 #endif 4217 4218 static int 4219 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4220 { 4221 struct hn_softc *sc = arg1; 4222 int hcsum = arg2; 4223 int on, error, i; 4224 4225 on = 0; 4226 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4227 on = 1; 4228 4229 error = sysctl_handle_int(oidp, &on, 0, req); 4230 if (error || req->newptr == NULL) 4231 return error; 4232 4233 HN_LOCK(sc); 4234 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4235 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4236 4237 if (on) 4238 rxr->hn_trust_hcsum |= hcsum; 4239 else 4240 rxr->hn_trust_hcsum &= ~hcsum; 4241 } 4242 HN_UNLOCK(sc); 4243 return 0; 4244 } 4245 4246 static int 4247 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4248 { 4249 struct hn_softc *sc = arg1; 4250 int chim_size, error; 4251 4252 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4253 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4254 if (error || req->newptr == NULL) 4255 return error; 4256 4257 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4258 return EINVAL; 4259 4260 HN_LOCK(sc); 4261 hn_set_chim_size(sc, chim_size); 4262 HN_UNLOCK(sc); 4263 return 0; 4264 } 4265 4266 #if __FreeBSD_version < 1100095 4267 static int 4268 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4269 { 4270 struct hn_softc *sc = arg1; 4271 int ofs = arg2, i, error; 4272 struct hn_rx_ring *rxr; 4273 uint64_t stat; 4274 4275 stat = 0; 4276 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4277 rxr = &sc->hn_rx_ring[i]; 4278 stat += *((int *)((uint8_t *)rxr + ofs)); 4279 } 4280 4281 error = sysctl_handle_64(oidp, &stat, 0, req); 4282 if (error || req->newptr == NULL) 4283 return error; 4284 4285 /* Zero out this stat. 
*/ 4286 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4287 rxr = &sc->hn_rx_ring[i]; 4288 *((int *)((uint8_t *)rxr + ofs)) = 0; 4289 } 4290 return 0; 4291 } 4292 #else 4293 static int 4294 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4295 { 4296 struct hn_softc *sc = arg1; 4297 int ofs = arg2, i, error; 4298 struct hn_rx_ring *rxr; 4299 uint64_t stat; 4300 4301 stat = 0; 4302 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4303 rxr = &sc->hn_rx_ring[i]; 4304 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4305 } 4306 4307 error = sysctl_handle_64(oidp, &stat, 0, req); 4308 if (error || req->newptr == NULL) 4309 return error; 4310 4311 /* Zero out this stat. */ 4312 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4313 rxr = &sc->hn_rx_ring[i]; 4314 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4315 } 4316 return 0; 4317 } 4318 4319 #endif 4320 4321 static int 4322 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4323 { 4324 struct hn_softc *sc = arg1; 4325 int ofs = arg2, i, error; 4326 struct hn_rx_ring *rxr; 4327 u_long stat; 4328 4329 stat = 0; 4330 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4331 rxr = &sc->hn_rx_ring[i]; 4332 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4333 } 4334 4335 error = sysctl_handle_long(oidp, &stat, 0, req); 4336 if (error || req->newptr == NULL) 4337 return error; 4338 4339 /* Zero out this stat. */ 4340 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4341 rxr = &sc->hn_rx_ring[i]; 4342 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4343 } 4344 return 0; 4345 } 4346 4347 static int 4348 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4349 { 4350 struct hn_softc *sc = arg1; 4351 int ofs = arg2, i, error; 4352 struct hn_tx_ring *txr; 4353 u_long stat; 4354 4355 stat = 0; 4356 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4357 txr = &sc->hn_tx_ring[i]; 4358 stat += *((u_long *)((uint8_t *)txr + ofs)); 4359 } 4360 4361 error = sysctl_handle_long(oidp, &stat, 0, req); 4362 if (error || req->newptr == NULL) 4363 return error; 4364 4365 /* Zero out this stat. 
*/ 4366 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4367 txr = &sc->hn_tx_ring[i]; 4368 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4369 } 4370 return 0; 4371 } 4372 4373 static int 4374 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4375 { 4376 struct hn_softc *sc = arg1; 4377 int ofs = arg2, i, error, conf; 4378 struct hn_tx_ring *txr; 4379 4380 txr = &sc->hn_tx_ring[0]; 4381 conf = *((int *)((uint8_t *)txr + ofs)); 4382 4383 error = sysctl_handle_int(oidp, &conf, 0, req); 4384 if (error || req->newptr == NULL) 4385 return error; 4386 4387 HN_LOCK(sc); 4388 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4389 txr = &sc->hn_tx_ring[i]; 4390 *((int *)((uint8_t *)txr + ofs)) = conf; 4391 } 4392 HN_UNLOCK(sc); 4393 4394 return 0; 4395 } 4396 4397 static int 4398 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4399 { 4400 struct hn_softc *sc = arg1; 4401 int error, size; 4402 4403 size = sc->hn_agg_size; 4404 error = sysctl_handle_int(oidp, &size, 0, req); 4405 if (error || req->newptr == NULL) 4406 return (error); 4407 4408 HN_LOCK(sc); 4409 sc->hn_agg_size = size; 4410 hn_set_txagg(sc); 4411 HN_UNLOCK(sc); 4412 4413 return (0); 4414 } 4415 4416 static int 4417 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4418 { 4419 struct hn_softc *sc = arg1; 4420 int error, pkts; 4421 4422 pkts = sc->hn_agg_pkts; 4423 error = sysctl_handle_int(oidp, &pkts, 0, req); 4424 if (error || req->newptr == NULL) 4425 return (error); 4426 4427 HN_LOCK(sc); 4428 sc->hn_agg_pkts = pkts; 4429 hn_set_txagg(sc); 4430 HN_UNLOCK(sc); 4431 4432 return (0); 4433 } 4434 4435 static int 4436 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4437 { 4438 struct hn_softc *sc = arg1; 4439 int pkts; 4440 4441 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4442 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4443 } 4444 4445 static int 4446 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4447 { 4448 struct hn_softc *sc = arg1; 4449 int align; 4450 4451 align = sc->hn_tx_ring[0].hn_agg_align; 4452 return (sysctl_handle_int(oidp, &align, 0, req)); 4453 } 4454 4455 static void 4456 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4457 { 4458 if (pollhz == 0) 4459 vmbus_chan_poll_disable(chan); 4460 else 4461 vmbus_chan_poll_enable(chan, pollhz); 4462 } 4463 4464 static void 4465 hn_polling(struct hn_softc *sc, u_int pollhz) 4466 { 4467 int nsubch = sc->hn_rx_ring_inuse - 1; 4468 4469 HN_LOCK_ASSERT(sc); 4470 4471 if (nsubch > 0) { 4472 struct vmbus_channel **subch; 4473 int i; 4474 4475 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4476 for (i = 0; i < nsubch; ++i) 4477 hn_chan_polling(subch[i], pollhz); 4478 vmbus_subchan_rel(subch, nsubch); 4479 } 4480 hn_chan_polling(sc->hn_prichan, pollhz); 4481 } 4482 4483 static int 4484 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4485 { 4486 struct hn_softc *sc = arg1; 4487 int pollhz, error; 4488 4489 pollhz = sc->hn_pollhz; 4490 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4491 if (error || req->newptr == NULL) 4492 return (error); 4493 4494 if (pollhz != 0 && 4495 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4496 return (EINVAL); 4497 4498 HN_LOCK(sc); 4499 if (sc->hn_pollhz != pollhz) { 4500 sc->hn_pollhz = pollhz; 4501 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4502 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4503 hn_polling(sc, sc->hn_pollhz); 4504 } 4505 HN_UNLOCK(sc); 4506 4507 return (0); 4508 } 4509 4510 static int 4511 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4512 { 4513 struct hn_softc *sc = arg1; 4514 char verstr[16]; 4515 4516 snprintf(verstr, sizeof(verstr), "%u.%u", 4517 
HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4518 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4519 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4520 } 4521 4522 static int 4523 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4524 { 4525 struct hn_softc *sc = arg1; 4526 char caps_str[128]; 4527 uint32_t caps; 4528 4529 HN_LOCK(sc); 4530 caps = sc->hn_caps; 4531 HN_UNLOCK(sc); 4532 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4533 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4534 } 4535 4536 static int 4537 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4538 { 4539 struct hn_softc *sc = arg1; 4540 char assist_str[128]; 4541 uint32_t hwassist; 4542 4543 HN_LOCK(sc); 4544 hwassist = sc->hn_ifp->if_hwassist; 4545 HN_UNLOCK(sc); 4546 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4547 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4548 } 4549 4550 static int 4551 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4552 { 4553 struct hn_softc *sc = arg1; 4554 char filter_str[128]; 4555 uint32_t filter; 4556 4557 HN_LOCK(sc); 4558 filter = sc->hn_rx_filter; 4559 HN_UNLOCK(sc); 4560 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4561 NDIS_PACKET_TYPES); 4562 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4563 } 4564 4565 #ifndef RSS 4566 4567 static int 4568 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4569 { 4570 struct hn_softc *sc = arg1; 4571 int error; 4572 4573 HN_LOCK(sc); 4574 4575 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4576 if (error || req->newptr == NULL) 4577 goto back; 4578 4579 if ((sc->hn_flags & HN_FLAG_RXVF) || 4580 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4581 /* 4582 * RSS key is synchronized w/ VF's, don't allow users 4583 * to change it. 4584 */ 4585 error = EBUSY; 4586 goto back; 4587 } 4588 4589 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4590 if (error) 4591 goto back; 4592 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4593 4594 if (sc->hn_rx_ring_inuse > 1) { 4595 error = hn_rss_reconfig(sc); 4596 } else { 4597 /* Not RSS capable, at least for now; just save the RSS key. */ 4598 error = 0; 4599 } 4600 back: 4601 HN_UNLOCK(sc); 4602 return (error); 4603 } 4604 4605 static int 4606 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4607 { 4608 struct hn_softc *sc = arg1; 4609 int error; 4610 4611 HN_LOCK(sc); 4612 4613 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4614 if (error || req->newptr == NULL) 4615 goto back; 4616 4617 /* 4618 * Don't allow RSS indirect table change, if this interface is not 4619 * RSS capable currently. 
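 * With a single RX ring in use there are no subchannels to spread
 * traffic across, so an indirect table would have no effect.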
4620 */ 4621 if (sc->hn_rx_ring_inuse == 1) { 4622 error = EOPNOTSUPP; 4623 goto back; 4624 } 4625 4626 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4627 if (error) 4628 goto back; 4629 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4630 4631 hn_rss_ind_fixup(sc); 4632 error = hn_rss_reconfig(sc); 4633 back: 4634 HN_UNLOCK(sc); 4635 return (error); 4636 } 4637 4638 #endif /* !RSS */ 4639 4640 static int 4641 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4642 { 4643 struct hn_softc *sc = arg1; 4644 char hash_str[128]; 4645 uint32_t hash; 4646 4647 HN_LOCK(sc); 4648 hash = sc->hn_rss_hash; 4649 HN_UNLOCK(sc); 4650 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4651 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4652 } 4653 4654 static int 4655 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4656 { 4657 struct hn_softc *sc = arg1; 4658 char hash_str[128]; 4659 uint32_t hash; 4660 4661 HN_LOCK(sc); 4662 hash = sc->hn_rss_hcap; 4663 HN_UNLOCK(sc); 4664 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4665 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4666 } 4667 4668 static int 4669 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4670 { 4671 struct hn_softc *sc = arg1; 4672 char hash_str[128]; 4673 uint32_t hash; 4674 4675 HN_LOCK(sc); 4676 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4677 HN_UNLOCK(sc); 4678 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4679 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4680 } 4681 4682 static int 4683 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4684 { 4685 struct hn_softc *sc = arg1; 4686 char vf_name[IFNAMSIZ + 1]; 4687 struct ifnet *vf_ifp; 4688 4689 HN_LOCK(sc); 4690 vf_name[0] = '\0'; 4691 vf_ifp = sc->hn_vf_ifp; 4692 if (vf_ifp != NULL) 4693 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4694 HN_UNLOCK(sc); 4695 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4696 } 4697 4698 static int 4699 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4700 { 4701 struct hn_softc *sc = arg1; 4702 char vf_name[IFNAMSIZ + 1]; 4703 struct ifnet *vf_ifp; 4704 4705 HN_LOCK(sc); 4706 vf_name[0] = '\0'; 4707 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4708 if (vf_ifp != NULL) 4709 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4710 HN_UNLOCK(sc); 4711 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4712 } 4713 4714 static int 4715 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4716 { 4717 struct rm_priotracker pt; 4718 struct sbuf *sb; 4719 int error, i; 4720 bool first; 4721 4722 error = sysctl_wire_old_buffer(req, 0); 4723 if (error != 0) 4724 return (error); 4725 4726 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4727 if (sb == NULL) 4728 return (ENOMEM); 4729 4730 rm_rlock(&hn_vfmap_lock, &pt); 4731 4732 first = true; 4733 for (i = 0; i < hn_vfmap_size; ++i) { 4734 struct ifnet *ifp; 4735 4736 if (hn_vfmap[i] == NULL) 4737 continue; 4738 4739 ifp = ifnet_byindex(i); 4740 if (ifp != NULL) { 4741 if (first) 4742 sbuf_printf(sb, "%s", ifp->if_xname); 4743 else 4744 sbuf_printf(sb, " %s", ifp->if_xname); 4745 first = false; 4746 } 4747 } 4748 4749 rm_runlock(&hn_vfmap_lock, &pt); 4750 4751 error = sbuf_finish(sb); 4752 sbuf_delete(sb); 4753 return (error); 4754 } 4755 4756 static int 4757 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4758 { 4759 struct rm_priotracker pt; 4760 struct sbuf *sb; 4761 int error, i; 4762 bool first; 4763 4764 error = sysctl_wire_old_buffer(req, 0); 4765 if (error != 0) 4766 return (error); 4767 4768 sb = 
sbuf_new_for_sysctl(NULL, NULL, 128, req); 4769 if (sb == NULL) 4770 return (ENOMEM); 4771 4772 rm_rlock(&hn_vfmap_lock, &pt); 4773 4774 first = true; 4775 for (i = 0; i < hn_vfmap_size; ++i) { 4776 struct ifnet *ifp, *hn_ifp; 4777 4778 hn_ifp = hn_vfmap[i]; 4779 if (hn_ifp == NULL) 4780 continue; 4781 4782 ifp = ifnet_byindex(i); 4783 if (ifp != NULL) { 4784 if (first) { 4785 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4786 hn_ifp->if_xname); 4787 } else { 4788 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4789 hn_ifp->if_xname); 4790 } 4791 first = false; 4792 } 4793 } 4794 4795 rm_runlock(&hn_vfmap_lock, &pt); 4796 4797 error = sbuf_finish(sb); 4798 sbuf_delete(sb); 4799 return (error); 4800 } 4801 4802 static int 4803 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4804 { 4805 struct hn_softc *sc = arg1; 4806 int error, onoff = 0; 4807 4808 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4809 onoff = 1; 4810 error = sysctl_handle_int(oidp, &onoff, 0, req); 4811 if (error || req->newptr == NULL) 4812 return (error); 4813 4814 HN_LOCK(sc); 4815 /* NOTE: hn_vf_lock for hn_transmit() */ 4816 rm_wlock(&sc->hn_vf_lock); 4817 if (onoff) 4818 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4819 else 4820 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4821 rm_wunlock(&sc->hn_vf_lock); 4822 HN_UNLOCK(sc); 4823 4824 return (0); 4825 } 4826 4827 static int 4828 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4829 { 4830 struct hn_softc *sc = arg1; 4831 int enabled = 0; 4832 4833 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4834 enabled = 1; 4835 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4836 } 4837 4838 static int 4839 hn_check_iplen(const struct mbuf *m, int hoff) 4840 { 4841 const struct ip *ip; 4842 int len, iphlen, iplen; 4843 const struct tcphdr *th; 4844 int thoff; /* TCP data offset */ 4845 4846 len = hoff + sizeof(struct ip); 4847 4848 /* The packet must be at least the size of an IP header. */ 4849 if (m->m_pkthdr.len < len) 4850 return IPPROTO_DONE; 4851 4852 /* The fixed IP header must reside completely in the first mbuf. */ 4853 if (m->m_len < len) 4854 return IPPROTO_DONE; 4855 4856 ip = mtodo(m, hoff); 4857 4858 /* Bound check the packet's stated IP header length. */ 4859 iphlen = ip->ip_hl << 2; 4860 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4861 return IPPROTO_DONE; 4862 4863 /* The full IP header must reside completely in the one mbuf. */ 4864 if (m->m_len < hoff + iphlen) 4865 return IPPROTO_DONE; 4866 4867 iplen = ntohs(ip->ip_len); 4868 4869 /* 4870 * Check that the amount of data in the buffers is as 4871 * at least much as the IP header would have us expect. 4872 */ 4873 if (m->m_pkthdr.len < hoff + iplen) 4874 return IPPROTO_DONE; 4875 4876 /* 4877 * Ignore IP fragments. 4878 */ 4879 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4880 return IPPROTO_DONE; 4881 4882 /* 4883 * The TCP/IP or UDP/IP header must be entirely contained within 4884 * the first fragment of a packet. 
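 * "First fragment" effectively means the first mbuf here: the checks
 * below require m_len to cover the L4 header so it can be examined
 * without an m_pullup().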
4885 */ 4886 switch (ip->ip_p) { 4887 case IPPROTO_TCP: 4888 if (iplen < iphlen + sizeof(struct tcphdr)) 4889 return IPPROTO_DONE; 4890 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4891 return IPPROTO_DONE; 4892 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4893 thoff = th->th_off << 2; 4894 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4895 return IPPROTO_DONE; 4896 if (m->m_len < hoff + iphlen + thoff) 4897 return IPPROTO_DONE; 4898 break; 4899 case IPPROTO_UDP: 4900 if (iplen < iphlen + sizeof(struct udphdr)) 4901 return IPPROTO_DONE; 4902 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4903 return IPPROTO_DONE; 4904 break; 4905 default: 4906 if (iplen < iphlen) 4907 return IPPROTO_DONE; 4908 break; 4909 } 4910 return ip->ip_p; 4911 } 4912 4913 static void 4914 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4915 { 4916 const struct ether_header *eh; 4917 uint16_t etype; 4918 int hoff; 4919 4920 hoff = sizeof(*eh); 4921 /* Checked at the beginning of this function. */ 4922 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4923 4924 eh = mtod(m_new, const struct ether_header *); 4925 etype = ntohs(eh->ether_type); 4926 if (etype == ETHERTYPE_VLAN) { 4927 const struct ether_vlan_header *evl; 4928 4929 hoff = sizeof(*evl); 4930 if (m_new->m_len < hoff) 4931 return; 4932 evl = mtod(m_new, const struct ether_vlan_header *); 4933 etype = ntohs(evl->evl_proto); 4934 } 4935 *l3proto = etype; 4936 4937 if (etype == ETHERTYPE_IP) 4938 *l4proto = hn_check_iplen(m_new, hoff); 4939 else 4940 *l4proto = IPPROTO_DONE; 4941 } 4942 4943 static int 4944 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4945 { 4946 struct sysctl_oid_list *child; 4947 struct sysctl_ctx_list *ctx; 4948 device_t dev = sc->hn_dev; 4949 #if defined(INET) || defined(INET6) 4950 #if __FreeBSD_version >= 1100095 4951 int lroent_cnt; 4952 #endif 4953 #endif 4954 int i; 4955 4956 /* 4957 * Create RXBUF for reception. 4958 * 4959 * NOTE: 4960 * - It is shared by all channels. 4961 * - A large enough buffer is allocated, certain version of NVSes 4962 * may further limit the usable space. 
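 *   (Presumably the NVS version negotiated at attach time decides
 *   how much of this buffer the host is actually told about.)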
4963 */ 4964 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4965 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4966 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4967 if (sc->hn_rxbuf == NULL) { 4968 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4969 return (ENOMEM); 4970 } 4971 4972 sc->hn_rx_ring_cnt = ring_cnt; 4973 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4974 4975 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4976 M_DEVBUF, M_WAITOK | M_ZERO); 4977 4978 #if defined(INET) || defined(INET6) 4979 #if __FreeBSD_version >= 1100095 4980 lroent_cnt = hn_lro_entry_count; 4981 if (lroent_cnt < TCP_LRO_ENTRIES) 4982 lroent_cnt = TCP_LRO_ENTRIES; 4983 if (bootverbose) 4984 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4985 #endif 4986 #endif /* INET || INET6 */ 4987 4988 ctx = device_get_sysctl_ctx(dev); 4989 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4990 4991 /* Create dev.hn.UNIT.rx sysctl tree */ 4992 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4993 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4994 4995 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4996 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4997 4998 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4999 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5000 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5001 if (rxr->hn_br == NULL) { 5002 device_printf(dev, "allocate bufring failed\n"); 5003 return (ENOMEM); 5004 } 5005 5006 if (hn_trust_hosttcp) 5007 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5008 if (hn_trust_hostudp) 5009 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5010 if (hn_trust_hostip) 5011 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5012 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5013 rxr->hn_ifp = sc->hn_ifp; 5014 if (i < sc->hn_tx_ring_cnt) 5015 rxr->hn_txr = &sc->hn_tx_ring[i]; 5016 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5017 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5018 rxr->hn_rx_idx = i; 5019 rxr->hn_rxbuf = sc->hn_rxbuf; 5020 5021 /* 5022 * Initialize LRO. 
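 * The per-ring limits start from the driver defaults and can be
 * tuned later through the dev.hn.UNIT lro_* sysctls, where the
 * kernel version supports them.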
5023 */ 5024 #if defined(INET) || defined(INET6) 5025 #if __FreeBSD_version >= 1100095 5026 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5027 hn_lro_mbufq_depth); 5028 #else 5029 tcp_lro_init(&rxr->hn_lro); 5030 rxr->hn_lro.ifp = sc->hn_ifp; 5031 #endif 5032 #if __FreeBSD_version >= 1100099 5033 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5034 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5035 #endif 5036 #endif /* INET || INET6 */ 5037 5038 if (sc->hn_rx_sysctl_tree != NULL) { 5039 char name[16]; 5040 5041 /* 5042 * Create per RX ring sysctl tree: 5043 * dev.hn.UNIT.rx.RINGID 5044 */ 5045 snprintf(name, sizeof(name), "%d", i); 5046 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5047 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5048 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5049 5050 if (rxr->hn_rx_sysctl_tree != NULL) { 5051 SYSCTL_ADD_ULONG(ctx, 5052 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5053 OID_AUTO, "packets", CTLFLAG_RW, 5054 &rxr->hn_pkts, "# of packets received"); 5055 SYSCTL_ADD_ULONG(ctx, 5056 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5057 OID_AUTO, "rss_pkts", CTLFLAG_RW, 5058 &rxr->hn_rss_pkts, 5059 "# of packets w/ RSS info received"); 5060 SYSCTL_ADD_ULONG(ctx, 5061 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5062 OID_AUTO, "rsc_pkts", CTLFLAG_RW, 5063 &rxr->hn_rsc_pkts, 5064 "# of RSC packets received"); 5065 SYSCTL_ADD_ULONG(ctx, 5066 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5067 OID_AUTO, "rsc_drop", CTLFLAG_RW, 5068 &rxr->hn_rsc_drop, 5069 "# of RSC fragments dropped"); 5070 SYSCTL_ADD_INT(ctx, 5071 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5072 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5073 &rxr->hn_pktbuf_len, 0, 5074 "Temporary channel packet buffer length"); 5075 } 5076 } 5077 } 5078 5079 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5080 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5081 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5082 #if __FreeBSD_version < 1100095 5083 hn_rx_stat_int_sysctl, 5084 #else 5085 hn_rx_stat_u64_sysctl, 5086 #endif 5087 "LU", "LRO queued"); 5088 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5089 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5090 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5091 #if __FreeBSD_version < 1100095 5092 hn_rx_stat_int_sysctl, 5093 #else 5094 hn_rx_stat_u64_sysctl, 5095 #endif 5096 "LU", "LRO flushed"); 5097 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5098 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5099 __offsetof(struct hn_rx_ring, hn_lro_tried), 5100 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5101 #if __FreeBSD_version >= 1100099 5102 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5103 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5104 hn_lro_lenlim_sysctl, "IU", 5105 "Max # of data bytes to be aggregated by LRO"); 5106 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5107 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5108 hn_lro_ackcnt_sysctl, "I", 5109 "Max # of ACKs to be aggregated by LRO"); 5110 #endif 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5112 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5113 hn_trust_hcsum_sysctl, "I", 5114 "Trust tcp segement verification on host side, " 5115 "when csum info is missing"); 5116 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5117 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5118 hn_trust_hcsum_sysctl, "I", 5119 "Trust udp datagram verification on host side, " 5120 "when csum info is missing"); 5121 
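	/*
	 * The trust_host{tcp,udp,ip} knobs all go through
	 * hn_trust_hcsum_sysctl(); arg2 selects the HN_TRUST_HCSUM_*
	 * bit that is toggled on every RX ring, e.g.:
	 *   sysctl dev.hn.UNIT.trust_hosttcp=1
	 */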
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5122 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5123 hn_trust_hcsum_sysctl, "I", 5124 "Trust ip packet verification on host side, " 5125 "when csum info is missing"); 5126 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5127 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5128 __offsetof(struct hn_rx_ring, hn_csum_ip), 5129 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5130 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5131 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5132 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5133 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5134 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5135 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5136 __offsetof(struct hn_rx_ring, hn_csum_udp), 5137 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5138 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5139 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5140 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5141 hn_rx_stat_ulong_sysctl, "LU", 5142 "# of packets that we trust host's csum verification"); 5143 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5144 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5145 __offsetof(struct hn_rx_ring, hn_small_pkts), 5146 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5147 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5148 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5149 __offsetof(struct hn_rx_ring, hn_ack_failed), 5150 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5151 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5152 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5153 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5154 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5155 5156 return (0); 5157 } 5158 5159 static void 5160 hn_destroy_rx_data(struct hn_softc *sc) 5161 { 5162 int i; 5163 5164 if (sc->hn_rxbuf != NULL) { 5165 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5166 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5167 else 5168 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5169 sc->hn_rxbuf = NULL; 5170 } 5171 5172 if (sc->hn_rx_ring_cnt == 0) 5173 return; 5174 5175 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5176 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5177 5178 if (rxr->hn_br == NULL) 5179 continue; 5180 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5181 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5182 } else { 5183 device_printf(sc->hn_dev, 5184 "%dth channel bufring is referenced", i); 5185 } 5186 rxr->hn_br = NULL; 5187 5188 #if defined(INET) || defined(INET6) 5189 tcp_lro_free(&rxr->hn_lro); 5190 #endif 5191 free(rxr->hn_pktbuf, M_DEVBUF); 5192 } 5193 free(sc->hn_rx_ring, M_DEVBUF); 5194 sc->hn_rx_ring = NULL; 5195 5196 sc->hn_rx_ring_cnt = 0; 5197 sc->hn_rx_ring_inuse = 0; 5198 } 5199 5200 static int 5201 hn_tx_ring_create(struct hn_softc *sc, int id) 5202 { 5203 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5204 device_t dev = sc->hn_dev; 5205 bus_dma_tag_t parent_dtag; 5206 int error, i; 5207 5208 txr->hn_sc = sc; 5209 txr->hn_tx_idx = id; 5210 5211 #ifndef HN_USE_TXDESC_BUFRING 5212 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5213 #endif 5214 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5215 5216 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5217 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5218 M_DEVBUF, M_WAITOK | M_ZERO); 5219 #ifndef HN_USE_TXDESC_BUFRING 5220 
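	/*
	 * Free txdesc pool: a spinlock-protected SLIST here, or a
	 * buf_ring in the HN_USE_TXDESC_BUFRING case below.
	 */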
SLIST_INIT(&txr->hn_txlist); 5221 #else 5222 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5223 M_WAITOK, &txr->hn_tx_lock); 5224 #endif 5225 5226 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5227 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5228 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5229 } else { 5230 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5231 } 5232 5233 #ifdef HN_IFSTART_SUPPORT 5234 if (hn_use_if_start) { 5235 txr->hn_txeof = hn_start_txeof; 5236 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5237 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5238 } else 5239 #endif 5240 { 5241 int br_depth; 5242 5243 txr->hn_txeof = hn_xmit_txeof; 5244 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5245 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5246 5247 br_depth = hn_get_txswq_depth(txr); 5248 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5249 M_WAITOK, &txr->hn_tx_lock); 5250 } 5251 5252 txr->hn_direct_tx_size = hn_direct_tx_size; 5253 5254 /* 5255 * Always schedule transmission instead of trying to do direct 5256 * transmission. This one gives the best performance so far. 5257 */ 5258 txr->hn_sched_tx = 1; 5259 5260 parent_dtag = bus_get_dma_tag(dev); 5261 5262 /* DMA tag for RNDIS packet messages. */ 5263 error = bus_dma_tag_create(parent_dtag, /* parent */ 5264 HN_RNDIS_PKT_ALIGN, /* alignment */ 5265 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5266 BUS_SPACE_MAXADDR, /* lowaddr */ 5267 BUS_SPACE_MAXADDR, /* highaddr */ 5268 NULL, NULL, /* filter, filterarg */ 5269 HN_RNDIS_PKT_LEN, /* maxsize */ 5270 1, /* nsegments */ 5271 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5272 0, /* flags */ 5273 NULL, /* lockfunc */ 5274 NULL, /* lockfuncarg */ 5275 &txr->hn_tx_rndis_dtag); 5276 if (error) { 5277 device_printf(dev, "failed to create rndis dmatag\n"); 5278 return error; 5279 } 5280 5281 /* DMA tag for data. */ 5282 error = bus_dma_tag_create(parent_dtag, /* parent */ 5283 1, /* alignment */ 5284 HN_TX_DATA_BOUNDARY, /* boundary */ 5285 BUS_SPACE_MAXADDR, /* lowaddr */ 5286 BUS_SPACE_MAXADDR, /* highaddr */ 5287 NULL, NULL, /* filter, filterarg */ 5288 HN_TX_DATA_MAXSIZE, /* maxsize */ 5289 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5290 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5291 0, /* flags */ 5292 NULL, /* lockfunc */ 5293 NULL, /* lockfuncarg */ 5294 &txr->hn_tx_data_dtag); 5295 if (error) { 5296 device_printf(dev, "failed to create data dmatag\n"); 5297 return error; 5298 } 5299 5300 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5301 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5302 5303 txd->txr = txr; 5304 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5305 STAILQ_INIT(&txd->agg_list); 5306 5307 /* 5308 * Allocate and load RNDIS packet message. 5309 */ 5310 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5311 (void **)&txd->rndis_pkt, 5312 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5313 &txd->rndis_pkt_dmap); 5314 if (error) { 5315 device_printf(dev, 5316 "failed to allocate rndis_packet_msg, %d\n", i); 5317 return error; 5318 } 5319 5320 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5321 txd->rndis_pkt_dmap, 5322 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5323 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5324 BUS_DMA_NOWAIT); 5325 if (error) { 5326 device_printf(dev, 5327 "failed to load rndis_packet_msg, %d\n", i); 5328 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5329 txd->rndis_pkt, txd->rndis_pkt_dmap); 5330 return error; 5331 } 5332 5333 /* DMA map for TX data. 
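 * (Created unloaded; the map is presumably filled per-mbuf at
 * transmit time, up to HN_TX_DATA_SEGCNT_MAX segments as allowed by
 * the data tag above.)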
*/ 5334 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5335 &txd->data_dmap); 5336 if (error) { 5337 device_printf(dev, 5338 "failed to allocate tx data dmamap\n"); 5339 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5340 txd->rndis_pkt_dmap); 5341 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5342 txd->rndis_pkt, txd->rndis_pkt_dmap); 5343 return error; 5344 } 5345 5346 /* All set, put it to list */ 5347 txd->flags |= HN_TXD_FLAG_ONLIST; 5348 #ifndef HN_USE_TXDESC_BUFRING 5349 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5350 #else 5351 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5352 #endif 5353 } 5354 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5355 5356 if (sc->hn_tx_sysctl_tree != NULL) { 5357 struct sysctl_oid_list *child; 5358 struct sysctl_ctx_list *ctx; 5359 char name[16]; 5360 5361 /* 5362 * Create per TX ring sysctl tree: 5363 * dev.hn.UNIT.tx.RINGID 5364 */ 5365 ctx = device_get_sysctl_ctx(dev); 5366 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5367 5368 snprintf(name, sizeof(name), "%d", id); 5369 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5370 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5371 5372 if (txr->hn_tx_sysctl_tree != NULL) { 5373 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5374 5375 #ifdef HN_DEBUG 5376 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5377 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5378 "# of available TX descs"); 5379 #endif 5380 #ifdef HN_IFSTART_SUPPORT 5381 if (!hn_use_if_start) 5382 #endif 5383 { 5384 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5385 CTLFLAG_RD, &txr->hn_oactive, 0, 5386 "over active"); 5387 } 5388 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5389 CTLFLAG_RW, &txr->hn_pkts, 5390 "# of packets transmitted"); 5391 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5392 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5393 } 5394 } 5395 5396 return 0; 5397 } 5398 5399 static void 5400 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5401 { 5402 struct hn_tx_ring *txr = txd->txr; 5403 5404 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5405 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5406 5407 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5408 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5409 txd->rndis_pkt_dmap); 5410 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5411 } 5412 5413 static void 5414 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5415 { 5416 5417 KASSERT(txd->refs == 0 || txd->refs == 1, 5418 ("invalid txd refs %d", txd->refs)); 5419 5420 /* Aggregated txds will be freed by their aggregating txd. */ 5421 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5422 int freed; 5423 5424 freed = hn_txdesc_put(txr, txd); 5425 KASSERT(freed, ("can't free txdesc")); 5426 } 5427 } 5428 5429 static void 5430 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5431 { 5432 int i; 5433 5434 if (txr->hn_txdesc == NULL) 5435 return; 5436 5437 /* 5438 * NOTE: 5439 * Because the freeing of aggregated txds will be deferred 5440 * to the aggregating txd, two passes are used here: 5441 * - The first pass GCes any pending txds. This GC is necessary, 5442 * since if the channels are revoked, hypervisor will not 5443 * deliver send-done for all pending txds. 5444 * - The second pass frees the busdma stuffs, i.e. after all txds 5445 * were freed. 
5446 */ 5447 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5448 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5449 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5450 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5451 5452 if (txr->hn_tx_data_dtag != NULL) 5453 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5454 if (txr->hn_tx_rndis_dtag != NULL) 5455 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5456 5457 #ifdef HN_USE_TXDESC_BUFRING 5458 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5459 #endif 5460 5461 free(txr->hn_txdesc, M_DEVBUF); 5462 txr->hn_txdesc = NULL; 5463 5464 if (txr->hn_mbuf_br != NULL) 5465 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5466 5467 #ifndef HN_USE_TXDESC_BUFRING 5468 mtx_destroy(&txr->hn_txlist_spin); 5469 #endif 5470 mtx_destroy(&txr->hn_tx_lock); 5471 } 5472 5473 static int 5474 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5475 { 5476 struct sysctl_oid_list *child; 5477 struct sysctl_ctx_list *ctx; 5478 int i; 5479 5480 /* 5481 * Create TXBUF for chimney sending. 5482 * 5483 * NOTE: It is shared by all channels. 5484 */ 5485 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5486 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5487 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5488 if (sc->hn_chim == NULL) { 5489 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5490 return (ENOMEM); 5491 } 5492 5493 sc->hn_tx_ring_cnt = ring_cnt; 5494 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5495 5496 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5497 M_DEVBUF, M_WAITOK | M_ZERO); 5498 5499 ctx = device_get_sysctl_ctx(sc->hn_dev); 5500 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5501 5502 /* Create dev.hn.UNIT.tx sysctl tree */ 5503 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5504 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5505 5506 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5507 int error; 5508 5509 error = hn_tx_ring_create(sc, i); 5510 if (error) 5511 return error; 5512 } 5513 5514 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5515 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5516 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5517 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5518 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5519 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5520 __offsetof(struct hn_tx_ring, hn_send_failed), 5521 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5522 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5523 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5524 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5525 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5526 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5527 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5528 __offsetof(struct hn_tx_ring, hn_flush_failed), 5529 hn_tx_stat_ulong_sysctl, "LU", 5530 "# of packet transmission aggregation flush failure"); 5531 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5532 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5533 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5534 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5535 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5536 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5537 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5538 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5539 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5540 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5541 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5542 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5543 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5544 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5545 "# of total TX descs"); 5546 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5547 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5548 "Chimney send packet size upper boundary"); 5549 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5550 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5551 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5552 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5553 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5554 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5555 hn_tx_conf_int_sysctl, "I", 5556 "Size of the packet for direct transmission"); 5557 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5558 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5559 __offsetof(struct hn_tx_ring, hn_sched_tx), 5560 hn_tx_conf_int_sysctl, "I", 5561 "Always schedule transmission " 5562 "instead of doing direct transmission"); 5563 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5564 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5565 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5566 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5567 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5568 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5569 "Applied packet transmission aggregation size"); 5570 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5571 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5572 hn_txagg_pktmax_sysctl, "I", 5573 "Applied packet transmission aggregation packets"); 5574 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5575 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5576 hn_txagg_align_sysctl, "I", 5577 "Applied packet transmission aggregation alignment"); 5578 5579 return 0; 5580 } 5581 5582 static void 5583 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5584 { 5585 int i; 5586 5587 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5588 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5589 } 5590 5591 static void 5592 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5593 { 5594 struct ifnet *ifp = sc->hn_ifp; 5595 u_int hw_tsomax; 5596 int tso_minlen; 5597 5598 HN_LOCK_ASSERT(sc); 5599 5600 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5601 return; 5602 5603 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5604 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5605 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5606 5607 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5608 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5609 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5610 5611 if (tso_maxlen < tso_minlen) 5612 tso_maxlen = tso_minlen; 5613 else if (tso_maxlen > IP_MAXPACKET) 5614 tso_maxlen = IP_MAXPACKET; 5615 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5616 tso_maxlen = sc->hn_ndis_tso_szmax; 5617 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5618 5619 if (hn_xpnt_vf_isready(sc)) { 5620 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5621 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5622 } 5623 ifp->if_hw_tsomax = hw_tsomax; 5624 if (bootverbose) 5625 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5626 } 5627 5628 static void 5629 hn_fixup_tx_data(struct hn_softc *sc) 5630 { 5631 uint64_t csum_assist; 5632 int i; 5633 5634 hn_set_chim_size(sc, sc->hn_chim_szmax); 5635 if (hn_tx_chimney_size > 0 && 5636 hn_tx_chimney_size < sc->hn_chim_szmax) 5637 hn_set_chim_size(sc, 
hn_tx_chimney_size); 5638 5639 csum_assist = 0; 5640 if (sc->hn_caps & HN_CAP_IPCS) 5641 csum_assist |= CSUM_IP; 5642 if (sc->hn_caps & HN_CAP_TCP4CS) 5643 csum_assist |= CSUM_IP_TCP; 5644 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5645 csum_assist |= CSUM_IP_UDP; 5646 if (sc->hn_caps & HN_CAP_TCP6CS) 5647 csum_assist |= CSUM_IP6_TCP; 5648 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5649 csum_assist |= CSUM_IP6_UDP; 5650 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5651 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5652 5653 if (sc->hn_caps & HN_CAP_HASHVAL) { 5654 /* 5655 * Support HASHVAL pktinfo on TX path. 5656 */ 5657 if (bootverbose) 5658 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5659 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5660 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5661 } 5662 } 5663 5664 static void 5665 hn_fixup_rx_data(struct hn_softc *sc) 5666 { 5667 5668 if (sc->hn_caps & HN_CAP_UDPHASH) { 5669 int i; 5670 5671 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5672 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5673 } 5674 } 5675 5676 static void 5677 hn_destroy_tx_data(struct hn_softc *sc) 5678 { 5679 int i; 5680 5681 if (sc->hn_chim != NULL) { 5682 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5683 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5684 } else { 5685 device_printf(sc->hn_dev, 5686 "chimney sending buffer is referenced"); 5687 } 5688 sc->hn_chim = NULL; 5689 } 5690 5691 if (sc->hn_tx_ring_cnt == 0) 5692 return; 5693 5694 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5695 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5696 5697 free(sc->hn_tx_ring, M_DEVBUF); 5698 sc->hn_tx_ring = NULL; 5699 5700 sc->hn_tx_ring_cnt = 0; 5701 sc->hn_tx_ring_inuse = 0; 5702 } 5703 5704 #ifdef HN_IFSTART_SUPPORT 5705 5706 static void 5707 hn_start_taskfunc(void *xtxr, int pending __unused) 5708 { 5709 struct hn_tx_ring *txr = xtxr; 5710 5711 mtx_lock(&txr->hn_tx_lock); 5712 hn_start_locked(txr, 0); 5713 mtx_unlock(&txr->hn_tx_lock); 5714 } 5715 5716 static int 5717 hn_start_locked(struct hn_tx_ring *txr, int len) 5718 { 5719 struct hn_softc *sc = txr->hn_sc; 5720 struct ifnet *ifp = sc->hn_ifp; 5721 int sched = 0; 5722 5723 KASSERT(hn_use_if_start, 5724 ("hn_start_locked is called, when if_start is disabled")); 5725 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5726 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5727 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5728 5729 if (__predict_false(txr->hn_suspended)) 5730 return (0); 5731 5732 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5733 IFF_DRV_RUNNING) 5734 return (0); 5735 5736 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5737 struct hn_txdesc *txd; 5738 struct mbuf *m_head; 5739 int error; 5740 5741 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5742 if (m_head == NULL) 5743 break; 5744 5745 if (len > 0 && m_head->m_pkthdr.len > len) { 5746 /* 5747 * This sending could be time consuming; let callers 5748 * dispatch this packet sending (and sending of any 5749 * following up packets) to tx taskqueue. 
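 * Returning with sched set to 1 makes the caller requeue the work
 * onto the TX taskqueue instead of continuing inline.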
5750 */ 5751 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5752 sched = 1; 5753 break; 5754 } 5755 5756 #if defined(INET6) || defined(INET) 5757 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5758 m_head = hn_tso_fixup(m_head); 5759 if (__predict_false(m_head == NULL)) { 5760 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5761 continue; 5762 } 5763 } else if (m_head->m_pkthdr.csum_flags & 5764 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5765 m_head = hn_set_hlen(m_head); 5766 if (__predict_false(m_head == NULL)) { 5767 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5768 continue; 5769 } 5770 } 5771 #endif 5772 5773 txd = hn_txdesc_get(txr); 5774 if (txd == NULL) { 5775 txr->hn_no_txdescs++; 5776 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5777 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5778 break; 5779 } 5780 5781 error = hn_encap(ifp, txr, txd, &m_head); 5782 if (error) { 5783 /* Both txd and m_head are freed */ 5784 KASSERT(txr->hn_agg_txd == NULL, 5785 ("encap failed w/ pending aggregating txdesc")); 5786 continue; 5787 } 5788 5789 if (txr->hn_agg_pktleft == 0) { 5790 if (txr->hn_agg_txd != NULL) { 5791 KASSERT(m_head == NULL, 5792 ("pending mbuf for aggregating txdesc")); 5793 error = hn_flush_txagg(ifp, txr); 5794 if (__predict_false(error)) { 5795 atomic_set_int(&ifp->if_drv_flags, 5796 IFF_DRV_OACTIVE); 5797 break; 5798 } 5799 } else { 5800 KASSERT(m_head != NULL, ("mbuf was freed")); 5801 error = hn_txpkt(ifp, txr, txd); 5802 if (__predict_false(error)) { 5803 /* txd is freed, but m_head is not */ 5804 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5805 atomic_set_int(&ifp->if_drv_flags, 5806 IFF_DRV_OACTIVE); 5807 break; 5808 } 5809 } 5810 } 5811 #ifdef INVARIANTS 5812 else { 5813 KASSERT(txr->hn_agg_txd != NULL, 5814 ("no aggregating txdesc")); 5815 KASSERT(m_head == NULL, 5816 ("pending mbuf for aggregating txdesc")); 5817 } 5818 #endif 5819 } 5820 5821 /* Flush pending aggerated transmission. */ 5822 if (txr->hn_agg_txd != NULL) 5823 hn_flush_txagg(ifp, txr); 5824 return (sched); 5825 } 5826 5827 static void 5828 hn_start(struct ifnet *ifp) 5829 { 5830 struct hn_softc *sc = ifp->if_softc; 5831 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5832 5833 if (txr->hn_sched_tx) 5834 goto do_sched; 5835 5836 if (mtx_trylock(&txr->hn_tx_lock)) { 5837 int sched; 5838 5839 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5840 mtx_unlock(&txr->hn_tx_lock); 5841 if (!sched) 5842 return; 5843 } 5844 do_sched: 5845 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5846 } 5847 5848 static void 5849 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5850 { 5851 struct hn_tx_ring *txr = xtxr; 5852 5853 mtx_lock(&txr->hn_tx_lock); 5854 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5855 hn_start_locked(txr, 0); 5856 mtx_unlock(&txr->hn_tx_lock); 5857 } 5858 5859 static void 5860 hn_start_txeof(struct hn_tx_ring *txr) 5861 { 5862 struct hn_softc *sc = txr->hn_sc; 5863 struct ifnet *ifp = sc->hn_ifp; 5864 5865 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5866 5867 if (txr->hn_sched_tx) 5868 goto do_sched; 5869 5870 if (mtx_trylock(&txr->hn_tx_lock)) { 5871 int sched; 5872 5873 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5874 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5875 mtx_unlock(&txr->hn_tx_lock); 5876 if (sched) { 5877 taskqueue_enqueue(txr->hn_tx_taskq, 5878 &txr->hn_tx_task); 5879 } 5880 } else { 5881 do_sched: 5882 /* 5883 * Release the OACTIVE earlier, with the hope, that 5884 * others could catch up. 
The task will clear the 5885 * flag again with the hn_tx_lock to avoid possible 5886 * races. 5887 */ 5888 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5889 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5890 } 5891 } 5892 5893 #endif /* HN_IFSTART_SUPPORT */ 5894 5895 static int 5896 hn_xmit(struct hn_tx_ring *txr, int len) 5897 { 5898 struct hn_softc *sc = txr->hn_sc; 5899 struct ifnet *ifp = sc->hn_ifp; 5900 struct mbuf *m_head; 5901 int sched = 0; 5902 5903 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5904 #ifdef HN_IFSTART_SUPPORT 5905 KASSERT(hn_use_if_start == 0, 5906 ("hn_xmit is called, when if_start is enabled")); 5907 #endif 5908 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5909 5910 if (__predict_false(txr->hn_suspended)) 5911 return (0); 5912 5913 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5914 return (0); 5915 5916 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5917 struct hn_txdesc *txd; 5918 int error; 5919 5920 if (len > 0 && m_head->m_pkthdr.len > len) { 5921 /* 5922 * This sending could be time consuming; let callers 5923 * dispatch this packet sending (and sending of any 5924 * following up packets) to tx taskqueue. 5925 */ 5926 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5927 sched = 1; 5928 break; 5929 } 5930 5931 txd = hn_txdesc_get(txr); 5932 if (txd == NULL) { 5933 txr->hn_no_txdescs++; 5934 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5935 txr->hn_oactive = 1; 5936 break; 5937 } 5938 5939 error = hn_encap(ifp, txr, txd, &m_head); 5940 if (error) { 5941 /* Both txd and m_head are freed; discard */ 5942 KASSERT(txr->hn_agg_txd == NULL, 5943 ("encap failed w/ pending aggregating txdesc")); 5944 drbr_advance(ifp, txr->hn_mbuf_br); 5945 continue; 5946 } 5947 5948 if (txr->hn_agg_pktleft == 0) { 5949 if (txr->hn_agg_txd != NULL) { 5950 KASSERT(m_head == NULL, 5951 ("pending mbuf for aggregating txdesc")); 5952 error = hn_flush_txagg(ifp, txr); 5953 if (__predict_false(error)) { 5954 txr->hn_oactive = 1; 5955 break; 5956 } 5957 } else { 5958 KASSERT(m_head != NULL, ("mbuf was freed")); 5959 error = hn_txpkt(ifp, txr, txd); 5960 if (__predict_false(error)) { 5961 /* txd is freed, but m_head is not */ 5962 drbr_putback(ifp, txr->hn_mbuf_br, 5963 m_head); 5964 txr->hn_oactive = 1; 5965 break; 5966 } 5967 } 5968 } 5969 #ifdef INVARIANTS 5970 else { 5971 KASSERT(txr->hn_agg_txd != NULL, 5972 ("no aggregating txdesc")); 5973 KASSERT(m_head == NULL, 5974 ("pending mbuf for aggregating txdesc")); 5975 } 5976 #endif 5977 5978 /* Sent */ 5979 drbr_advance(ifp, txr->hn_mbuf_br); 5980 } 5981 5982 /* Flush pending aggerated transmission. */ 5983 if (txr->hn_agg_txd != NULL) 5984 hn_flush_txagg(ifp, txr); 5985 return (sched); 5986 } 5987 5988 static int 5989 hn_transmit(struct ifnet *ifp, struct mbuf *m) 5990 { 5991 struct hn_softc *sc = ifp->if_softc; 5992 struct hn_tx_ring *txr; 5993 int error, idx = 0; 5994 5995 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5996 struct rm_priotracker pt; 5997 5998 rm_rlock(&sc->hn_vf_lock, &pt); 5999 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6000 struct mbuf *m_bpf = NULL; 6001 int obytes, omcast; 6002 6003 obytes = m->m_pkthdr.len; 6004 omcast = (m->m_flags & M_MCAST) != 0; 6005 6006 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6007 if (bpf_peers_present(ifp->if_bpf)) { 6008 m_bpf = m_copypacket(m, M_NOWAIT); 6009 if (m_bpf == NULL) { 6010 /* 6011 * Failed to grab a shallow 6012 * copy; tap now. 
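 *
 * NOTE:
 * If the shallow copy fails, the packet is tapped
 * right here, before it is handed to the VF;
 * otherwise the copy is tapped below, and only if
 * the VF transmit succeeded, so BPF listeners see
 * each packet at most once either way.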
6013 */ 6014 ETHER_BPF_MTAP(ifp, m); 6015 } 6016 } 6017 } else { 6018 ETHER_BPF_MTAP(ifp, m); 6019 } 6020 6021 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 6022 rm_runlock(&sc->hn_vf_lock, &pt); 6023 6024 if (m_bpf != NULL) { 6025 if (!error) 6026 ETHER_BPF_MTAP(ifp, m_bpf); 6027 m_freem(m_bpf); 6028 } 6029 6030 if (error == ENOBUFS) { 6031 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6032 } else if (error) { 6033 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6034 } else { 6035 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6036 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6037 if (omcast) { 6038 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6039 omcast); 6040 } 6041 } 6042 return (error); 6043 } 6044 rm_runlock(&sc->hn_vf_lock, &pt); 6045 } 6046 6047 #if defined(INET6) || defined(INET) 6048 /* 6049 * Perform TSO packet header fixup or get l2/l3 header length now, 6050 * since packet headers should be cache-hot. 6051 */ 6052 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6053 m = hn_tso_fixup(m); 6054 if (__predict_false(m == NULL)) { 6055 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6056 return EIO; 6057 } 6058 } else if (m->m_pkthdr.csum_flags & 6059 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6060 m = hn_set_hlen(m); 6061 if (__predict_false(m == NULL)) { 6062 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6063 return EIO; 6064 } 6065 } 6066 #endif 6067 6068 /* 6069 * Select the TX ring based on flowid 6070 */ 6071 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6072 #ifdef RSS 6073 uint32_t bid; 6074 6075 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6076 &bid) == 0) 6077 idx = bid % sc->hn_tx_ring_inuse; 6078 else 6079 #endif 6080 { 6081 #if defined(INET6) || defined(INET) 6082 int tcpsyn = 0; 6083 6084 if (m->m_pkthdr.len < 128 && 6085 (m->m_pkthdr.csum_flags & 6086 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6087 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6088 m = hn_check_tcpsyn(m, &tcpsyn); 6089 if (__predict_false(m == NULL)) { 6090 if_inc_counter(ifp, 6091 IFCOUNTER_OERRORS, 1); 6092 return (EIO); 6093 } 6094 } 6095 #else 6096 const int tcpsyn = 0; 6097 #endif 6098 if (tcpsyn) 6099 idx = 0; 6100 else 6101 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6102 } 6103 } 6104 txr = &sc->hn_tx_ring[idx]; 6105 6106 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6107 if (error) { 6108 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6109 return error; 6110 } 6111 6112 if (txr->hn_oactive) 6113 return 0; 6114 6115 if (txr->hn_sched_tx) 6116 goto do_sched; 6117 6118 if (mtx_trylock(&txr->hn_tx_lock)) { 6119 int sched; 6120 6121 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6122 mtx_unlock(&txr->hn_tx_lock); 6123 if (!sched) 6124 return 0; 6125 } 6126 do_sched: 6127 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6128 return 0; 6129 } 6130 6131 static void 6132 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6133 { 6134 struct mbuf *m; 6135 6136 mtx_lock(&txr->hn_tx_lock); 6137 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6138 m_freem(m); 6139 mtx_unlock(&txr->hn_tx_lock); 6140 } 6141 6142 static void 6143 hn_xmit_qflush(struct ifnet *ifp) 6144 { 6145 struct hn_softc *sc = ifp->if_softc; 6146 struct rm_priotracker pt; 6147 int i; 6148 6149 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6150 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6151 if_qflush(ifp); 6152 6153 rm_rlock(&sc->hn_vf_lock, &pt); 6154 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6155 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6156 rm_runlock(&sc->hn_vf_lock, &pt); 6157 } 6158 6159 static void 6160 hn_xmit_txeof(struct 
hn_tx_ring *txr) 6161 { 6162 6163 if (txr->hn_sched_tx) 6164 goto do_sched; 6165 6166 if (mtx_trylock(&txr->hn_tx_lock)) { 6167 int sched; 6168 6169 txr->hn_oactive = 0; 6170 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6171 mtx_unlock(&txr->hn_tx_lock); 6172 if (sched) { 6173 taskqueue_enqueue(txr->hn_tx_taskq, 6174 &txr->hn_tx_task); 6175 } 6176 } else { 6177 do_sched: 6178 /* 6179 * Release the oactive earlier, with the hope, that 6180 * others could catch up. The task will clear the 6181 * oactive again with the hn_tx_lock to avoid possible 6182 * races. 6183 */ 6184 txr->hn_oactive = 0; 6185 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6186 } 6187 } 6188 6189 static void 6190 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6191 { 6192 struct hn_tx_ring *txr = xtxr; 6193 6194 mtx_lock(&txr->hn_tx_lock); 6195 hn_xmit(txr, 0); 6196 mtx_unlock(&txr->hn_tx_lock); 6197 } 6198 6199 static void 6200 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6201 { 6202 struct hn_tx_ring *txr = xtxr; 6203 6204 mtx_lock(&txr->hn_tx_lock); 6205 txr->hn_oactive = 0; 6206 hn_xmit(txr, 0); 6207 mtx_unlock(&txr->hn_tx_lock); 6208 } 6209 6210 static int 6211 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6212 { 6213 struct vmbus_chan_br cbr; 6214 struct hn_rx_ring *rxr; 6215 struct hn_tx_ring *txr = NULL; 6216 int idx, error; 6217 6218 idx = vmbus_chan_subidx(chan); 6219 6220 /* 6221 * Link this channel to RX/TX ring. 6222 */ 6223 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6224 ("invalid channel index %d, should > 0 && < %d", 6225 idx, sc->hn_rx_ring_inuse)); 6226 rxr = &sc->hn_rx_ring[idx]; 6227 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6228 ("RX ring %d already attached", idx)); 6229 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6230 rxr->hn_chan = chan; 6231 6232 if (bootverbose) { 6233 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6234 idx, vmbus_chan_id(chan)); 6235 } 6236 6237 if (idx < sc->hn_tx_ring_inuse) { 6238 txr = &sc->hn_tx_ring[idx]; 6239 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6240 ("TX ring %d already attached", idx)); 6241 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6242 6243 txr->hn_chan = chan; 6244 if (bootverbose) { 6245 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6246 idx, vmbus_chan_id(chan)); 6247 } 6248 } 6249 6250 /* Bind this channel to a proper CPU. */ 6251 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6252 6253 /* 6254 * Open this channel 6255 */ 6256 cbr.cbr = rxr->hn_br; 6257 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; 6258 cbr.cbr_txsz = HN_TXBR_SIZE; 6259 cbr.cbr_rxsz = HN_RXBR_SIZE; 6260 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6261 if (error) { 6262 if (error == EISCONN) { 6263 if_printf(sc->hn_ifp, "bufring is connected after " 6264 "chan%u open failure\n", vmbus_chan_id(chan)); 6265 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6266 } else { 6267 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6268 vmbus_chan_id(chan), error); 6269 } 6270 } 6271 return (error); 6272 } 6273 6274 static void 6275 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6276 { 6277 struct hn_rx_ring *rxr; 6278 int idx, error; 6279 6280 idx = vmbus_chan_subidx(chan); 6281 6282 /* 6283 * Link this channel to RX/TX ring. 
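 *
 * For this detach path that means: resolve the channel sub-index
 * back to the RX/TX ring it was linked to, so the ATTACHED flags
 * can be cleared below.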
6284 */ 6285 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6286 ("invalid channel index %d, should > 0 && < %d", 6287 idx, sc->hn_rx_ring_inuse)); 6288 rxr = &sc->hn_rx_ring[idx]; 6289 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6290 ("RX ring %d is not attached", idx)); 6291 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6292 6293 if (idx < sc->hn_tx_ring_inuse) { 6294 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6295 6296 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6297 ("TX ring %d is not attached attached", idx)); 6298 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6299 } 6300 6301 /* 6302 * Close this channel. 6303 * 6304 * NOTE: 6305 * Channel closing does _not_ destroy the target channel. 6306 */ 6307 error = vmbus_chan_close_direct(chan); 6308 if (error == EISCONN) { 6309 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6310 "after being closed\n", vmbus_chan_id(chan)); 6311 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6312 } else if (error) { 6313 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6314 vmbus_chan_id(chan), error); 6315 } 6316 } 6317 6318 static int 6319 hn_attach_subchans(struct hn_softc *sc) 6320 { 6321 struct vmbus_channel **subchans; 6322 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6323 int i, error = 0; 6324 6325 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6326 6327 /* Attach the sub-channels. */ 6328 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6329 for (i = 0; i < subchan_cnt; ++i) { 6330 int error1; 6331 6332 error1 = hn_chan_attach(sc, subchans[i]); 6333 if (error1) { 6334 error = error1; 6335 /* Move on; all channels will be detached later. */ 6336 } 6337 } 6338 vmbus_subchan_rel(subchans, subchan_cnt); 6339 6340 if (error) { 6341 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6342 } else { 6343 if (bootverbose) { 6344 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6345 subchan_cnt); 6346 } 6347 } 6348 return (error); 6349 } 6350 6351 static void 6352 hn_detach_allchans(struct hn_softc *sc) 6353 { 6354 struct vmbus_channel **subchans; 6355 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6356 int i; 6357 6358 if (subchan_cnt == 0) 6359 goto back; 6360 6361 /* Detach the sub-channels. */ 6362 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6363 for (i = 0; i < subchan_cnt; ++i) 6364 hn_chan_detach(sc, subchans[i]); 6365 vmbus_subchan_rel(subchans, subchan_cnt); 6366 6367 back: 6368 /* 6369 * Detach the primary channel, _after_ all sub-channels 6370 * are detached. 6371 */ 6372 hn_chan_detach(sc, sc->hn_prichan); 6373 6374 /* Wait for sub-channels to be destroyed, if any. */ 6375 vmbus_subchan_drain(sc->hn_prichan); 6376 6377 #ifdef INVARIANTS 6378 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6379 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6380 HN_RX_FLAG_ATTACHED) == 0, 6381 ("%dth RX ring is still attached", i)); 6382 } 6383 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6384 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6385 HN_TX_FLAG_ATTACHED) == 0, 6386 ("%dth TX ring is still attached", i)); 6387 } 6388 #endif 6389 } 6390 6391 static int 6392 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6393 { 6394 struct vmbus_channel **subchans; 6395 int nchan, rxr_cnt, error; 6396 6397 nchan = *nsubch + 1; 6398 if (nchan == 1) { 6399 /* 6400 * Multiple RX/TX rings are not requested. 6401 */ 6402 *nsubch = 0; 6403 return (0); 6404 } 6405 6406 /* 6407 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6408 * table entries. 
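 *
 * NOTE:
 * The usable channel count ends up being min(requested, offered);
 * e.g. with 8 channels requested and 4 RX rings offered by RNDIS:
 *
 *	nchan  = min(8, 4) = 4
 *	nsubch = nchan - 1 = 3
 *
 * and hn_nvs_alloc_subchans() below may still trim nsubch further,
 * down to whatever NVS actually grants.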
6409 */ 6410 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6411 if (error) { 6412 /* No RSS; this is benign. */ 6413 *nsubch = 0; 6414 return (0); 6415 } 6416 if (bootverbose) { 6417 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6418 rxr_cnt, nchan); 6419 } 6420 6421 if (nchan > rxr_cnt) 6422 nchan = rxr_cnt; 6423 if (nchan == 1) { 6424 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6425 *nsubch = 0; 6426 return (0); 6427 } 6428 6429 /* 6430 * Allocate sub-channels from NVS. 6431 */ 6432 *nsubch = nchan - 1; 6433 error = hn_nvs_alloc_subchans(sc, nsubch); 6434 if (error || *nsubch == 0) { 6435 /* Failed to allocate sub-channels. */ 6436 *nsubch = 0; 6437 return (0); 6438 } 6439 6440 /* 6441 * Wait for all sub-channels to become ready before moving on. 6442 */ 6443 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6444 vmbus_subchan_rel(subchans, *nsubch); 6445 return (0); 6446 } 6447 6448 static bool 6449 hn_synth_attachable(const struct hn_softc *sc) 6450 { 6451 int i; 6452 6453 if (sc->hn_flags & HN_FLAG_ERRORS) 6454 return (false); 6455 6456 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6457 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6458 6459 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6460 return (false); 6461 } 6462 return (true); 6463 } 6464 6465 /* 6466 * Make sure that the RX filter is zero after the successful 6467 * RNDIS initialization. 6468 * 6469 * NOTE: 6470 * Under certain conditions on certain versions of Hyper-V, 6471 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6472 * after the successful RNDIS initialization, which breaks 6473 * the assumption of any following code (well, it breaks the 6474 * RNDIS API contract actually). Clear the RNDIS rxfilter 6475 * explicitly, drain packets sneaking through, and drain the 6476 * interrupt taskqueues scheduled due to the stealth packets. 6477 */ 6478 static void 6479 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6480 { 6481 6482 hn_disable_rx(sc); 6483 hn_drain_rxtx(sc, nchan); 6484 } 6485 6486 static int 6487 hn_synth_attach(struct hn_softc *sc, int mtu) 6488 { 6489 #define ATTACHED_NVS 0x0002 6490 #define ATTACHED_RNDIS 0x0004 6491 6492 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6493 int error, nsubch, nchan = 1, i, rndis_inited; 6494 uint32_t old_caps, attached = 0; 6495 6496 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6497 ("synthetic parts were attached")); 6498 6499 if (!hn_synth_attachable(sc)) 6500 return (ENXIO); 6501 6502 /* Save capabilities for later verification. */ 6503 old_caps = sc->hn_caps; 6504 sc->hn_caps = 0; 6505 6506 /* Clear RSS stuffs. */ 6507 sc->hn_rss_ind_size = 0; 6508 sc->hn_rss_hash = 0; 6509 sc->hn_rss_hcap = 0; 6510 6511 /* 6512 * Attach the primary channel _before_ attaching NVS and RNDIS. 6513 */ 6514 error = hn_chan_attach(sc, sc->hn_prichan); 6515 if (error) 6516 goto failed; 6517 6518 /* 6519 * Attach NVS. 6520 */ 6521 error = hn_nvs_attach(sc, mtu); 6522 if (error) 6523 goto failed; 6524 attached |= ATTACHED_NVS; 6525 6526 /* 6527 * Attach RNDIS _after_ NVS is attached. 6528 */ 6529 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6530 if (rndis_inited) 6531 attached |= ATTACHED_RNDIS; 6532 if (error) 6533 goto failed; 6534 6535 /* 6536 * Make sure capabilities are not changed. 
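 *
 * NOTE:
 * This check only matters when the synthetic parts are re-attached
 * at run time, i.e. the device is already attached; during the
 * initial attach the freshly probed capabilities are simply
 * accepted.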
6537 */ 6538 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6539 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6540 old_caps, sc->hn_caps); 6541 error = ENXIO; 6542 goto failed; 6543 } 6544 6545 /* 6546 * Allocate sub-channels for multi-TX/RX rings. 6547 * 6548 * NOTE: 6549 * The # of RX rings that can be used is equivalent to the # of 6550 * channels to be requested. 6551 */ 6552 nsubch = sc->hn_rx_ring_cnt - 1; 6553 error = hn_synth_alloc_subchans(sc, &nsubch); 6554 if (error) 6555 goto failed; 6556 /* NOTE: _Full_ synthetic parts detach is required now. */ 6557 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6558 6559 /* 6560 * Set the # of TX/RX rings that could be used according to 6561 * the # of channels that NVS offered. 6562 */ 6563 nchan = nsubch + 1; 6564 hn_set_ring_inuse(sc, nchan); 6565 if (nchan == 1) { 6566 /* Only the primary channel can be used; done */ 6567 goto back; 6568 } 6569 6570 /* 6571 * Attach the sub-channels. 6572 * 6573 * NOTE: hn_set_ring_inuse() _must_ have been called. 6574 */ 6575 error = hn_attach_subchans(sc); 6576 if (error) 6577 goto failed; 6578 6579 /* 6580 * Configure RSS key and indirect table _after_ all sub-channels 6581 * are attached. 6582 */ 6583 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6584 /* 6585 * RSS key is not set yet; set it to the default RSS key. 6586 */ 6587 if (bootverbose) 6588 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6589 #ifdef RSS 6590 rss_getkey(rss->rss_key); 6591 #else 6592 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6593 #endif 6594 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6595 } 6596 6597 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6598 /* 6599 * RSS indirect table is not set yet; set it up in round- 6600 * robin fashion. 6601 */ 6602 if (bootverbose) { 6603 if_printf(sc->hn_ifp, "setup default RSS indirect " 6604 "table\n"); 6605 } 6606 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6607 uint32_t subidx; 6608 6609 #ifdef RSS 6610 subidx = rss_get_indirection_to_bucket(i); 6611 #else 6612 subidx = i; 6613 #endif 6614 rss->rss_ind[i] = subidx % nchan; 6615 } 6616 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6617 } else { 6618 /* 6619 * # of usable channels may be changed, so we have to 6620 * make sure that all entries in RSS indirect table 6621 * are valid. 6622 * 6623 * NOTE: hn_set_ring_inuse() _must_ have been called. 6624 */ 6625 hn_rss_ind_fixup(sc); 6626 } 6627 6628 sc->hn_rss_hash = sc->hn_rss_hcap; 6629 if ((sc->hn_flags & HN_FLAG_RXVF) || 6630 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6631 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6632 hn_vf_rss_fixup(sc, false); 6633 } 6634 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6635 if (error) 6636 goto failed; 6637 back: 6638 /* 6639 * Fixup transmission aggregation setup. 6640 */ 6641 hn_set_txagg(sc); 6642 hn_rndis_init_fixat(sc, nchan); 6643 return (0); 6644 6645 failed: 6646 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6647 hn_rndis_init_fixat(sc, nchan); 6648 hn_synth_detach(sc); 6649 } else { 6650 if (attached & ATTACHED_RNDIS) { 6651 hn_rndis_init_fixat(sc, nchan); 6652 hn_rndis_detach(sc); 6653 } 6654 if (attached & ATTACHED_NVS) 6655 hn_nvs_detach(sc); 6656 hn_chan_detach(sc, sc->hn_prichan); 6657 /* Restore old capabilities. */ 6658 sc->hn_caps = old_caps; 6659 } 6660 return (error); 6661 6662 #undef ATTACHED_RNDIS 6663 #undef ATTACHED_NVS 6664 } 6665 6666 /* 6667 * NOTE: 6668 * The interface must have been suspended though hn_suspend(), before 6669 * this function get called. 
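 *
 * Teardown order: RNDIS first, then NVS, then all channels.  On
 * post-Win2016 hosts the RXBUF and chimney sending buffer GPADLs
 * are disconnected from the primary channel only after everything
 * else has been detached.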
6670 */ 6671 static void 6672 hn_synth_detach(struct hn_softc *sc) 6673 { 6674 6675 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6676 ("synthetic parts were not attached")); 6677 6678 /* Detach the RNDIS first. */ 6679 hn_rndis_detach(sc); 6680 6681 /* Detach NVS. */ 6682 hn_nvs_detach(sc); 6683 6684 /* Detach all of the channels. */ 6685 hn_detach_allchans(sc); 6686 6687 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6688 /* 6689 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6690 */ 6691 int error; 6692 6693 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6694 sc->hn_rxbuf_gpadl); 6695 if (error) { 6696 if_printf(sc->hn_ifp, 6697 "rxbuf gpadl disconn failed: %d\n", error); 6698 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6699 } 6700 sc->hn_rxbuf_gpadl = 0; 6701 } 6702 6703 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6704 /* 6705 * Host is post-Win2016, disconnect chimney sending buffer from 6706 * primary channel here. 6707 */ 6708 int error; 6709 6710 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6711 sc->hn_chim_gpadl); 6712 if (error) { 6713 if_printf(sc->hn_ifp, 6714 "chim gpadl disconn failed: %d\n", error); 6715 sc->hn_flags |= HN_FLAG_CHIM_REF; 6716 } 6717 sc->hn_chim_gpadl = 0; 6718 } 6719 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6720 } 6721 6722 static void 6723 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6724 { 6725 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6726 ("invalid ring count %d", ring_cnt)); 6727 6728 if (sc->hn_tx_ring_cnt > ring_cnt) 6729 sc->hn_tx_ring_inuse = ring_cnt; 6730 else 6731 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6732 sc->hn_rx_ring_inuse = ring_cnt; 6733 6734 #ifdef RSS 6735 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6736 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6737 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6738 rss_getnumbuckets()); 6739 } 6740 #endif 6741 6742 if (bootverbose) { 6743 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6744 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6745 } 6746 } 6747 6748 static void 6749 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6750 { 6751 6752 /* 6753 * NOTE: 6754 * The TX bufring will not be drained by the hypervisor, 6755 * if the primary channel is revoked. 6756 */ 6757 while (!vmbus_chan_rx_empty(chan) || 6758 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6759 !vmbus_chan_tx_empty(chan))) 6760 pause("waitch", 1); 6761 vmbus_chan_intr_drain(chan); 6762 } 6763 6764 static void 6765 hn_disable_rx(struct hn_softc *sc) 6766 { 6767 6768 /* 6769 * Disable RX by clearing RX filter forcefully. 6770 */ 6771 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6772 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6773 6774 /* 6775 * Give RNDIS enough time to flush all pending data packets. 6776 */ 6777 pause("waitrx", (200 * hz) / 1000); 6778 } 6779 6780 /* 6781 * NOTE: 6782 * RX/TX _must_ have been suspended/disabled, before this function 6783 * is called. 6784 */ 6785 static void 6786 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6787 { 6788 struct vmbus_channel **subch = NULL; 6789 int nsubch; 6790 6791 /* 6792 * Drain RX/TX bufrings and interrupts. 
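 *
 * Sub-channels are drained first, then the primary channel.
 * hn_chan_drain() polls until the channel's RX bufring (and its TX
 * bufring, unless the primary channel has been revoked) is empty,
 * then drains the channel's interrupt task.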
6793 */ 6794 nsubch = nchan - 1; 6795 if (nsubch > 0) 6796 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6797 6798 if (subch != NULL) { 6799 int i; 6800 6801 for (i = 0; i < nsubch; ++i) 6802 hn_chan_drain(sc, subch[i]); 6803 } 6804 hn_chan_drain(sc, sc->hn_prichan); 6805 6806 if (subch != NULL) 6807 vmbus_subchan_rel(subch, nsubch); 6808 } 6809 6810 static void 6811 hn_suspend_data(struct hn_softc *sc) 6812 { 6813 struct hn_tx_ring *txr; 6814 int i; 6815 6816 HN_LOCK_ASSERT(sc); 6817 6818 /* 6819 * Suspend TX. 6820 */ 6821 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6822 txr = &sc->hn_tx_ring[i]; 6823 6824 mtx_lock(&txr->hn_tx_lock); 6825 txr->hn_suspended = 1; 6826 mtx_unlock(&txr->hn_tx_lock); 6827 /* No one is able send more packets now. */ 6828 6829 /* 6830 * Wait for all pending sends to finish. 6831 * 6832 * NOTE: 6833 * We will _not_ receive all pending send-done, if the 6834 * primary channel is revoked. 6835 */ 6836 while (hn_tx_ring_pending(txr) && 6837 !vmbus_chan_is_revoked(sc->hn_prichan)) 6838 pause("hnwtx", 1 /* 1 tick */); 6839 } 6840 6841 /* 6842 * Disable RX. 6843 */ 6844 hn_disable_rx(sc); 6845 6846 /* 6847 * Drain RX/TX. 6848 */ 6849 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6850 6851 /* 6852 * Drain any pending TX tasks. 6853 * 6854 * NOTE: 6855 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6856 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6857 */ 6858 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6859 txr = &sc->hn_tx_ring[i]; 6860 6861 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6862 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6863 } 6864 } 6865 6866 static void 6867 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6868 { 6869 6870 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6871 } 6872 6873 static void 6874 hn_suspend_mgmt(struct hn_softc *sc) 6875 { 6876 struct task task; 6877 6878 HN_LOCK_ASSERT(sc); 6879 6880 /* 6881 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6882 * through hn_mgmt_taskq. 6883 */ 6884 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6885 vmbus_chan_run_task(sc->hn_prichan, &task); 6886 6887 /* 6888 * Make sure that all pending management tasks are completed. 6889 */ 6890 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6891 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6892 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6893 } 6894 6895 static void 6896 hn_suspend(struct hn_softc *sc) 6897 { 6898 6899 /* Disable polling. */ 6900 hn_polling(sc, 0); 6901 6902 /* 6903 * If the non-transparent mode VF is activated, the synthetic 6904 * device is receiving packets, so the data path of the 6905 * synthetic device must be suspended. 6906 */ 6907 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 6908 (sc->hn_flags & HN_FLAG_RXVF)) 6909 hn_suspend_data(sc); 6910 hn_suspend_mgmt(sc); 6911 } 6912 6913 static void 6914 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6915 { 6916 int i; 6917 6918 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6919 ("invalid TX ring count %d", tx_ring_cnt)); 6920 6921 for (i = 0; i < tx_ring_cnt; ++i) { 6922 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6923 6924 mtx_lock(&txr->hn_tx_lock); 6925 txr->hn_suspended = 0; 6926 mtx_unlock(&txr->hn_tx_lock); 6927 } 6928 } 6929 6930 static void 6931 hn_resume_data(struct hn_softc *sc) 6932 { 6933 int i; 6934 6935 HN_LOCK_ASSERT(sc); 6936 6937 /* 6938 * Re-enable RX. 
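 *
 * Reinstall the RX filter; hn_disable_rx() cleared it to
 * NDIS_PACKET_TYPE_NONE during suspend.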
6939 */ 6940 hn_rxfilter_config(sc); 6941 6942 /* 6943 * Make sure to clear suspend status on "all" TX rings, 6944 * since hn_tx_ring_inuse can be changed after 6945 * hn_suspend_data(). 6946 */ 6947 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6948 6949 #ifdef HN_IFSTART_SUPPORT 6950 if (!hn_use_if_start) 6951 #endif 6952 { 6953 /* 6954 * Flush unused drbrs, since hn_tx_ring_inuse may be 6955 * reduced. 6956 */ 6957 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6958 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6959 } 6960 6961 /* 6962 * Kick start TX. 6963 */ 6964 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6965 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6966 6967 /* 6968 * Use txeof task, so that any pending oactive can be 6969 * cleared properly. 6970 */ 6971 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6972 } 6973 } 6974 6975 static void 6976 hn_resume_mgmt(struct hn_softc *sc) 6977 { 6978 6979 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6980 6981 /* 6982 * Kick off network change detection, if it was pending. 6983 * If no network change was pending, start link status 6984 * checks, which is more lightweight than network change 6985 * detection. 6986 */ 6987 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6988 hn_change_network(sc); 6989 else 6990 hn_update_link_status(sc); 6991 } 6992 6993 static void 6994 hn_resume(struct hn_softc *sc) 6995 { 6996 6997 /* 6998 * If the non-transparent mode VF is activated, the synthetic 6999 * device have to receive packets, so the data path of the 7000 * synthetic device must be resumed. 7001 */ 7002 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || 7003 (sc->hn_flags & HN_FLAG_RXVF)) 7004 hn_resume_data(sc); 7005 7006 /* 7007 * Don't resume link status change if VF is attached/activated. 7008 * - In the non-transparent VF mode, the synthetic device marks 7009 * link down until the VF is deactivated; i.e. VF is down. 7010 * - In transparent VF mode, VF's media status is used until 7011 * the VF is detached. 7012 */ 7013 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7014 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7015 hn_resume_mgmt(sc); 7016 7017 /* 7018 * Re-enable polling if this interface is running and 7019 * the polling is requested. 7020 */ 7021 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7022 hn_polling(sc, sc->hn_pollhz); 7023 } 7024 7025 static void 7026 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7027 { 7028 const struct rndis_status_msg *msg; 7029 int ofs; 7030 7031 if (dlen < sizeof(*msg)) { 7032 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7033 return; 7034 } 7035 msg = data; 7036 7037 switch (msg->rm_status) { 7038 case RNDIS_STATUS_MEDIA_CONNECT: 7039 case RNDIS_STATUS_MEDIA_DISCONNECT: 7040 hn_update_link_status(sc); 7041 break; 7042 7043 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7044 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7045 /* Not really useful; ignore. 
*/ 7046 break; 7047 7048 case RNDIS_STATUS_NETWORK_CHANGE: 7049 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7050 if (dlen < ofs + msg->rm_stbuflen || 7051 msg->rm_stbuflen < sizeof(uint32_t)) { 7052 if_printf(sc->hn_ifp, "network changed\n"); 7053 } else { 7054 uint32_t change; 7055 7056 memcpy(&change, ((const uint8_t *)msg) + ofs, 7057 sizeof(change)); 7058 if_printf(sc->hn_ifp, "network changed, change %u\n", 7059 change); 7060 } 7061 hn_change_network(sc); 7062 break; 7063 7064 default: 7065 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7066 msg->rm_status); 7067 break; 7068 } 7069 } 7070 7071 static int 7072 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7073 { 7074 const struct rndis_pktinfo *pi = info_data; 7075 uint32_t mask = 0; 7076 7077 while (info_dlen != 0) { 7078 const void *data; 7079 uint32_t dlen; 7080 7081 if (__predict_false(info_dlen < sizeof(*pi))) 7082 return (EINVAL); 7083 if (__predict_false(info_dlen < pi->rm_size)) 7084 return (EINVAL); 7085 info_dlen -= pi->rm_size; 7086 7087 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7088 return (EINVAL); 7089 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7090 return (EINVAL); 7091 dlen = pi->rm_size - pi->rm_pktinfooffset; 7092 data = pi->rm_data; 7093 7094 if (pi->rm_internal == 1) { 7095 switch (pi->rm_type) { 7096 case NDIS_PKTINFO_IT_PKTINFO_ID: 7097 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7098 return (EINVAL); 7099 info->pktinfo_id = 7100 (const struct packet_info_id *)data; 7101 mask |= HN_RXINFO_PKTINFO_ID; 7102 break; 7103 7104 default: 7105 goto next; 7106 } 7107 } else { 7108 switch (pi->rm_type) { 7109 case NDIS_PKTINFO_TYPE_VLAN: 7110 if (__predict_false(dlen 7111 < NDIS_VLAN_INFO_SIZE)) 7112 return (EINVAL); 7113 info->vlan_info = (const uint32_t *)data; 7114 mask |= HN_RXINFO_VLAN; 7115 break; 7116 7117 case NDIS_PKTINFO_TYPE_CSUM: 7118 if (__predict_false(dlen 7119 < NDIS_RXCSUM_INFO_SIZE)) 7120 return (EINVAL); 7121 info->csum_info = (const uint32_t *)data; 7122 mask |= HN_RXINFO_CSUM; 7123 break; 7124 7125 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7126 if (__predict_false(dlen 7127 < HN_NDIS_HASH_VALUE_SIZE)) 7128 return (EINVAL); 7129 info->hash_value = (const uint32_t *)data; 7130 mask |= HN_RXINFO_HASHVAL; 7131 break; 7132 7133 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7134 if (__predict_false(dlen 7135 < HN_NDIS_HASH_INFO_SIZE)) 7136 return (EINVAL); 7137 info->hash_info = (const uint32_t *)data; 7138 mask |= HN_RXINFO_HASHINF; 7139 break; 7140 7141 default: 7142 goto next; 7143 } 7144 } 7145 7146 if (mask == HN_RXINFO_ALL) { 7147 /* All found; done */ 7148 break; 7149 } 7150 next: 7151 pi = (const struct rndis_pktinfo *) 7152 ((const uint8_t *)pi + pi->rm_size); 7153 } 7154 7155 /* 7156 * Final fixup. 7157 * - If there is no hash value, invalidate the hash info. 
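 *
 * NOTE:
 * The loop above walks the per-packet-info area as a chain of
 * variable sized records; for each record
 *
 *	dlen = pi->rm_size - pi->rm_pktinfooffset;
 *	data = pi->rm_data;
 *
 * selects the payload, rm_type/rm_internal select which HN_RXINFO_*
 * slot it fills, and the walk advances by pi->rm_size until either
 * the buffer or HN_RXINFO_ALL is exhausted.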
7158 */ 7159 if ((mask & HN_RXINFO_HASHVAL) == 0) 7160 info->hash_info = NULL; 7161 return (0); 7162 } 7163 7164 static __inline bool 7165 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7166 { 7167 7168 if (off < check_off) { 7169 if (__predict_true(off + len <= check_off)) 7170 return (false); 7171 } else if (off > check_off) { 7172 if (__predict_true(check_off + check_len <= off)) 7173 return (false); 7174 } 7175 return (true); 7176 } 7177 7178 static __inline void 7179 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7180 uint32_t len, struct hn_rxinfo *info) 7181 { 7182 uint32_t cnt = rxr->rsc.cnt; 7183 7184 if (cnt) { 7185 rxr->rsc.pktlen += len; 7186 } else { 7187 rxr->rsc.vlan_info = info->vlan_info; 7188 rxr->rsc.csum_info = info->csum_info; 7189 rxr->rsc.hash_info = info->hash_info; 7190 rxr->rsc.hash_value = info->hash_value; 7191 rxr->rsc.pktlen = len; 7192 } 7193 7194 rxr->rsc.frag_data[cnt] = data; 7195 rxr->rsc.frag_len[cnt] = len; 7196 rxr->rsc.cnt++; 7197 } 7198 7199 static void 7200 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7201 { 7202 const struct rndis_packet_msg *pkt; 7203 struct hn_rxinfo info; 7204 int data_off, pktinfo_off, data_len, pktinfo_len; 7205 bool rsc_more= false; 7206 7207 /* 7208 * Check length. 7209 */ 7210 if (__predict_false(dlen < sizeof(*pkt))) { 7211 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7212 return; 7213 } 7214 pkt = data; 7215 7216 if (__predict_false(dlen < pkt->rm_len)) { 7217 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7218 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7219 return; 7220 } 7221 if (__predict_false(pkt->rm_len < 7222 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7223 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7224 "msglen %u, data %u, oob %u, pktinfo %u\n", 7225 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7226 pkt->rm_pktinfolen); 7227 return; 7228 } 7229 if (__predict_false(pkt->rm_datalen == 0)) { 7230 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7231 return; 7232 } 7233 7234 /* 7235 * Check offests. 7236 */ 7237 #define IS_OFFSET_INVALID(ofs) \ 7238 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7239 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7240 7241 /* XXX Hyper-V does not meet data offset alignment requirement */ 7242 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7243 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7244 "data offset %u\n", pkt->rm_dataoffset); 7245 return; 7246 } 7247 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7248 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7249 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7250 "oob offset %u\n", pkt->rm_oobdataoffset); 7251 return; 7252 } 7253 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7254 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7255 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7256 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7257 return; 7258 } 7259 7260 #undef IS_OFFSET_INVALID 7261 7262 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7263 data_len = pkt->rm_datalen; 7264 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7265 pktinfo_len = pkt->rm_pktinfolen; 7266 7267 /* 7268 * Check OOB coverage. 
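 *
 * The OOB region, if present, must fit within rm_len and must not
 * overlap the data region or the pktinfo region;
 * hn_rndis_check_overlap() returns false only when the two ranges
 * are completely disjoint.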
7269 */ 7270 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7271 int oob_off, oob_len; 7272 7273 if_printf(rxr->hn_ifp, "got oobdata\n"); 7274 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7275 oob_len = pkt->rm_oobdatalen; 7276 7277 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7278 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7279 "oob overflow, msglen %u, oob abs %d len %d\n", 7280 pkt->rm_len, oob_off, oob_len); 7281 return; 7282 } 7283 7284 /* 7285 * Check against data. 7286 */ 7287 if (hn_rndis_check_overlap(oob_off, oob_len, 7288 data_off, data_len)) { 7289 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7290 "oob overlaps data, oob abs %d len %d, " 7291 "data abs %d len %d\n", 7292 oob_off, oob_len, data_off, data_len); 7293 return; 7294 } 7295 7296 /* 7297 * Check against pktinfo. 7298 */ 7299 if (pktinfo_len != 0 && 7300 hn_rndis_check_overlap(oob_off, oob_len, 7301 pktinfo_off, pktinfo_len)) { 7302 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7303 "oob overlaps pktinfo, oob abs %d len %d, " 7304 "pktinfo abs %d len %d\n", 7305 oob_off, oob_len, pktinfo_off, pktinfo_len); 7306 return; 7307 } 7308 } 7309 7310 /* 7311 * Check per-packet-info coverage and find useful per-packet-info. 7312 */ 7313 info.vlan_info = NULL; 7314 info.csum_info = NULL; 7315 info.hash_info = NULL; 7316 info.pktinfo_id = NULL; 7317 7318 if (__predict_true(pktinfo_len != 0)) { 7319 bool overlap; 7320 int error; 7321 7322 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7323 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7324 "pktinfo overflow, msglen %u, " 7325 "pktinfo abs %d len %d\n", 7326 pkt->rm_len, pktinfo_off, pktinfo_len); 7327 return; 7328 } 7329 7330 /* 7331 * Check packet info coverage. 7332 */ 7333 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7334 data_off, data_len); 7335 if (__predict_false(overlap)) { 7336 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7337 "pktinfo overlap data, pktinfo abs %d len %d, " 7338 "data abs %d len %d\n", 7339 pktinfo_off, pktinfo_len, data_off, data_len); 7340 return; 7341 } 7342 7343 /* 7344 * Find useful per-packet-info. 
7345 */ 7346 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7347 pktinfo_len, &info); 7348 if (__predict_false(error)) { 7349 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7350 "pktinfo\n"); 7351 return; 7352 } 7353 } 7354 7355 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7356 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7357 "data overflow, msglen %u, data abs %d len %d\n", 7358 pkt->rm_len, data_off, data_len); 7359 return; 7360 } 7361 7362 /* Identify RSC fragments, drop invalid packets */ 7363 if ((info.pktinfo_id != NULL) && 7364 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7365 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7366 rxr->rsc.cnt = 0; 7367 rxr->hn_rsc_pkts++; 7368 } else if (rxr->rsc.cnt == 0) 7369 goto drop; 7370 7371 rsc_more = true; 7372 7373 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7374 rsc_more = false; 7375 7376 if (rsc_more && rxr->rsc.is_last) 7377 goto drop; 7378 } else { 7379 rxr->rsc.cnt = 0; 7380 } 7381 7382 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7383 goto drop; 7384 7385 /* Store data in per rx ring structure */ 7386 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7387 data_len, &info); 7388 7389 if (rsc_more) 7390 return; 7391 7392 hn_rxpkt(rxr); 7393 rxr->rsc.cnt = 0; 7394 return; 7395 drop: 7396 rxr->hn_rsc_drop++; 7397 return; 7398 } 7399 7400 static __inline void 7401 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7402 { 7403 const struct rndis_msghdr *hdr; 7404 7405 if (__predict_false(dlen < sizeof(*hdr))) { 7406 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7407 return; 7408 } 7409 hdr = data; 7410 7411 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7412 /* Hot data path. */ 7413 hn_rndis_rx_data(rxr, data, dlen); 7414 /* Done! */ 7415 return; 7416 } 7417 7418 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7419 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); 7420 else 7421 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); 7422 } 7423 7424 static void 7425 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7426 { 7427 const struct hn_nvs_hdr *hdr; 7428 7429 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7430 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7431 return; 7432 } 7433 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7434 7435 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7436 /* Useless; ignore */ 7437 return; 7438 } 7439 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7440 } 7441 7442 static void 7443 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7444 const struct vmbus_chanpkt_hdr *pkt) 7445 { 7446 struct hn_nvs_sendctx *sndc; 7447 7448 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7449 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7450 VMBUS_CHANPKT_DATALEN(pkt)); 7451 /* 7452 * NOTE: 7453 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7454 * its callback. 7455 */ 7456 } 7457 7458 static void 7459 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7460 const struct vmbus_chanpkt_hdr *pkthdr) 7461 { 7462 const struct vmbus_chanpkt_rxbuf *pkt; 7463 const struct hn_nvs_hdr *nvs_hdr; 7464 int count, i, hlen; 7465 7466 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7467 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7468 return; 7469 } 7470 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7471 7472 /* Make sure that this is a RNDIS message. 
*/ 7473 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7474 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7475 nvs_hdr->nvs_type); 7476 return; 7477 } 7478 7479 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7480 if (__predict_false(hlen < sizeof(*pkt))) { 7481 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7482 return; 7483 } 7484 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7485 7486 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7487 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7488 pkt->cp_rxbuf_id); 7489 return; 7490 } 7491 7492 count = pkt->cp_rxbuf_cnt; 7493 if (__predict_false(hlen < 7494 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7495 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7496 return; 7497 } 7498 7499 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7500 for (i = 0; i < count; ++i) { 7501 int ofs, len; 7502 7503 ofs = pkt->cp_rxbuf[i].rb_ofs; 7504 len = pkt->cp_rxbuf[i].rb_len; 7505 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7506 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7507 "ofs %d, len %d\n", i, ofs, len); 7508 continue; 7509 } 7510 7511 rxr->rsc.is_last = (i == (count - 1)); 7512 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7513 } 7514 7515 /* 7516 * Ack the consumed RXBUF associated w/ this channel packet, 7517 * so that this RXBUF can be recycled by the hypervisor. 7518 */ 7519 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7520 } 7521 7522 static void 7523 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7524 uint64_t tid) 7525 { 7526 struct hn_nvs_rndis_ack ack; 7527 int retries, error; 7528 7529 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7530 ack.nvs_status = HN_NVS_STATUS_OK; 7531 7532 retries = 0; 7533 again: 7534 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7535 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7536 if (__predict_false(error == EAGAIN)) { 7537 /* 7538 * NOTE: 7539 * This should _not_ happen in real world, since the 7540 * consumption of the TX bufring from the TX path is 7541 * controlled. 7542 */ 7543 if (rxr->hn_ack_failed == 0) 7544 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7545 rxr->hn_ack_failed++; 7546 retries++; 7547 if (retries < 10) { 7548 DELAY(100); 7549 goto again; 7550 } 7551 /* RXBUF leaks! */ 7552 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7553 } 7554 } 7555 7556 static void 7557 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7558 { 7559 struct hn_rx_ring *rxr = xrxr; 7560 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7561 7562 for (;;) { 7563 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7564 int error, pktlen; 7565 7566 pktlen = rxr->hn_pktbuf_len; 7567 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7568 if (__predict_false(error == ENOBUFS)) { 7569 void *nbuf; 7570 int nlen; 7571 7572 /* 7573 * Expand channel packet buffer. 7574 * 7575 * XXX 7576 * Use M_WAITOK here, since allocation failure 7577 * is fatal. 7578 */ 7579 nlen = rxr->hn_pktbuf_len * 2; 7580 while (nlen < pktlen) 7581 nlen *= 2; 7582 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7583 7584 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7585 rxr->hn_pktbuf_len, nlen); 7586 7587 free(rxr->hn_pktbuf, M_DEVBUF); 7588 rxr->hn_pktbuf = nbuf; 7589 rxr->hn_pktbuf_len = nlen; 7590 /* Retry! */ 7591 continue; 7592 } else if (__predict_false(error == EAGAIN)) { 7593 /* No more channel packets; done! 
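 *
 * NOTE:
 * EAGAIN from vmbus_chan_recv_pkt() simply means the RX
 * bufring is empty, which is the normal way out of this
 * loop; ENOBUFS is handled above by doubling hn_pktbuf
 * until the pending packet fits and then retrying.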
*/ 7594 break; 7595 } 7596 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7597 7598 switch (pkt->cph_type) { 7599 case VMBUS_CHANPKT_TYPE_COMP: 7600 hn_nvs_handle_comp(sc, chan, pkt); 7601 break; 7602 7603 case VMBUS_CHANPKT_TYPE_RXBUF: 7604 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7605 break; 7606 7607 case VMBUS_CHANPKT_TYPE_INBAND: 7608 hn_nvs_handle_notify(sc, pkt); 7609 break; 7610 7611 default: 7612 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7613 pkt->cph_type); 7614 break; 7615 } 7616 } 7617 hn_chan_rollup(rxr, rxr->hn_txr); 7618 } 7619 7620 static void 7621 hn_sysinit(void *arg __unused) 7622 { 7623 int i; 7624 7625 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7626 7627 #ifdef HN_IFSTART_SUPPORT 7628 /* 7629 * Don't use ifnet.if_start if transparent VF mode is requested; 7630 * mainly due to the IFF_DRV_OACTIVE flag. 7631 */ 7632 if (hn_xpnt_vf && hn_use_if_start) { 7633 hn_use_if_start = 0; 7634 printf("hn: tranparent VF mode, if_transmit will be used, " 7635 "instead of if_start\n"); 7636 } 7637 #endif 7638 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7639 printf("hn: invalid transparent VF attach routing " 7640 "wait timeout %d, reset to %d\n", 7641 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7642 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7643 } 7644 7645 /* 7646 * Initialize VF map. 7647 */ 7648 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7649 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7650 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7651 M_WAITOK | M_ZERO); 7652 7653 /* 7654 * Fix the # of TX taskqueues. 7655 */ 7656 if (hn_tx_taskq_cnt <= 0) 7657 hn_tx_taskq_cnt = 1; 7658 else if (hn_tx_taskq_cnt > mp_ncpus) 7659 hn_tx_taskq_cnt = mp_ncpus; 7660 7661 /* 7662 * Fix the TX taskqueue mode. 7663 */ 7664 switch (hn_tx_taskq_mode) { 7665 case HN_TX_TASKQ_M_INDEP: 7666 case HN_TX_TASKQ_M_GLOBAL: 7667 case HN_TX_TASKQ_M_EVTTQ: 7668 break; 7669 default: 7670 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7671 break; 7672 } 7673 7674 if (vm_guest != VM_GUEST_HV) 7675 return; 7676 7677 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7678 return; 7679 7680 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7681 M_DEVBUF, M_WAITOK); 7682 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7683 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7684 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7685 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7686 "hn tx%d", i); 7687 } 7688 } 7689 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7690 7691 static void 7692 hn_sysuninit(void *arg __unused) 7693 { 7694 7695 if (hn_tx_taskque != NULL) { 7696 int i; 7697 7698 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7699 taskqueue_free(hn_tx_taskque[i]); 7700 free(hn_tx_taskque, M_DEVBUF); 7701 } 7702 7703 if (hn_vfmap != NULL) 7704 free(hn_vfmap, M_DEVBUF); 7705 rm_destroy(&hn_vfmap_lock); 7706 7707 counter_u64_free(hn_udpcs_fixup); 7708 } 7709 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7710
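/*
 * NOTE:
 * hn_sysinit() creates the shared "hn_tx" taskqueues only when running
 * on Hyper-V and only for the HN_TX_TASKQ_M_GLOBAL mode; in the other
 * modes no module-wide taskqueues are needed, so hn_tx_taskque stays
 * NULL and hn_sysuninit() has nothing to free there.
 */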