1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <machine/atomic.h> 89 #include <machine/in_cksum.h> 90 91 #include <net/bpf.h> 92 #include <net/ethernet.h> 93 #include <net/if.h> 94 #include <net/if_dl.h> 95 #include <net/if_media.h> 96 #include <net/if_types.h> 97 #include <net/if_var.h> 98 #include <net/rndis.h> 99 #ifdef RSS 100 #include <net/rss_config.h> 101 #endif 102 103 #include <netinet/in_systm.h> 104 #include <netinet/in.h> 105 #include <netinet/ip.h> 106 #include <netinet/ip6.h> 107 #include <netinet/tcp.h> 108 #include <netinet/tcp_lro.h> 109 #include <netinet/udp.h> 110 111 #include <dev/hyperv/include/hyperv.h> 112 #include <dev/hyperv/include/hyperv_busdma.h> 113 #include <dev/hyperv/include/vmbus.h> 114 #include <dev/hyperv/include/vmbus_xact.h> 115 116 #include <dev/hyperv/netvsc/ndis.h> 117 #include <dev/hyperv/netvsc/if_hnreg.h> 118 #include <dev/hyperv/netvsc/if_hnvar.h> 119 #include <dev/hyperv/netvsc/hn_nvs.h> 120 #include <dev/hyperv/netvsc/hn_rndis.h> 121 122 #include "vmbus_if.h" 123 124 #define HN_IFSTART_SUPPORT 125 126 #define HN_RING_CNT_DEF_MAX 8 127 128 #define HN_VFMAP_SIZE_DEF 8 129 130 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 131 132 /* YYY should get it from the underlying channel */ 133 #define HN_TX_DESC_CNT 512 134 135 #define HN_RNDIS_PKT_LEN \ 136 (sizeof(struct rndis_packet_msg) + \ 137 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 138 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 139 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 141 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 142 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 143 144 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 145 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 146 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 147 /* -1 for RNDIS packet message */ 148 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 149 150 #define HN_DIRECT_TX_SIZE_DEF 128 151 152 #define HN_EARLY_TXEOF_THRESH 8 153 154 #define HN_PKTBUF_LEN_DEF (16 * 1024) 155 156 #define HN_LROENT_CNT_DEF 128 157 158 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 159 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 160 /* YYY 2*MTU is a bit rough, but should be good enough. 
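 *
 * E.g. with the standard 1500-byte Ethernet MTU the floor below works out
 * to 2 * 1500 = 3000 bytes, i.e. roughly two full-sized frames per
 * aggregated LRO entry.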
*/ 161 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) 162 163 #define HN_LRO_ACKCNT_DEF 1 164 165 #define HN_LOCK_INIT(sc) \ 166 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 167 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 168 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 169 #define HN_LOCK(sc) \ 170 do { \ 171 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 172 /* Relinquish cpu to avoid deadlock */ \ 173 sched_relinquish(curthread); \ 174 DELAY(1000); \ 175 } \ 176 } while (0) 177 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 178 179 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 180 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 181 #define HN_CSUM_IP_HWASSIST(sc) \ 182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 183 #define HN_CSUM_IP6_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 185 186 #define HN_PKTSIZE_MIN(align) \ 187 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 188 HN_RNDIS_PKT_LEN, (align)) 189 #define HN_PKTSIZE(m, align) \ 190 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 191 192 #ifdef RSS 193 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 194 #else 195 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 196 #endif 197 198 struct hn_txdesc { 199 #ifndef HN_USE_TXDESC_BUFRING 200 SLIST_ENTRY(hn_txdesc) link; 201 #endif 202 STAILQ_ENTRY(hn_txdesc) agg_link; 203 204 /* Aggregated txdescs, in sending order. */ 205 STAILQ_HEAD(, hn_txdesc) agg_list; 206 207 /* The oldest packet, if transmission aggregation happens. */ 208 struct mbuf *m; 209 struct hn_tx_ring *txr; 210 int refs; 211 uint32_t flags; /* HN_TXD_FLAG_ */ 212 struct hn_nvs_sendctx send_ctx; 213 uint32_t chim_index; 214 int chim_size; 215 216 bus_dmamap_t data_dmap; 217 218 bus_addr_t rndis_pkt_paddr; 219 struct rndis_packet_msg *rndis_pkt; 220 bus_dmamap_t rndis_pkt_dmap; 221 }; 222 223 #define HN_TXD_FLAG_ONLIST 0x0001 224 #define HN_TXD_FLAG_DMAMAP 0x0002 225 #define HN_TXD_FLAG_ONAGG 0x0004 226 227 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 228 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 229 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 230 231 struct packet_info_id { 232 uint8_t ver; 233 uint8_t flag; 234 uint16_t pkt_id; 235 }; 236 237 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 238 239 240 struct hn_rxinfo { 241 const uint32_t *vlan_info; 242 const uint32_t *csum_info; 243 const uint32_t *hash_info; 244 const uint32_t *hash_value; 245 const struct packet_info_id *pktinfo_id; 246 }; 247 248 struct hn_rxvf_setarg { 249 struct hn_rx_ring *rxr; 250 if_t vf_ifp; 251 }; 252 253 #define HN_RXINFO_VLAN 0x0001 254 #define HN_RXINFO_CSUM 0x0002 255 #define HN_RXINFO_HASHINF 0x0004 256 #define HN_RXINFO_HASHVAL 0x0008 257 #define HN_RXINFO_PKTINFO_ID 0x0010 258 #define HN_RXINFO_ALL \ 259 (HN_RXINFO_VLAN | \ 260 HN_RXINFO_CSUM | \ 261 HN_RXINFO_HASHINF | \ 262 HN_RXINFO_HASHVAL | \ 263 HN_RXINFO_PKTINFO_ID) 264 265 static int hn_probe(device_t); 266 static int hn_attach(device_t); 267 static int hn_detach(device_t); 268 static int hn_shutdown(device_t); 269 static void hn_chan_callback(struct vmbus_channel *, 270 void *); 271 272 static void hn_init(void *); 273 static int hn_ioctl(if_t, u_long, caddr_t); 274 #ifdef HN_IFSTART_SUPPORT 275 static void hn_start(if_t); 276 #endif 277 static int hn_transmit(if_t, struct mbuf *); 278 static void hn_xmit_qflush(if_t); 279 static int hn_ifmedia_upd(if_t); 280 static void hn_ifmedia_sts(if_t, 281 struct 
ifmediareq *); 282 283 static void hn_ifnet_event(void *, if_t, int); 284 static void hn_ifaddr_event(void *, if_t); 285 static void hn_ifnet_attevent(void *, if_t); 286 static void hn_ifnet_detevent(void *, if_t); 287 static void hn_ifnet_lnkevent(void *, if_t, int); 288 289 static bool hn_ismyvf(const struct hn_softc *, 290 const if_t); 291 static void hn_rxvf_change(struct hn_softc *, 292 if_t, bool); 293 static void hn_rxvf_set(struct hn_softc *, if_t); 294 static void hn_rxvf_set_task(void *, int); 295 static void hn_xpnt_vf_input(if_t, struct mbuf *); 296 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 297 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 298 struct ifreq *); 299 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 300 static bool hn_xpnt_vf_isready(struct hn_softc *); 301 static void hn_xpnt_vf_setready(struct hn_softc *); 302 static void hn_xpnt_vf_init_taskfunc(void *, int); 303 static void hn_xpnt_vf_init(struct hn_softc *); 304 static void hn_xpnt_vf_setenable(struct hn_softc *); 305 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 306 static void hn_vf_rss_fixup(struct hn_softc *, bool); 307 static void hn_vf_rss_restore(struct hn_softc *); 308 309 static int hn_rndis_rxinfo(const void *, int, 310 struct hn_rxinfo *); 311 static void hn_rndis_rx_data(struct hn_rx_ring *, 312 const void *, int); 313 static void hn_rndis_rx_status(struct hn_softc *, 314 const void *, int); 315 static void hn_rndis_init_fixat(struct hn_softc *, int); 316 317 static void hn_nvs_handle_notify(struct hn_softc *, 318 const struct vmbus_chanpkt_hdr *); 319 static void hn_nvs_handle_comp(struct hn_softc *, 320 struct vmbus_channel *, 321 const struct vmbus_chanpkt_hdr *); 322 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 323 struct vmbus_channel *, 324 const struct vmbus_chanpkt_hdr *); 325 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 326 struct vmbus_channel *, uint64_t); 327 328 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 329 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 330 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 331 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 340 #ifndef RSS 341 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 343 #endif 344 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 345 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 346 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 357 static int 
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 358 359 static void hn_stop(struct hn_softc *, bool); 360 static void hn_init_locked(struct hn_softc *); 361 static int hn_chan_attach(struct hn_softc *, 362 struct vmbus_channel *); 363 static void hn_chan_detach(struct hn_softc *, 364 struct vmbus_channel *); 365 static int hn_attach_subchans(struct hn_softc *); 366 static void hn_detach_allchans(struct hn_softc *); 367 static void hn_chan_rollup(struct hn_rx_ring *, 368 struct hn_tx_ring *); 369 static void hn_set_ring_inuse(struct hn_softc *, int); 370 static int hn_synth_attach(struct hn_softc *, int); 371 static void hn_synth_detach(struct hn_softc *); 372 static int hn_synth_alloc_subchans(struct hn_softc *, 373 int *); 374 static bool hn_synth_attachable(const struct hn_softc *); 375 static void hn_suspend(struct hn_softc *); 376 static void hn_suspend_data(struct hn_softc *); 377 static void hn_suspend_mgmt(struct hn_softc *); 378 static void hn_resume(struct hn_softc *); 379 static void hn_resume_data(struct hn_softc *); 380 static void hn_resume_mgmt(struct hn_softc *); 381 static void hn_suspend_mgmt_taskfunc(void *, int); 382 static void hn_chan_drain(struct hn_softc *, 383 struct vmbus_channel *); 384 static void hn_disable_rx(struct hn_softc *); 385 static void hn_drain_rxtx(struct hn_softc *, int); 386 static void hn_polling(struct hn_softc *, u_int); 387 static void hn_chan_polling(struct vmbus_channel *, u_int); 388 static void hn_mtu_change_fixup(struct hn_softc *); 389 390 static void hn_update_link_status(struct hn_softc *); 391 static void hn_change_network(struct hn_softc *); 392 static void hn_link_taskfunc(void *, int); 393 static void hn_netchg_init_taskfunc(void *, int); 394 static void hn_netchg_status_taskfunc(void *, int); 395 static void hn_link_status(struct hn_softc *); 396 397 static int hn_create_rx_data(struct hn_softc *, int); 398 static void hn_destroy_rx_data(struct hn_softc *); 399 static int hn_check_iplen(const struct mbuf *, int); 400 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 401 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 402 static int hn_rxfilter_config(struct hn_softc *); 403 static int hn_rss_reconfig(struct hn_softc *); 404 static void hn_rss_ind_fixup(struct hn_softc *); 405 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 406 static int hn_rxpkt(struct hn_rx_ring *); 407 static uint32_t hn_rss_type_fromndis(uint32_t); 408 static uint32_t hn_rss_type_tondis(uint32_t); 409 410 static int hn_tx_ring_create(struct hn_softc *, int); 411 static void hn_tx_ring_destroy(struct hn_tx_ring *); 412 static int hn_create_tx_data(struct hn_softc *, int); 413 static void hn_fixup_tx_data(struct hn_softc *); 414 static void hn_fixup_rx_data(struct hn_softc *); 415 static void hn_destroy_tx_data(struct hn_softc *); 416 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 417 static void hn_txdesc_gc(struct hn_tx_ring *, 418 struct hn_txdesc *); 419 static int hn_encap(if_t, struct hn_tx_ring *, 420 struct hn_txdesc *, struct mbuf **); 421 static int hn_txpkt(if_t, struct hn_tx_ring *, 422 struct hn_txdesc *); 423 static void hn_set_chim_size(struct hn_softc *, int); 424 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 425 static bool hn_tx_ring_pending(struct hn_tx_ring *); 426 static void hn_tx_ring_qflush(struct hn_tx_ring *); 427 static void hn_resume_tx(struct hn_softc *, int); 428 static void hn_set_txagg(struct hn_softc *); 429 static void *hn_try_txagg(if_t, 430 struct hn_tx_ring *, struct 
hn_txdesc *, 431 int); 432 static int hn_get_txswq_depth(const struct hn_tx_ring *); 433 static void hn_txpkt_done(struct hn_nvs_sendctx *, 434 struct hn_softc *, struct vmbus_channel *, 435 const void *, int); 436 static int hn_txpkt_sglist(struct hn_tx_ring *, 437 struct hn_txdesc *); 438 static int hn_txpkt_chim(struct hn_tx_ring *, 439 struct hn_txdesc *); 440 static int hn_xmit(struct hn_tx_ring *, int); 441 static void hn_xmit_taskfunc(void *, int); 442 static void hn_xmit_txeof(struct hn_tx_ring *); 443 static void hn_xmit_txeof_taskfunc(void *, int); 444 #ifdef HN_IFSTART_SUPPORT 445 static int hn_start_locked(struct hn_tx_ring *, int); 446 static void hn_start_taskfunc(void *, int); 447 static void hn_start_txeof(struct hn_tx_ring *); 448 static void hn_start_txeof_taskfunc(void *, int); 449 #endif 450 451 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 452 453 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 454 "Hyper-V network interface"); 455 456 /* Trust tcp segment verification on host side. */ 457 static int hn_trust_hosttcp = 1; 458 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 459 &hn_trust_hosttcp, 0, 460 "Trust tcp segment verification on host side, " 461 "when csum info is missing (global setting)"); 462 463 /* Trust udp datagrams verification on host side. */ 464 static int hn_trust_hostudp = 1; 465 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 466 &hn_trust_hostudp, 0, 467 "Trust udp datagram verification on host side, " 468 "when csum info is missing (global setting)"); 469 470 /* Trust ip packets verification on host side. */ 471 static int hn_trust_hostip = 1; 472 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 473 &hn_trust_hostip, 0, 474 "Trust ip packet verification on host side, " 475 "when csum info is missing (global setting)"); 476 477 /* 478 * Offload UDP/IPv4 checksum. 479 */ 480 static int hn_enable_udp4cs = 1; 481 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 482 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 483 484 /* 485 * Offload UDP/IPv6 checksum. 486 */ 487 static int hn_enable_udp6cs = 1; 488 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 489 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 490 491 /* Stats. */ 492 static counter_u64_t hn_udpcs_fixup; 493 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 494 &hn_udpcs_fixup, "# of UDP checksum fixup"); 495 496 /* 497 * See hn_set_hlen(). 498 * 499 * This value is for Azure. For Hyper-V, set this above 500 * 65536 to disable UDP datagram checksum fixup. 
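 *
 * Worked example (numbers only, not driver code): with the default of
 * 1420 and a plain 14-byte Ethernet header, hn_set_hlen() falls back to a
 * software UDP checksum for any datagram whose total length exceeds
 * 1420 + 14 = 1434 bytes and whose IP header has IP_DF clear.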
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vflist_sysctl, "A",
    "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
    CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
    hn_vfmap_sysctl, "A",
    "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static if_t *hn_vfmap;

#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;
707 708 --idx; /* ffsl is 1-based */ 709 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 710 ("invalid i %d and idx %d", i, idx)); 711 712 if (atomic_testandset_long(&bmap[i], idx)) 713 continue; 714 715 ret = i * LONG_BIT + idx; 716 break; 717 } 718 return (ret); 719 } 720 721 static __inline void 722 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 723 { 724 u_long mask; 725 uint32_t idx; 726 727 idx = chim_idx / LONG_BIT; 728 KASSERT(idx < sc->hn_chim_bmap_cnt, 729 ("invalid chimney index 0x%x", chim_idx)); 730 731 mask = 1UL << (chim_idx % LONG_BIT); 732 KASSERT(sc->hn_chim_bmap[idx] & mask, 733 ("index bitmap 0x%lx, chimney index %u, " 734 "bitmap idx %d, bitmask 0x%lx", 735 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 736 737 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 738 } 739 740 #if defined(INET6) || defined(INET) 741 742 #define PULLUP_HDR(m, len) \ 743 do { \ 744 if (__predict_false((m)->m_len < (len))) { \ 745 (m) = m_pullup((m), (len)); \ 746 if ((m) == NULL) \ 747 return (NULL); \ 748 } \ 749 } while (0) 750 751 /* 752 * NOTE: If this function failed, the m_head would be freed. 753 */ 754 static __inline struct mbuf * 755 hn_tso_fixup(struct mbuf *m_head) 756 { 757 struct ether_vlan_header *evl; 758 struct tcphdr *th; 759 int ehlen; 760 761 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 762 763 PULLUP_HDR(m_head, sizeof(*evl)); 764 evl = mtod(m_head, struct ether_vlan_header *); 765 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 766 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 767 else 768 ehlen = ETHER_HDR_LEN; 769 m_head->m_pkthdr.l2hlen = ehlen; 770 771 #ifdef INET 772 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 773 struct ip *ip; 774 int iphlen; 775 776 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 777 ip = mtodo(m_head, ehlen); 778 iphlen = ip->ip_hl << 2; 779 m_head->m_pkthdr.l3hlen = iphlen; 780 781 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 782 th = mtodo(m_head, ehlen + iphlen); 783 784 ip->ip_len = 0; 785 ip->ip_sum = 0; 786 th->th_sum = in_pseudo(ip->ip_src.s_addr, 787 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 788 } 789 #endif 790 #if defined(INET6) && defined(INET) 791 else 792 #endif 793 #ifdef INET6 794 { 795 struct ip6_hdr *ip6; 796 797 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 798 ip6 = mtodo(m_head, ehlen); 799 if (ip6->ip6_nxt != IPPROTO_TCP) { 800 m_freem(m_head); 801 return (NULL); 802 } 803 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 804 805 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 806 th = mtodo(m_head, ehlen + sizeof(*ip6)); 807 808 ip6->ip6_plen = 0; 809 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 810 } 811 #endif 812 return (m_head); 813 } 814 815 /* 816 * NOTE: If this function failed, the m_head would be freed. 
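 *
 * A typical caller therefore looks roughly like this (illustrative sketch
 * only, not verbatim driver code):
 *
 *	m_head = hn_set_hlen(m_head);
 *	if (m_head == NULL)
 *		return (ENOBUFS);	/. mbuf chain is already freed ./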
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, the m_head has already been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
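	 *
	 * For reference, a plain non-VF interface that is up with
	 * IFF_BROADCAST set and at least one multicast group joined ends
	 * up with NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST |
	 * NDIS_PACKET_TYPE_ALL_MULTICAST.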
935 */ 936 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 937 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 938 } else { 939 filter = NDIS_PACKET_TYPE_DIRECTED; 940 if (if_getflags(ifp) & IFF_BROADCAST) 941 filter |= NDIS_PACKET_TYPE_BROADCAST; 942 /* TODO: support multicast list */ 943 if ((if_getflags(ifp) & IFF_ALLMULTI) || 944 !if_maddr_empty(ifp)) 945 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 946 } 947 return (hn_set_rxfilter(sc, filter)); 948 } 949 950 static void 951 hn_set_txagg(struct hn_softc *sc) 952 { 953 uint32_t size, pkts; 954 int i; 955 956 /* 957 * Setup aggregation size. 958 */ 959 if (sc->hn_agg_size < 0) 960 size = UINT32_MAX; 961 else 962 size = sc->hn_agg_size; 963 964 if (sc->hn_rndis_agg_size < size) 965 size = sc->hn_rndis_agg_size; 966 967 /* NOTE: We only aggregate packets using chimney sending buffers. */ 968 if (size > (uint32_t)sc->hn_chim_szmax) 969 size = sc->hn_chim_szmax; 970 971 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 972 /* Disable */ 973 size = 0; 974 pkts = 0; 975 goto done; 976 } 977 978 /* NOTE: Type of the per TX ring setting is 'int'. */ 979 if (size > INT_MAX) 980 size = INT_MAX; 981 982 /* 983 * Setup aggregation packet count. 984 */ 985 if (sc->hn_agg_pkts < 0) 986 pkts = UINT32_MAX; 987 else 988 pkts = sc->hn_agg_pkts; 989 990 if (sc->hn_rndis_agg_pkts < pkts) 991 pkts = sc->hn_rndis_agg_pkts; 992 993 if (pkts <= 1) { 994 /* Disable */ 995 size = 0; 996 pkts = 0; 997 goto done; 998 } 999 1000 /* NOTE: Type of the per TX ring setting is 'short'. */ 1001 if (pkts > SHRT_MAX) 1002 pkts = SHRT_MAX; 1003 1004 done: 1005 /* NOTE: Type of the per TX ring setting is 'short'. */ 1006 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1007 /* Disable */ 1008 size = 0; 1009 pkts = 0; 1010 } 1011 1012 if (bootverbose) { 1013 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1014 size, pkts, sc->hn_rndis_agg_align); 1015 } 1016 1017 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1018 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1019 1020 mtx_lock(&txr->hn_tx_lock); 1021 txr->hn_agg_szmax = size; 1022 txr->hn_agg_pktmax = pkts; 1023 txr->hn_agg_align = sc->hn_rndis_agg_align; 1024 mtx_unlock(&txr->hn_tx_lock); 1025 } 1026 } 1027 1028 static int 1029 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1030 { 1031 1032 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1033 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1034 return txr->hn_txdesc_cnt; 1035 return hn_tx_swq_depth; 1036 } 1037 1038 static int 1039 hn_rss_reconfig(struct hn_softc *sc) 1040 { 1041 int error; 1042 1043 HN_LOCK_ASSERT(sc); 1044 1045 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1046 return (ENXIO); 1047 1048 /* 1049 * Disable RSS first. 1050 * 1051 * NOTE: 1052 * Direct reconfiguration by setting the UNCHG flags does 1053 * _not_ work properly. 1054 */ 1055 if (bootverbose) 1056 if_printf(sc->hn_ifp, "disable RSS\n"); 1057 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1058 if (error) { 1059 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1060 return (error); 1061 } 1062 1063 /* 1064 * Reenable the RSS w/ the updated RSS key or indirect 1065 * table. 
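	 *
	 * A typical caller, e.g. the RSS key sysctl handler, roughly does
	 * the following under HN_LOCK() (illustrative sketch):
	 *
	 *	memcpy(sc->hn_rss.rss_key, key, sizeof(sc->hn_rss.rss_key));
	 *	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	 *	error = hn_rss_reconfig(sc);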
1066 */ 1067 if (bootverbose) 1068 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1069 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1070 if (error) { 1071 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1072 return (error); 1073 } 1074 return (0); 1075 } 1076 1077 static void 1078 hn_rss_ind_fixup(struct hn_softc *sc) 1079 { 1080 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1081 int i, nchan; 1082 1083 nchan = sc->hn_rx_ring_inuse; 1084 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1085 1086 /* 1087 * Check indirect table to make sure that all channels in it 1088 * can be used. 1089 */ 1090 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1091 if (rss->rss_ind[i] >= nchan) { 1092 if_printf(sc->hn_ifp, 1093 "RSS indirect table %d fixup: %u -> %d\n", 1094 i, rss->rss_ind[i], nchan - 1); 1095 rss->rss_ind[i] = nchan - 1; 1096 } 1097 } 1098 } 1099 1100 static int 1101 hn_ifmedia_upd(if_t ifp __unused) 1102 { 1103 1104 return EOPNOTSUPP; 1105 } 1106 1107 static void 1108 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) 1109 { 1110 struct hn_softc *sc = if_getsoftc(ifp); 1111 1112 ifmr->ifm_status = IFM_AVALID; 1113 ifmr->ifm_active = IFM_ETHER; 1114 1115 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1116 ifmr->ifm_active |= IFM_NONE; 1117 return; 1118 } 1119 ifmr->ifm_status |= IFM_ACTIVE; 1120 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1121 } 1122 1123 static void 1124 hn_rxvf_set_task(void *xarg, int pending __unused) 1125 { 1126 struct hn_rxvf_setarg *arg = xarg; 1127 1128 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1129 } 1130 1131 static void 1132 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) 1133 { 1134 struct hn_rx_ring *rxr; 1135 struct hn_rxvf_setarg arg; 1136 struct task task; 1137 int i; 1138 1139 HN_LOCK_ASSERT(sc); 1140 1141 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1142 1143 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1144 rxr = &sc->hn_rx_ring[i]; 1145 1146 if (i < sc->hn_rx_ring_inuse) { 1147 arg.rxr = rxr; 1148 arg.vf_ifp = vf_ifp; 1149 vmbus_chan_run_task(rxr->hn_chan, &task); 1150 } else { 1151 rxr->hn_rxvf_ifp = vf_ifp; 1152 } 1153 } 1154 } 1155 1156 static bool 1157 hn_ismyvf(const struct hn_softc *sc, const if_t ifp) 1158 { 1159 if_t hn_ifp; 1160 1161 hn_ifp = sc->hn_ifp; 1162 1163 if (ifp == hn_ifp) 1164 return (false); 1165 1166 if (if_getalloctype(ifp) != IFT_ETHER) 1167 return (false); 1168 1169 /* Ignore lagg/vlan interfaces */ 1170 if (strcmp(if_getdname(ifp), "lagg") == 0 || 1171 strcmp(if_getdname(ifp), "vlan") == 0) 1172 return (false); 1173 1174 /* 1175 * During detach events if_getifaddr(ifp) might be NULL. 
1176 * Make sure the bcmp() below doesn't panic on that: 1177 */ 1178 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) 1179 return (false); 1180 1181 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) 1182 return (false); 1183 1184 return (true); 1185 } 1186 1187 static void 1188 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) 1189 { 1190 if_t hn_ifp; 1191 1192 HN_LOCK(sc); 1193 1194 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1195 goto out; 1196 1197 if (!hn_ismyvf(sc, ifp)) 1198 goto out; 1199 hn_ifp = sc->hn_ifp; 1200 1201 if (rxvf) { 1202 if (sc->hn_flags & HN_FLAG_RXVF) 1203 goto out; 1204 1205 sc->hn_flags |= HN_FLAG_RXVF; 1206 hn_rxfilter_config(sc); 1207 } else { 1208 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1209 goto out; 1210 1211 sc->hn_flags &= ~HN_FLAG_RXVF; 1212 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) 1213 hn_rxfilter_config(sc); 1214 else 1215 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1216 } 1217 1218 hn_nvs_set_datapath(sc, 1219 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1220 1221 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1222 1223 if (rxvf) { 1224 hn_vf_rss_fixup(sc, true); 1225 hn_suspend_mgmt(sc); 1226 sc->hn_link_flags &= 1227 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1228 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1229 } else { 1230 hn_vf_rss_restore(sc); 1231 hn_resume_mgmt(sc); 1232 } 1233 1234 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1235 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1236 1237 if (bootverbose) { 1238 if_printf(hn_ifp, "datapath is switched %s %s\n", 1239 rxvf ? "to" : "from", if_name(ifp)); 1240 } 1241 out: 1242 HN_UNLOCK(sc); 1243 } 1244 1245 static void 1246 hn_ifnet_event(void *arg, if_t ifp, int event) 1247 { 1248 1249 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1250 return; 1251 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1252 } 1253 1254 static void 1255 hn_ifaddr_event(void *arg, if_t ifp) 1256 { 1257 1258 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); 1259 } 1260 1261 static int 1262 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1263 { 1264 if_t ifp, vf_ifp; 1265 uint64_t tmp; 1266 int error; 1267 1268 HN_LOCK_ASSERT(sc); 1269 ifp = sc->hn_ifp; 1270 vf_ifp = sc->hn_vf_ifp; 1271 1272 /* 1273 * Fix up requested capabilities w/ supported capabilities, 1274 * since the supported capabilities could have been changed. 1275 */ 1276 ifr->ifr_reqcap &= if_getcapabilities(ifp); 1277 /* Pass SIOCSIFCAP to VF. */ 1278 error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread); 1279 1280 /* 1281 * NOTE: 1282 * The error will be propagated to the callers, however, it 1283 * is _not_ useful here. 1284 */ 1285 1286 /* 1287 * Merge VF's enabled capabilities. 
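	 *
	 * Example: if IFCAP_TXCSUM stays enabled on hn(4) after the merge,
	 * the VF's IPv4 checksum bits (masked by HN_CSUM_IP_HWASSIST())
	 * are turned on in hn(4)'s if_hwassist below; otherwise they are
	 * cleared.  The same pattern repeats for IPv6 checksum and TSO.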
	 */
	if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp));

	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc);
	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc);
	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO;
	if (if_getcapenable(ifp) & IFCAP_TSO4)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO;
	if (if_getcapenable(ifp) & IFCAP_TSO6)
		if_sethwassistbits(ifp, tmp, 0);
	else
		if_sethwassistbits(ifp, 0, tmp);

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	if_t vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	if_t ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!if_maddr_empty(ifp))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
}

static void
hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	if_t hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (if_getindex(vf_ifp) < hn_vfmap_size)
		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
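		 *
		 * Note that m_freem() only releases a single packet (its
		 * m_next chain), so the loop below walks m_nextpkt to drop
		 * every queued packet.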
1402 */ 1403 while (m != NULL) { 1404 mn = m->m_nextpkt; 1405 m->m_nextpkt = NULL; 1406 m_freem(m); 1407 m = mn; 1408 } 1409 } 1410 } 1411 1412 static void 1413 hn_mtu_change_fixup(struct hn_softc *sc) 1414 { 1415 if_t ifp; 1416 1417 HN_LOCK_ASSERT(sc); 1418 ifp = sc->hn_ifp; 1419 1420 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); 1421 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1422 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1423 } 1424 1425 static uint32_t 1426 hn_rss_type_fromndis(uint32_t rss_hash) 1427 { 1428 uint32_t types = 0; 1429 1430 if (rss_hash & NDIS_HASH_IPV4) 1431 types |= RSS_TYPE_IPV4; 1432 if (rss_hash & NDIS_HASH_TCP_IPV4) 1433 types |= RSS_TYPE_TCP_IPV4; 1434 if (rss_hash & NDIS_HASH_IPV6) 1435 types |= RSS_TYPE_IPV6; 1436 if (rss_hash & NDIS_HASH_IPV6_EX) 1437 types |= RSS_TYPE_IPV6_EX; 1438 if (rss_hash & NDIS_HASH_TCP_IPV6) 1439 types |= RSS_TYPE_TCP_IPV6; 1440 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1441 types |= RSS_TYPE_TCP_IPV6_EX; 1442 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1443 types |= RSS_TYPE_UDP_IPV4; 1444 return (types); 1445 } 1446 1447 static uint32_t 1448 hn_rss_type_tondis(uint32_t types) 1449 { 1450 uint32_t rss_hash = 0; 1451 1452 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1453 ("UDP6 and UDP6EX are not supported")); 1454 1455 if (types & RSS_TYPE_IPV4) 1456 rss_hash |= NDIS_HASH_IPV4; 1457 if (types & RSS_TYPE_TCP_IPV4) 1458 rss_hash |= NDIS_HASH_TCP_IPV4; 1459 if (types & RSS_TYPE_IPV6) 1460 rss_hash |= NDIS_HASH_IPV6; 1461 if (types & RSS_TYPE_IPV6_EX) 1462 rss_hash |= NDIS_HASH_IPV6_EX; 1463 if (types & RSS_TYPE_TCP_IPV6) 1464 rss_hash |= NDIS_HASH_TCP_IPV6; 1465 if (types & RSS_TYPE_TCP_IPV6_EX) 1466 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1467 if (types & RSS_TYPE_UDP_IPV4) 1468 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1469 return (rss_hash); 1470 } 1471 1472 static void 1473 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1474 { 1475 int i; 1476 1477 HN_LOCK_ASSERT(sc); 1478 1479 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1480 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1481 } 1482 1483 static void 1484 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1485 { 1486 if_t ifp, vf_ifp; 1487 struct ifrsshash ifrh; 1488 struct ifrsskey ifrk; 1489 int error; 1490 uint32_t my_types, diff_types, mbuf_types = 0; 1491 1492 HN_LOCK_ASSERT(sc); 1493 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1494 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1495 1496 if (sc->hn_rx_ring_inuse == 1) { 1497 /* No RSS on synthetic parts; done. */ 1498 return; 1499 } 1500 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1501 /* Synthetic parts do not support Toeplitz; done. */ 1502 return; 1503 } 1504 1505 ifp = sc->hn_ifp; 1506 vf_ifp = sc->hn_vf_ifp; 1507 1508 /* 1509 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1510 * supported. 
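	 *
	 * NDIS_HASH_KEYSIZE_TOEPLITZ is 40 bytes, the same size as
	 * hn_rss_key_default[] above, so the VF's key can later replace
	 * the synthetic device's key verbatim.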
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    if_name(vf_ifp), ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    if_name(vf_ifp), error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    if_name(vf_ifp), ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", if_name(vf_ifp),
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1603 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1604 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1605 } 1606 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1607 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1608 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1609 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1610 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1611 } 1612 if ((my_types & RSS_TYPE_UDP_IPV6) && 1613 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1614 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1615 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1616 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1617 } 1618 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1619 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1620 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1621 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1622 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1623 } 1624 1625 /* 1626 * Indirect table does not matter. 1627 */ 1628 1629 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1630 hn_rss_type_tondis(my_types); 1631 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1632 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1633 1634 if (reconf) { 1635 error = hn_rss_reconfig(sc); 1636 if (error) { 1637 /* XXX roll-back? */ 1638 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1639 /* XXX keep going. */ 1640 } 1641 } 1642 done: 1643 /* Hash deliverability for mbufs. */ 1644 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1645 } 1646 1647 static void 1648 hn_vf_rss_restore(struct hn_softc *sc) 1649 { 1650 1651 HN_LOCK_ASSERT(sc); 1652 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1653 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1654 1655 if (sc->hn_rx_ring_inuse == 1) 1656 goto done; 1657 1658 /* 1659 * Restore hash types. Key does _not_ matter. 1660 */ 1661 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1662 int error; 1663 1664 sc->hn_rss_hash = sc->hn_rss_hcap; 1665 error = hn_rss_reconfig(sc); 1666 if (error) { 1667 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1668 error); 1669 /* XXX keep going. */ 1670 } 1671 } 1672 done: 1673 /* Hash deliverability for mbufs. */ 1674 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1675 } 1676 1677 static void 1678 hn_xpnt_vf_setready(struct hn_softc *sc) 1679 { 1680 if_t ifp, vf_ifp; 1681 struct ifreq ifr; 1682 1683 HN_LOCK_ASSERT(sc); 1684 ifp = sc->hn_ifp; 1685 vf_ifp = sc->hn_vf_ifp; 1686 1687 /* 1688 * Mark the VF ready. 1689 */ 1690 sc->hn_vf_rdytick = 0; 1691 1692 /* 1693 * Save information for restoration. 1694 */ 1695 sc->hn_saved_caps = if_getcapabilities(ifp); 1696 sc->hn_saved_tsomax = if_gethwtsomax(ifp); 1697 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); 1698 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); 1699 1700 /* 1701 * Intersect supported/enabled capabilities. 1702 * 1703 * NOTE: 1704 * if_hwassist is not changed here. 1705 */ 1706 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); 1707 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); 1708 1709 /* 1710 * Fix TSO settings. 
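	 *
	 * The three checks below amount to clamping each limit to the
	 * minimum of the synthetic device's and the VF's value, e.g.
	 * (equivalent sketch):
	 *
	 *	if_sethwtsomax(ifp,
	 *	    min(if_gethwtsomax(ifp), if_gethwtsomax(vf_ifp)));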
1711 */ 1712 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) 1713 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); 1714 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) 1715 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); 1716 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) 1717 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); 1718 1719 /* 1720 * Change VF's enabled capabilities. 1721 */ 1722 memset(&ifr, 0, sizeof(ifr)); 1723 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1724 ifr.ifr_reqcap = if_getcapenable(ifp); 1725 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1726 1727 if (if_getmtu(ifp) != ETHERMTU) { 1728 int error; 1729 1730 /* 1731 * Change VF's MTU. 1732 */ 1733 memset(&ifr, 0, sizeof(ifr)); 1734 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1735 ifr.ifr_mtu = if_getmtu(ifp); 1736 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); 1737 if (error) { 1738 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1739 if_name(vf_ifp), if_getmtu(ifp)); 1740 if (if_getmtu(ifp) > ETHERMTU) { 1741 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1742 1743 /* 1744 * XXX 1745 * No need to adjust the synthetic parts' MTU; 1746 * failure of the adjustment will cause us 1747 * infinite headache. 1748 */ 1749 if_setmtu(ifp, ETHERMTU); 1750 hn_mtu_change_fixup(sc); 1751 } 1752 } 1753 } 1754 } 1755 1756 static bool 1757 hn_xpnt_vf_isready(struct hn_softc *sc) 1758 { 1759 1760 HN_LOCK_ASSERT(sc); 1761 1762 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1763 return (false); 1764 1765 if (sc->hn_vf_rdytick == 0) 1766 return (true); 1767 1768 if (sc->hn_vf_rdytick > ticks) 1769 return (false); 1770 1771 /* Mark VF as ready. */ 1772 hn_xpnt_vf_setready(sc); 1773 return (true); 1774 } 1775 1776 static void 1777 hn_xpnt_vf_setenable(struct hn_softc *sc) 1778 { 1779 int i; 1780 1781 HN_LOCK_ASSERT(sc); 1782 1783 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1784 rm_wlock(&sc->hn_vf_lock); 1785 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1786 rm_wunlock(&sc->hn_vf_lock); 1787 1788 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1789 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1790 } 1791 1792 static void 1793 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1794 { 1795 int i; 1796 1797 HN_LOCK_ASSERT(sc); 1798 1799 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1800 rm_wlock(&sc->hn_vf_lock); 1801 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1802 if (clear_vf) 1803 sc->hn_vf_ifp = NULL; 1804 rm_wunlock(&sc->hn_vf_lock); 1805 1806 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1807 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1808 } 1809 1810 static void 1811 hn_xpnt_vf_init(struct hn_softc *sc) 1812 { 1813 int error; 1814 1815 HN_LOCK_ASSERT(sc); 1816 1817 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1818 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1819 1820 if (bootverbose) { 1821 if_printf(sc->hn_ifp, "try bringing up %s\n", 1822 if_name(sc->hn_vf_ifp)); 1823 } 1824 1825 /* 1826 * Bring the VF up. 1827 */ 1828 hn_xpnt_vf_saveifflags(sc); 1829 if_setflagbits(sc->hn_ifp, IFF_UP, 0); 1830 error = hn_xpnt_vf_iocsetflags(sc); 1831 if (error) { 1832 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1833 if_name(sc->hn_vf_ifp), error); 1834 return; 1835 } 1836 1837 /* 1838 * NOTE: 1839 * Datapath setting must happen _after_ bringing the VF up. 
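	 *
	 * The resulting bring-up order is: copy if_flags to the VF,
	 * SIOCSIFFLAGS (IFF_UP), switch the NVS datapath to the VF, fix up
	 * RSS from the VF's key/types, and only then mark the transparent
	 * VF enabled for hn_transmit()/hn_xpnt_vf_input().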
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}

static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    if_name(sc->hn_vf_ifp));
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_attevent(void *xsc, if_t ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    if_name(sc->hn_vf_ifp));
		goto done;
	}

	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));

		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (if_getindex(ifp) >= hn_vfmap_size) {
		if_t *newmap;
		int newsize;

		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(if_t) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = if_getinputfn(ifp);
		if_setinputfn(ifp, hn_xpnt_vf_input);

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
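		 *
		 * Worked example with the defaults: hn_xpnt_vf_attwait is
		 * 2 seconds (HN_XPNT_VF_ATTWAIT_MIN), so with the usual
		 * hz of 1000 the VF is treated as ready once 2000 ticks
		 * have elapsed, either via the timeout task enqueued below
		 * or via hn_xpnt_vf_isready() noticing that hn_vf_rdytick
		 * has passed.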
1964 */ 1965 wait_ticks = hn_xpnt_vf_attwait * hz; 1966 sc->hn_vf_rdytick = ticks + wait_ticks; 1967 1968 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1969 wait_ticks); 1970 } 1971 done: 1972 HN_UNLOCK(sc); 1973 } 1974 1975 static void 1976 hn_ifnet_detevent(void *xsc, if_t ifp) 1977 { 1978 struct hn_softc *sc = xsc; 1979 1980 HN_LOCK(sc); 1981 1982 if (sc->hn_vf_ifp == NULL) 1983 goto done; 1984 1985 if (!hn_ismyvf(sc, ifp)) 1986 goto done; 1987 1988 if (hn_xpnt_vf) { 1989 /* 1990 * Make sure that the delayed initialization is not running. 1991 * 1992 * NOTE: 1993 * - This lock _must_ be released, since the hn_vf_init task 1994 * will try holding this lock. 1995 * - It is safe to release this lock here, since the 1996 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 1997 * 1998 * XXX racy, if hn(4) ever detached. 1999 */ 2000 HN_UNLOCK(sc); 2001 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 2002 HN_LOCK(sc); 2003 2004 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 2005 if_name(sc->hn_ifp))); 2006 if_setinputfn(ifp, sc->hn_vf_input); 2007 sc->hn_vf_input = NULL; 2008 2009 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 2010 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 2011 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 2012 2013 if (sc->hn_vf_rdytick == 0) { 2014 /* 2015 * The VF was ready; restore some settings. 2016 */ 2017 if_setcapabilities(ifp, sc->hn_saved_caps); 2018 /* 2019 * NOTE: 2020 * There is _no_ need to fixup if_capenable and 2021 * if_hwassist, since the if_capabilities before 2022 * restoration was an intersection of the VF's 2023 * if_capabilites and the synthetic device's 2024 * if_capabilites. 2025 */ 2026 if_sethwtsomax(ifp, sc->hn_saved_tsomax); 2027 if_sethwtsomaxsegcount(sc->hn_ifp, 2028 sc->hn_saved_tsosegcnt); 2029 if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz); 2030 } 2031 2032 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2033 /* 2034 * Restore RSS settings. 2035 */ 2036 hn_vf_rss_restore(sc); 2037 2038 /* 2039 * Resume link status management, which was suspended 2040 * by hn_ifnet_attevent(). 2041 */ 2042 hn_resume_mgmt(sc); 2043 } 2044 } 2045 2046 /* Mark transparent mode VF as disabled. 
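 * Passing clear_vf == true also drops hn_vf_ifp under hn_vf_lock
 * and strips HN_RX_FLAG_XPNT_VF from every RX ring, so the
 * transmit/qflush and RX paths stop treating the departing ifnet
 * as the active VF.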
*/ 2047 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2048 2049 rm_wlock(&hn_vfmap_lock); 2050 2051 KASSERT(if_getindex(ifp) < hn_vfmap_size, 2052 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); 2053 if (hn_vfmap[if_getindex(ifp)] != NULL) { 2054 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, 2055 ("%s: ifindex %d was mapped to %s", 2056 if_name(ifp), if_getindex(ifp), 2057 if_name(hn_vfmap[if_getindex(ifp)]))); 2058 hn_vfmap[if_getindex(ifp)] = NULL; 2059 } 2060 2061 rm_wunlock(&hn_vfmap_lock); 2062 done: 2063 HN_UNLOCK(sc); 2064 } 2065 2066 static void 2067 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) 2068 { 2069 struct hn_softc *sc = xsc; 2070 2071 if (sc->hn_vf_ifp == ifp) 2072 if_link_state_change(sc->hn_ifp, link_state); 2073 } 2074 2075 static int 2076 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) 2077 { 2078 struct hn_softc *sc = arg1; 2079 unsigned int tsomax; 2080 int error; 2081 2082 tsomax = if_gethwtsomax(sc->hn_ifp); 2083 error = sysctl_handle_int(oidp, &tsomax, 0, req); 2084 return error; 2085 } 2086 2087 static int 2088 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) 2089 { 2090 struct hn_softc *sc = arg1; 2091 unsigned int tsomaxsegcnt; 2092 int error; 2093 2094 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); 2095 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); 2096 return error; 2097 } 2098 2099 static int 2100 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) 2101 { 2102 struct hn_softc *sc = arg1; 2103 unsigned int tsomaxsegsz; 2104 int error; 2105 2106 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); 2107 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); 2108 return error; 2109 } 2110 2111 static int 2112 hn_probe(device_t dev) 2113 { 2114 2115 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2116 device_set_desc(dev, "Hyper-V Network Interface"); 2117 return BUS_PROBE_DEFAULT; 2118 } 2119 return ENXIO; 2120 } 2121 2122 static int 2123 hn_attach(device_t dev) 2124 { 2125 struct hn_softc *sc = device_get_softc(dev); 2126 struct sysctl_oid_list *child; 2127 struct sysctl_ctx_list *ctx; 2128 uint8_t eaddr[ETHER_ADDR_LEN]; 2129 if_t ifp = NULL; 2130 int error, ring_cnt, tx_ring_cnt; 2131 uint32_t mtu; 2132 2133 sc->hn_dev = dev; 2134 sc->hn_prichan = vmbus_get_channel(dev); 2135 HN_LOCK_INIT(sc); 2136 rm_init(&sc->hn_vf_lock, "hnvf"); 2137 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2138 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2139 2140 /* 2141 * Initialize these tunables once. 2142 */ 2143 sc->hn_agg_size = hn_tx_agg_size; 2144 sc->hn_agg_pkts = hn_tx_agg_pkts; 2145 2146 /* 2147 * Setup taskqueue for transmission. 2148 */ 2149 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2150 int i; 2151 2152 sc->hn_tx_taskqs = 2153 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2154 M_DEVBUF, M_WAITOK); 2155 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2156 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2157 M_WAITOK, taskqueue_thread_enqueue, 2158 &sc->hn_tx_taskqs[i]); 2159 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2160 "%s tx%d", device_get_nameunit(dev), i); 2161 } 2162 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2163 sc->hn_tx_taskqs = hn_tx_taskque; 2164 } 2165 2166 /* 2167 * Setup taskqueue for mangement tasks, e.g. link status. 
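 *
 * hn_mgmt_taskq0 runs the link status task, the network change
 * initiation task and the delayed network change status task; it
 * becomes sc->hn_mgmt_taskq once attach has progressed far enough
 * to kick off the first link status check.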
2168 */ 2169 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2170 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2171 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2172 device_get_nameunit(dev)); 2173 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2174 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2175 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2176 hn_netchg_status_taskfunc, sc); 2177 2178 if (hn_xpnt_vf) { 2179 /* 2180 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2181 */ 2182 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2183 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2184 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2185 device_get_nameunit(dev)); 2186 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2187 hn_xpnt_vf_init_taskfunc, sc); 2188 } 2189 2190 /* 2191 * Allocate ifnet and setup its name earlier, so that if_printf 2192 * can be used by functions, which will be called after 2193 * ether_ifattach(). 2194 */ 2195 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2196 if_setsoftc(ifp, sc); 2197 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2198 2199 /* 2200 * Initialize ifmedia earlier so that it can be unconditionally 2201 * destroyed, if error happened later on. 2202 */ 2203 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2204 2205 /* 2206 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2207 * to use (tx_ring_cnt). 2208 * 2209 * NOTE: 2210 * The # of RX rings to use is same as the # of channels to use. 2211 */ 2212 ring_cnt = hn_chan_cnt; 2213 if (ring_cnt <= 0) { 2214 /* Default */ 2215 ring_cnt = mp_ncpus; 2216 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2217 ring_cnt = HN_RING_CNT_DEF_MAX; 2218 } else if (ring_cnt > mp_ncpus) { 2219 ring_cnt = mp_ncpus; 2220 } 2221 #ifdef RSS 2222 if (ring_cnt > rss_getnumbuckets()) 2223 ring_cnt = rss_getnumbuckets(); 2224 #endif 2225 2226 tx_ring_cnt = hn_tx_ring_cnt; 2227 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2228 tx_ring_cnt = ring_cnt; 2229 #ifdef HN_IFSTART_SUPPORT 2230 if (hn_use_if_start) { 2231 /* ifnet.if_start only needs one TX ring. */ 2232 tx_ring_cnt = 1; 2233 } 2234 #endif 2235 2236 /* 2237 * Set the leader CPU for channels. 2238 */ 2239 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2240 2241 /* 2242 * Create enough TX/RX rings, even if only limited number of 2243 * channels can be allocated. 2244 */ 2245 error = hn_create_tx_data(sc, tx_ring_cnt); 2246 if (error) 2247 goto failed; 2248 error = hn_create_rx_data(sc, ring_cnt); 2249 if (error) 2250 goto failed; 2251 2252 /* 2253 * Create transaction context for NVS and RNDIS transactions. 2254 */ 2255 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2256 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2257 if (sc->hn_xact == NULL) { 2258 error = ENXIO; 2259 goto failed; 2260 } 2261 2262 /* 2263 * Install orphan handler for the revocation of this device's 2264 * primary channel. 2265 * 2266 * NOTE: 2267 * The processing order is critical here: 2268 * Install the orphan handler, _before_ testing whether this 2269 * device's primary channel has been revoked or not. 2270 */ 2271 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2272 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2273 error = ENXIO; 2274 goto failed; 2275 } 2276 2277 /* 2278 * Attach the synthetic parts, i.e. NVS and RNDIS. 
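 *
 * The synthetic parts are attached with the standard ETHERMTU
 * first; the MTU actually reported by RNDIS is queried right
 * below and, if it turns out to be smaller, committed to the
 * ifnet near the end of attach.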
2279 */ 2280 error = hn_synth_attach(sc, ETHERMTU); 2281 if (error) 2282 goto failed; 2283 2284 error = hn_rndis_get_eaddr(sc, eaddr); 2285 if (error) 2286 goto failed; 2287 2288 error = hn_rndis_get_mtu(sc, &mtu); 2289 if (error) 2290 mtu = ETHERMTU; 2291 else if (bootverbose) 2292 device_printf(dev, "RNDIS mtu %u\n", mtu); 2293 2294 if (sc->hn_rx_ring_inuse > 1) { 2295 /* 2296 * Reduce TCP segment aggregation limit for multiple 2297 * RX rings to increase ACK timeliness. 2298 */ 2299 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2300 } 2301 2302 /* 2303 * Fixup TX/RX stuffs after synthetic parts are attached. 2304 */ 2305 hn_fixup_tx_data(sc); 2306 hn_fixup_rx_data(sc); 2307 2308 ctx = device_get_sysctl_ctx(dev); 2309 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2310 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2311 &sc->hn_nvs_ver, 0, "NVS version"); 2312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2313 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2314 hn_ndis_version_sysctl, "A", "NDIS version"); 2315 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2316 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2317 hn_caps_sysctl, "A", "capabilities"); 2318 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2319 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2320 hn_hwassist_sysctl, "A", "hwassist"); 2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", 2322 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, 2323 "IU", "max TSO size"); 2324 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", 2325 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, 2326 "IU", "max # of TSO segments"); 2327 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", 2328 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, 2329 "IU", "max size of TSO segment"); 2330 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2331 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2332 hn_rxfilter_sysctl, "A", "rxfilter"); 2333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2334 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2335 hn_rss_hash_sysctl, "A", "RSS hash"); 2336 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2337 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2338 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2339 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2340 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2341 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2342 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2343 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2344 #ifndef RSS 2345 /* 2346 * Don't allow RSS key/indirect table changes, if RSS is defined. 
2347 */ 2348 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2349 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2350 hn_rss_key_sysctl, "IU", "RSS key"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2352 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2354 #endif 2355 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2356 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2357 "RNDIS offered packet transmission aggregation size limit"); 2358 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2359 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2360 "RNDIS offered packet transmission aggregation count limit"); 2361 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2362 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2363 "RNDIS packet transmission aggregation alignment"); 2364 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2365 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2366 hn_txagg_size_sysctl, "I", 2367 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2368 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2369 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2370 hn_txagg_pkts_sysctl, "I", 2371 "Packet transmission aggregation packets, " 2372 "0 -- disable, -1 -- auto"); 2373 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2374 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2375 hn_polling_sysctl, "I", 2376 "Polling frequency: [100,1000000], 0 disable polling"); 2377 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2378 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2379 hn_vf_sysctl, "A", "Virtual Function's name"); 2380 if (!hn_xpnt_vf) { 2381 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2382 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2383 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2384 } else { 2385 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2386 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2387 hn_xpnt_vf_enabled_sysctl, "I", 2388 "Transparent VF enabled"); 2389 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2390 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2391 hn_xpnt_vf_accbpf_sysctl, "I", 2392 "Accurate BPF for transparent VF"); 2393 } 2394 2395 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2396 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2397 "switch to rsc"); 2398 2399 /* 2400 * Setup the ifmedia, which has been initialized earlier. 2401 */ 2402 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2403 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2404 /* XXX ifmedia_set really should do this for us */ 2405 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2406 2407 /* 2408 * Setup the ifnet for this interface. 2409 */ 2410 2411 if_setbaudrate(ifp, IF_Gbps(10)); 2412 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); 2413 if_setioctlfn(ifp, hn_ioctl); 2414 if_setinitfn(ifp, hn_init); 2415 #ifdef HN_IFSTART_SUPPORT 2416 if (hn_use_if_start) { 2417 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2418 2419 if_setstartfn(ifp, hn_start); 2420 if_setsendqlen(ifp, qdepth); 2421 if_setsendqready(ifp); 2422 } else 2423 #endif 2424 { 2425 if_settransmitfn(ifp, hn_transmit); 2426 if_setqflushfn(ifp, hn_xmit_qflush); 2427 } 2428 2429 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); 2430 #ifdef foo 2431 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2432 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); 2433 #endif 2434 if (sc->hn_caps & HN_CAP_VLAN) { 2435 /* XXX not sure about VLAN_MTU. */ 2436 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); 2437 } 2438 2439 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); 2440 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) 2441 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); 2442 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) 2443 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); 2444 if (sc->hn_caps & HN_CAP_TSO4) { 2445 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); 2446 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 2447 } 2448 if (sc->hn_caps & HN_CAP_TSO6) { 2449 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); 2450 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 2451 } 2452 2453 /* Enable all available capabilities by default. */ 2454 if_setcapenable(ifp, if_getcapabilities(ifp)); 2455 2456 /* 2457 * Disable IPv6 TSO and TXCSUM by default, they still can 2458 * be enabled through SIOCSIFCAP. 2459 */ 2460 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); 2461 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); 2462 2463 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { 2464 /* 2465 * Lock hn_set_tso_maxsize() to simplify its 2466 * internal logic. 2467 */ 2468 HN_LOCK(sc); 2469 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2470 HN_UNLOCK(sc); 2471 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); 2472 if_sethwtsomaxsegsize(ifp, PAGE_SIZE); 2473 } 2474 2475 ether_ifattach(ifp, eaddr); 2476 2477 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2478 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2479 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); 2480 } 2481 if (mtu < ETHERMTU) { 2482 2483 if_setmtu(ifp, mtu); 2484 } 2485 2486 /* Inform the upper layer about the long frame support. */ 2487 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); 2488 2489 /* 2490 * Kick off link status check. 2491 */ 2492 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2493 hn_update_link_status(sc); 2494 2495 if (!hn_xpnt_vf) { 2496 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2497 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2498 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2499 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2500 } else { 2501 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2502 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2503 } 2504 2505 /* 2506 * NOTE: 2507 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2508 * since interface's LLADDR is needed; interface LLADDR is not 2509 * available when ifnet_arrival event is triggered. 2510 */ 2511 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2512 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2513 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2514 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2515 2516 return (0); 2517 failed: 2518 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2519 hn_synth_detach(sc); 2520 hn_detach(dev); 2521 return (error); 2522 } 2523 2524 static int 2525 hn_detach(device_t dev) 2526 { 2527 struct hn_softc *sc = device_get_softc(dev); 2528 if_t ifp = sc->hn_ifp, vf_ifp; 2529 2530 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2531 /* 2532 * In case that the vmbus missed the orphan handler 2533 * installation. 
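 *
 * The orphan handler is normally installed by hn_attach() via
 * vmbus_chan_set_orphan(); if the primary channel was revoked
 * before that handler could take effect, orphan the transaction
 * context manually so that no transaction is left waiting for a
 * reply from the revoked channel.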
2534 */ 2535 vmbus_xact_ctx_orphan(sc->hn_xact); 2536 } 2537 2538 if (sc->hn_ifaddr_evthand != NULL) 2539 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2540 if (sc->hn_ifnet_evthand != NULL) 2541 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2542 if (sc->hn_ifnet_atthand != NULL) { 2543 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2544 sc->hn_ifnet_atthand); 2545 } 2546 if (sc->hn_ifnet_dethand != NULL) { 2547 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2548 sc->hn_ifnet_dethand); 2549 } 2550 if (sc->hn_ifnet_lnkhand != NULL) 2551 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2552 2553 vf_ifp = sc->hn_vf_ifp; 2554 __compiler_membar(); 2555 if (vf_ifp != NULL) 2556 hn_ifnet_detevent(sc, vf_ifp); 2557 2558 if (device_is_attached(dev)) { 2559 HN_LOCK(sc); 2560 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2561 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 2562 hn_stop(sc, true); 2563 /* 2564 * NOTE: 2565 * hn_stop() only suspends data, so managment 2566 * stuffs have to be suspended manually here. 2567 */ 2568 hn_suspend_mgmt(sc); 2569 hn_synth_detach(sc); 2570 } 2571 HN_UNLOCK(sc); 2572 ether_ifdetach(ifp); 2573 } 2574 2575 ifmedia_removeall(&sc->hn_media); 2576 hn_destroy_rx_data(sc); 2577 hn_destroy_tx_data(sc); 2578 2579 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2580 int i; 2581 2582 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2583 taskqueue_free(sc->hn_tx_taskqs[i]); 2584 free(sc->hn_tx_taskqs, M_DEVBUF); 2585 } 2586 taskqueue_free(sc->hn_mgmt_taskq0); 2587 if (sc->hn_vf_taskq != NULL) 2588 taskqueue_free(sc->hn_vf_taskq); 2589 2590 if (sc->hn_xact != NULL) { 2591 /* 2592 * Uninstall the orphan handler _before_ the xact is 2593 * destructed. 2594 */ 2595 vmbus_chan_unset_orphan(sc->hn_prichan); 2596 vmbus_xact_ctx_destroy(sc->hn_xact); 2597 } 2598 2599 if_free(ifp); 2600 2601 HN_LOCK_DESTROY(sc); 2602 rm_destroy(&sc->hn_vf_lock); 2603 return (0); 2604 } 2605 2606 static int 2607 hn_shutdown(device_t dev) 2608 { 2609 2610 return (0); 2611 } 2612 2613 static void 2614 hn_link_status(struct hn_softc *sc) 2615 { 2616 uint32_t link_status; 2617 int error; 2618 2619 error = hn_rndis_get_linkstatus(sc, &link_status); 2620 if (error) { 2621 /* XXX what to do? */ 2622 return; 2623 } 2624 2625 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2626 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2627 else 2628 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2629 if_link_state_change(sc->hn_ifp, 2630 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2631 LINK_STATE_UP : LINK_STATE_DOWN); 2632 } 2633 2634 static void 2635 hn_link_taskfunc(void *xsc, int pending __unused) 2636 { 2637 struct hn_softc *sc = xsc; 2638 2639 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2640 return; 2641 hn_link_status(sc); 2642 } 2643 2644 static void 2645 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2646 { 2647 struct hn_softc *sc = xsc; 2648 2649 /* Prevent any link status checks from running. */ 2650 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2651 2652 /* 2653 * Fake up a [link down --> link up] state change; 5 seconds 2654 * delay is used, which closely simulates miibus reaction 2655 * upon link down event. 
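 *
 * HN_LINK_FLAG_NETCHG was set above, which keeps the regular link
 * status task from interfering; the link is forced down right
 * away and hn_netchg_status_taskfunc(), scheduled 5 seconds from
 * now, clears the flag and re-queries the real link status.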
2656 */ 2657 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2658 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2659 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2660 &sc->hn_netchg_status, 5 * hz); 2661 } 2662 2663 static void 2664 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2665 { 2666 struct hn_softc *sc = xsc; 2667 2668 /* Re-allow link status checks. */ 2669 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2670 hn_link_status(sc); 2671 } 2672 2673 static void 2674 hn_update_link_status(struct hn_softc *sc) 2675 { 2676 2677 if (sc->hn_mgmt_taskq != NULL) 2678 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2679 } 2680 2681 static void 2682 hn_change_network(struct hn_softc *sc) 2683 { 2684 2685 if (sc->hn_mgmt_taskq != NULL) 2686 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2687 } 2688 2689 static __inline int 2690 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2691 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2692 { 2693 struct mbuf *m = *m_head; 2694 int error; 2695 2696 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2697 2698 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2699 m, segs, nsegs, BUS_DMA_NOWAIT); 2700 if (error == EFBIG) { 2701 struct mbuf *m_new; 2702 2703 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2704 if (m_new == NULL) 2705 return ENOBUFS; 2706 else 2707 *m_head = m = m_new; 2708 txr->hn_tx_collapsed++; 2709 2710 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2711 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2712 } 2713 if (!error) { 2714 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2715 BUS_DMASYNC_PREWRITE); 2716 txd->flags |= HN_TXD_FLAG_DMAMAP; 2717 } 2718 return error; 2719 } 2720 2721 static __inline int 2722 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2723 { 2724 2725 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2726 ("put an onlist txd %#x", txd->flags)); 2727 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2728 ("put an onagg txd %#x", txd->flags)); 2729 2730 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2731 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2732 return 0; 2733 2734 if (!STAILQ_EMPTY(&txd->agg_list)) { 2735 struct hn_txdesc *tmp_txd; 2736 2737 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2738 int freed __diagused; 2739 2740 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2741 ("resursive aggregation on aggregated txdesc")); 2742 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2743 ("not aggregated txdesc")); 2744 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2745 ("aggregated txdesc uses dmamap")); 2746 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2747 ("aggregated txdesc consumes " 2748 "chimney sending buffer")); 2749 KASSERT(tmp_txd->chim_size == 0, 2750 ("aggregated txdesc has non-zero " 2751 "chimney sending size")); 2752 2753 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2754 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2755 freed = hn_txdesc_put(txr, tmp_txd); 2756 KASSERT(freed, ("failed to free aggregated txdesc")); 2757 } 2758 } 2759 2760 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2761 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2762 ("chim txd uses dmamap")); 2763 hn_chim_free(txr->hn_sc, txd->chim_index); 2764 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2765 txd->chim_size = 0; 2766 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2767 bus_dmamap_sync(txr->hn_tx_data_dtag, 2768 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2769 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2770 txd->data_dmap); 2771 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2772 } 2773 2774 if (txd->m != NULL) { 2775 m_freem(txd->m); 2776 txd->m = NULL; 2777 } 2778 2779 txd->flags |= HN_TXD_FLAG_ONLIST; 2780 #ifndef HN_USE_TXDESC_BUFRING 2781 mtx_lock_spin(&txr->hn_txlist_spin); 2782 KASSERT(txr->hn_txdesc_avail >= 0 && 2783 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2784 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2785 txr->hn_txdesc_avail++; 2786 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2787 mtx_unlock_spin(&txr->hn_txlist_spin); 2788 #else /* HN_USE_TXDESC_BUFRING */ 2789 #ifdef HN_DEBUG 2790 atomic_add_int(&txr->hn_txdesc_avail, 1); 2791 #endif 2792 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2793 #endif /* !HN_USE_TXDESC_BUFRING */ 2794 2795 return 1; 2796 } 2797 2798 static __inline struct hn_txdesc * 2799 hn_txdesc_get(struct hn_tx_ring *txr) 2800 { 2801 struct hn_txdesc *txd; 2802 2803 #ifndef HN_USE_TXDESC_BUFRING 2804 mtx_lock_spin(&txr->hn_txlist_spin); 2805 txd = SLIST_FIRST(&txr->hn_txlist); 2806 if (txd != NULL) { 2807 KASSERT(txr->hn_txdesc_avail > 0, 2808 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2809 txr->hn_txdesc_avail--; 2810 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2811 } 2812 mtx_unlock_spin(&txr->hn_txlist_spin); 2813 #else 2814 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2815 #endif 2816 2817 if (txd != NULL) { 2818 #ifdef HN_USE_TXDESC_BUFRING 2819 #ifdef HN_DEBUG 2820 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2821 #endif 2822 #endif /* HN_USE_TXDESC_BUFRING */ 2823 KASSERT(txd->m == NULL && txd->refs == 0 && 2824 STAILQ_EMPTY(&txd->agg_list) && 2825 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2826 txd->chim_size == 0 && 2827 (txd->flags & HN_TXD_FLAG_ONLIST) && 2828 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2829 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2830 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2831 txd->refs = 1; 2832 } 2833 return txd; 2834 } 2835 2836 static __inline void 2837 hn_txdesc_hold(struct hn_txdesc *txd) 2838 { 2839 2840 /* 0->1 transition will never work */ 2841 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2842 atomic_add_int(&txd->refs, 1); 2843 } 2844 2845 static __inline void 2846 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2847 { 2848 2849 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2850 ("recursive aggregation on aggregating txdesc")); 2851 2852 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2853 ("already aggregated")); 2854 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2855 ("recursive aggregation on to-be-aggregated txdesc")); 2856 2857 txd->flags |= HN_TXD_FLAG_ONAGG; 2858 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2859 } 2860 2861 static bool 2862 hn_tx_ring_pending(struct hn_tx_ring *txr) 2863 { 2864 bool pending = false; 2865 2866 #ifndef HN_USE_TXDESC_BUFRING 2867 mtx_lock_spin(&txr->hn_txlist_spin); 2868 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2869 pending = true; 2870 mtx_unlock_spin(&txr->hn_txlist_spin); 2871 #else 2872 if (!buf_ring_full(txr->hn_txdesc_br)) 2873 pending = true; 2874 #endif 2875 return (pending); 2876 } 2877 2878 static __inline void 2879 hn_txeof(struct hn_tx_ring *txr) 2880 { 2881 txr->hn_has_txeof = 0; 2882 txr->hn_txeof(txr); 2883 } 2884 2885 static void 2886 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2887 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2888 { 2889 struct hn_txdesc *txd = sndc->hn_cbarg; 2890 struct 
hn_tx_ring *txr; 2891 2892 txr = txd->txr; 2893 KASSERT(txr->hn_chan == chan, 2894 ("channel mismatch, on chan%u, should be chan%u", 2895 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2896 2897 txr->hn_has_txeof = 1; 2898 hn_txdesc_put(txr, txd); 2899 2900 ++txr->hn_txdone_cnt; 2901 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2902 txr->hn_txdone_cnt = 0; 2903 if (txr->hn_oactive) 2904 hn_txeof(txr); 2905 } 2906 } 2907 2908 static void 2909 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2910 { 2911 #if defined(INET) || defined(INET6) 2912 struct epoch_tracker et; 2913 2914 NET_EPOCH_ENTER(et); 2915 tcp_lro_flush_all(&rxr->hn_lro); 2916 NET_EPOCH_EXIT(et); 2917 #endif 2918 2919 /* 2920 * NOTE: 2921 * 'txr' could be NULL, if multiple channels and 2922 * ifnet.if_start method are enabled. 2923 */ 2924 if (txr == NULL || !txr->hn_has_txeof) 2925 return; 2926 2927 txr->hn_txdone_cnt = 0; 2928 hn_txeof(txr); 2929 } 2930 2931 static __inline uint32_t 2932 hn_rndis_pktmsg_offset(uint32_t ofs) 2933 { 2934 2935 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2936 ("invalid RNDIS packet msg offset %u", ofs)); 2937 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2938 } 2939 2940 static __inline void * 2941 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2942 size_t pi_dlen, uint32_t pi_type) 2943 { 2944 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2945 struct rndis_pktinfo *pi; 2946 2947 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2948 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2949 2950 /* 2951 * Per-packet-info does not move; it only grows. 2952 * 2953 * NOTE: 2954 * rm_pktinfooffset in this phase counts from the beginning 2955 * of rndis_packet_msg. 2956 */ 2957 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2958 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2959 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2960 pkt->rm_pktinfolen); 2961 pkt->rm_pktinfolen += pi_size; 2962 2963 pi->rm_size = pi_size; 2964 pi->rm_type = pi_type; 2965 pi->rm_internal = 0; 2966 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2967 2968 return (pi->rm_data); 2969 } 2970 2971 static __inline int 2972 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) 2973 { 2974 struct hn_txdesc *txd; 2975 struct mbuf *m; 2976 int error, pkts; 2977 2978 txd = txr->hn_agg_txd; 2979 KASSERT(txd != NULL, ("no aggregate txdesc")); 2980 2981 /* 2982 * Since hn_txpkt() will reset this temporary stat, save 2983 * it now, so that oerrors can be updated properly, if 2984 * hn_txpkt() ever fails. 2985 */ 2986 pkts = txr->hn_stat_pkts; 2987 2988 /* 2989 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2990 * failure, save it for later freeing, if hn_txpkt() ever 2991 * fails. 2992 */ 2993 m = txd->m; 2994 error = hn_txpkt(ifp, txr, txd); 2995 if (__predict_false(error)) { 2996 /* txd is freed, but m is not. */ 2997 m_freem(m); 2998 2999 txr->hn_flush_failed++; 3000 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 3001 } 3002 3003 /* Reset all aggregation states. 
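 * The aggregating txdesc has been handed to hn_txpkt() above, so
 * clear the per-ring aggregation cursor (hn_agg_txd, hn_agg_szleft,
 * hn_agg_pktleft and hn_agg_prevpkt) and let the next
 * hn_try_txagg() start a fresh batch.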
*/ 3004 txr->hn_agg_txd = NULL; 3005 txr->hn_agg_szleft = 0; 3006 txr->hn_agg_pktleft = 0; 3007 txr->hn_agg_prevpkt = NULL; 3008 3009 return (error); 3010 } 3011 3012 static void * 3013 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3014 int pktsize) 3015 { 3016 void *chim; 3017 3018 if (txr->hn_agg_txd != NULL) { 3019 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 3020 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 3021 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 3022 int olen; 3023 3024 /* 3025 * Update the previous RNDIS packet's total length, 3026 * it can be increased due to the mandatory alignment 3027 * padding for this RNDIS packet. And update the 3028 * aggregating txdesc's chimney sending buffer size 3029 * accordingly. 3030 * 3031 * XXX 3032 * Zero-out the padding, as required by the RNDIS spec. 3033 */ 3034 olen = pkt->rm_len; 3035 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3036 agg_txd->chim_size += pkt->rm_len - olen; 3037 3038 /* Link this txdesc to the parent. */ 3039 hn_txdesc_agg(agg_txd, txd); 3040 3041 chim = (uint8_t *)pkt + pkt->rm_len; 3042 /* Save the current packet for later fixup. */ 3043 txr->hn_agg_prevpkt = chim; 3044 3045 txr->hn_agg_pktleft--; 3046 txr->hn_agg_szleft -= pktsize; 3047 if (txr->hn_agg_szleft <= 3048 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3049 /* 3050 * Probably can't aggregate more packets, 3051 * flush this aggregating txdesc proactively. 3052 */ 3053 txr->hn_agg_pktleft = 0; 3054 } 3055 /* Done! */ 3056 return (chim); 3057 } 3058 hn_flush_txagg(ifp, txr); 3059 } 3060 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3061 3062 txr->hn_tx_chimney_tried++; 3063 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3064 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3065 return (NULL); 3066 txr->hn_tx_chimney++; 3067 3068 chim = txr->hn_sc->hn_chim + 3069 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3070 3071 if (txr->hn_agg_pktmax > 1 && 3072 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3073 txr->hn_agg_txd = txd; 3074 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3075 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3076 txr->hn_agg_prevpkt = chim; 3077 } 3078 return (chim); 3079 } 3080 3081 /* 3082 * NOTE: 3083 * If this function fails, then both txd and m_head0 will be freed. 3084 */ 3085 static int 3086 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3087 struct mbuf **m_head0) 3088 { 3089 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3090 int error, nsegs, i; 3091 struct mbuf *m_head = *m_head0; 3092 struct rndis_packet_msg *pkt; 3093 uint32_t *pi_data; 3094 void *chim = NULL; 3095 int pkt_hlen, pkt_size; 3096 3097 pkt = txd->rndis_pkt; 3098 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3099 if (pkt_size < txr->hn_chim_size) { 3100 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3101 if (chim != NULL) 3102 pkt = chim; 3103 } else { 3104 if (txr->hn_agg_txd != NULL) 3105 hn_flush_txagg(ifp, txr); 3106 } 3107 3108 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3109 pkt->rm_len = m_head->m_pkthdr.len; 3110 pkt->rm_dataoffset = 0; 3111 pkt->rm_datalen = m_head->m_pkthdr.len; 3112 pkt->rm_oobdataoffset = 0; 3113 pkt->rm_oobdatalen = 0; 3114 pkt->rm_oobdataelements = 0; 3115 pkt->rm_pktinfooffset = sizeof(*pkt); 3116 pkt->rm_pktinfolen = 0; 3117 pkt->rm_vchandle = 0; 3118 pkt->rm_reserved = 0; 3119 3120 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3121 /* 3122 * Set the hash value for this packet. 
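 *
 * The value travels as an NDIS per-packet-info element of type
 * HN_NDIS_PKTINFO_TYPE_HASHVAL appended behind the RNDIS packet
 * message header; it is either the flowid the packet arrived with
 * (forwarded traffic) or simply this TX ring's index.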
3123 */ 3124 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3125 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3126 3127 if (M_HASHTYPE_ISHASH(m_head)) 3128 /* 3129 * The flowid field contains the hash value host 3130 * set in the rx queue if it is a ip forwarding pkt. 3131 * Set the same hash value so host can send on the 3132 * cpu it was received. 3133 */ 3134 *pi_data = m_head->m_pkthdr.flowid; 3135 else 3136 /* 3137 * Otherwise just put the tx queue index. 3138 */ 3139 *pi_data = txr->hn_tx_idx; 3140 } 3141 3142 if (m_head->m_flags & M_VLANTAG) { 3143 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3144 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3145 *pi_data = NDIS_VLAN_INFO_MAKE( 3146 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3147 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3148 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3149 } 3150 3151 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3152 #if defined(INET6) || defined(INET) 3153 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3154 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3155 #ifdef INET 3156 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3157 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3158 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3159 m_head->m_pkthdr.tso_segsz); 3160 } 3161 #endif 3162 #if defined(INET6) && defined(INET) 3163 else 3164 #endif 3165 #ifdef INET6 3166 { 3167 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3168 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3169 m_head->m_pkthdr.tso_segsz); 3170 } 3171 #endif 3172 #endif /* INET6 || INET */ 3173 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3174 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3175 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3176 if (m_head->m_pkthdr.csum_flags & 3177 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3178 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3179 } else { 3180 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3181 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3182 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3183 } 3184 3185 if (m_head->m_pkthdr.csum_flags & 3186 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3187 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3188 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3189 } else if (m_head->m_pkthdr.csum_flags & 3190 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3191 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3192 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3193 } 3194 } 3195 3196 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3197 /* Fixup RNDIS packet message total length */ 3198 pkt->rm_len += pkt_hlen; 3199 /* Convert RNDIS packet message offsets */ 3200 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3201 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3202 3203 /* 3204 * Fast path: Chimney sending. 
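 *
 * The RNDIS packet message has been built directly in the chimney
 * sending buffer slot, so the payload only has to be copied in
 * right behind the header and per-packet-info area and the whole
 * message is sent through hn_txpkt_chim() without a GPA list:
 *
 *   chim -> +----------------------------+
 *           | struct rndis_packet_msg    |
 *           +----------------------------+
 *           | pktinfo (hash/VLAN/csum/   |
 *           | LSO), rm_pktinfolen bytes  |
 *           +----------------------------+  <- chim + pkt_hlen
 *           | packet data copied from    |
 *           | the mbuf chain             |
 *           +----------------------------+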
3205 */ 3206 if (chim != NULL) { 3207 struct hn_txdesc *tgt_txd = txd; 3208 3209 if (txr->hn_agg_txd != NULL) { 3210 tgt_txd = txr->hn_agg_txd; 3211 #ifdef INVARIANTS 3212 *m_head0 = NULL; 3213 #endif 3214 } 3215 3216 KASSERT(pkt == chim, 3217 ("RNDIS pkt not in chimney sending buffer")); 3218 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3219 ("chimney sending buffer is not used")); 3220 tgt_txd->chim_size += pkt->rm_len; 3221 3222 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3223 ((uint8_t *)chim) + pkt_hlen); 3224 3225 txr->hn_gpa_cnt = 0; 3226 txr->hn_sendpkt = hn_txpkt_chim; 3227 goto done; 3228 } 3229 3230 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3231 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3232 ("chimney buffer is used")); 3233 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3234 3235 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3236 if (__predict_false(error)) { 3237 int freed __diagused; 3238 3239 /* 3240 * This mbuf is not linked w/ the txd yet, so free it now. 3241 */ 3242 m_freem(m_head); 3243 *m_head0 = NULL; 3244 3245 freed = hn_txdesc_put(txr, txd); 3246 KASSERT(freed != 0, 3247 ("fail to free txd upon txdma error")); 3248 3249 txr->hn_txdma_failed++; 3250 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3251 return error; 3252 } 3253 *m_head0 = m_head; 3254 3255 /* +1 RNDIS packet message */ 3256 txr->hn_gpa_cnt = nsegs + 1; 3257 3258 /* send packet with page buffer */ 3259 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3260 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3261 txr->hn_gpa[0].gpa_len = pkt_hlen; 3262 3263 /* 3264 * Fill the page buffers with mbuf info after the page 3265 * buffer for RNDIS packet message. 3266 */ 3267 for (i = 0; i < nsegs; ++i) { 3268 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3269 3270 gpa->gpa_page = atop(segs[i].ds_addr); 3271 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3272 gpa->gpa_len = segs[i].ds_len; 3273 } 3274 3275 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3276 txd->chim_size = 0; 3277 txr->hn_sendpkt = hn_txpkt_sglist; 3278 done: 3279 txd->m = m_head; 3280 3281 /* Set the completion routine */ 3282 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3283 3284 /* Update temporary stats for later use. */ 3285 txr->hn_stat_pkts++; 3286 txr->hn_stat_size += m_head->m_pkthdr.len; 3287 if (m_head->m_flags & M_MCAST) 3288 txr->hn_stat_mcasts++; 3289 3290 return 0; 3291 } 3292 3293 /* 3294 * NOTE: 3295 * If this function fails, then txd will be freed, but the mbuf 3296 * associated w/ the txd will _not_ be freed. 3297 */ 3298 static int 3299 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3300 { 3301 int error, send_failed = 0, has_bpf; 3302 3303 again: 3304 has_bpf = bpf_peers_present(if_getbpf(ifp)); 3305 if (has_bpf) { 3306 /* 3307 * Make sure that this txd and any aggregated txds are not 3308 * freed before ETHER_BPF_MTAP. 
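 *
 * The send completion can run and drop the final reference through
 * hn_txdesc_put() before this thread reaches the taps, so hold an
 * extra reference across hn_sendpkt(); the matching hn_txdesc_put()
 * below releases it once the taps are done.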
3309 */ 3310 hn_txdesc_hold(txd); 3311 } 3312 error = txr->hn_sendpkt(txr, txd); 3313 if (!error) { 3314 if (has_bpf) { 3315 const struct hn_txdesc *tmp_txd; 3316 3317 ETHER_BPF_MTAP(ifp, txd->m); 3318 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3319 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3320 } 3321 3322 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3323 #ifdef HN_IFSTART_SUPPORT 3324 if (!hn_use_if_start) 3325 #endif 3326 { 3327 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3328 txr->hn_stat_size); 3329 if (txr->hn_stat_mcasts != 0) { 3330 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3331 txr->hn_stat_mcasts); 3332 } 3333 } 3334 txr->hn_pkts += txr->hn_stat_pkts; 3335 txr->hn_sends++; 3336 } 3337 if (has_bpf) 3338 hn_txdesc_put(txr, txd); 3339 3340 if (__predict_false(error)) { 3341 int freed __diagused; 3342 3343 /* 3344 * This should "really rarely" happen. 3345 * 3346 * XXX Too many RX to be acked or too many sideband 3347 * commands to run? Ask netvsc_channel_rollup() 3348 * to kick start later. 3349 */ 3350 txr->hn_has_txeof = 1; 3351 if (!send_failed) { 3352 txr->hn_send_failed++; 3353 send_failed = 1; 3354 /* 3355 * Try sending again after set hn_has_txeof; 3356 * in case that we missed the last 3357 * netvsc_channel_rollup(). 3358 */ 3359 goto again; 3360 } 3361 if_printf(ifp, "send failed\n"); 3362 3363 /* 3364 * Caller will perform further processing on the 3365 * associated mbuf, so don't free it in hn_txdesc_put(); 3366 * only unload it from the DMA map in hn_txdesc_put(), 3367 * if it was loaded. 3368 */ 3369 txd->m = NULL; 3370 freed = hn_txdesc_put(txr, txd); 3371 KASSERT(freed != 0, 3372 ("fail to free txd upon send error")); 3373 3374 txr->hn_send_failed++; 3375 } 3376 3377 /* Reset temporary stats, after this sending is done. */ 3378 txr->hn_stat_size = 0; 3379 txr->hn_stat_pkts = 0; 3380 txr->hn_stat_mcasts = 0; 3381 3382 return (error); 3383 } 3384 3385 /* 3386 * Append the specified data to the indicated mbuf chain, 3387 * Extend the mbuf chain if the new data does not fit in 3388 * existing space. 3389 * 3390 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3391 * There should be an equivalent in the kernel mbuf code, 3392 * but there does not appear to be one yet. 3393 * 3394 * Differs from m_append() in that additional mbufs are 3395 * allocated with cluster size MJUMPAGESIZE, and filled 3396 * accordingly. 3397 * 3398 * Return the last mbuf in the chain or NULL if failed to 3399 * allocate new mbuf. 3400 */ 3401 static struct mbuf * 3402 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3403 { 3404 struct mbuf *m, *n; 3405 int remainder, space; 3406 3407 for (m = m0; m->m_next != NULL; m = m->m_next) 3408 ; 3409 remainder = len; 3410 space = M_TRAILINGSPACE(m); 3411 if (space > 0) { 3412 /* 3413 * Copy into available space. 3414 */ 3415 if (space > remainder) 3416 space = remainder; 3417 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3418 m->m_len += space; 3419 cp += space; 3420 remainder -= space; 3421 } 3422 while (remainder > 0) { 3423 /* 3424 * Allocate a new mbuf; could check space 3425 * and allocate a cluster instead. 
3426 */ 3427 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3428 if (n == NULL) 3429 return NULL; 3430 n->m_len = min(MJUMPAGESIZE, remainder); 3431 bcopy(cp, mtod(n, caddr_t), n->m_len); 3432 cp += n->m_len; 3433 remainder -= n->m_len; 3434 m->m_next = n; 3435 m = n; 3436 } 3437 3438 return m; 3439 } 3440 3441 #if defined(INET) || defined(INET6) 3442 static __inline int 3443 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3444 { 3445 if (hn_lro_mbufq_depth) { 3446 tcp_lro_queue_mbuf(lc, m); 3447 return 0; 3448 } 3449 return tcp_lro_rx(lc, m, 0); 3450 } 3451 #endif 3452 3453 static int 3454 hn_rxpkt(struct hn_rx_ring *rxr) 3455 { 3456 if_t ifp, hn_ifp = rxr->hn_ifp; 3457 struct mbuf *m_new, *n; 3458 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3459 int hash_type = M_HASHTYPE_NONE; 3460 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3461 int i; 3462 3463 ifp = hn_ifp; 3464 if (rxr->hn_rxvf_ifp != NULL) { 3465 /* 3466 * Non-transparent mode VF; pretend this packet is from 3467 * the VF. 3468 */ 3469 ifp = rxr->hn_rxvf_ifp; 3470 is_vf = 1; 3471 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3472 /* Transparent mode VF. */ 3473 is_vf = 1; 3474 } 3475 3476 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { 3477 /* 3478 * NOTE: 3479 * See the NOTE of hn_rndis_init_fixat(). This 3480 * function can be reached, immediately after the 3481 * RNDIS is initialized but before the ifnet is 3482 * setup on the hn_attach() path; drop the unexpected 3483 * packets. 3484 */ 3485 return (0); 3486 } 3487 3488 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3489 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3490 return (0); 3491 } 3492 3493 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3494 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3495 if (m_new == NULL) { 3496 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3497 return (0); 3498 } 3499 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3500 rxr->rsc.frag_len[0]); 3501 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3502 } else { 3503 /* 3504 * Get an mbuf with a cluster. For packets 2K or less, 3505 * get a standard 2K cluster. For anything larger, get a 3506 * 4K cluster. Any buffers larger than 4K can cause problems 3507 * if looped around to the Hyper-V TX channel, so avoid them. 
3508 */ 3509 size = MCLBYTES; 3510 if (rxr->rsc.pktlen > MCLBYTES) { 3511 /* 4096 */ 3512 size = MJUMPAGESIZE; 3513 } 3514 3515 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3516 if (m_new == NULL) { 3517 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3518 return (0); 3519 } 3520 3521 n = m_new; 3522 for (i = 0; i < rxr->rsc.cnt; i++) { 3523 n = hv_m_append(n, rxr->rsc.frag_len[i], 3524 rxr->rsc.frag_data[i]); 3525 if (n == NULL) { 3526 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3527 return (0); 3528 } else { 3529 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3530 } 3531 } 3532 } 3533 if (rxr->rsc.pktlen <= MHLEN) 3534 rxr->hn_small_pkts++; 3535 3536 m_new->m_pkthdr.rcvif = ifp; 3537 3538 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) 3539 do_csum = 0; 3540 3541 /* receive side checksum offload */ 3542 if (rxr->rsc.csum_info != NULL) { 3543 /* IP csum offload */ 3544 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3545 m_new->m_pkthdr.csum_flags |= 3546 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3547 rxr->hn_csum_ip++; 3548 } 3549 3550 /* TCP/UDP csum offload */ 3551 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3552 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3553 m_new->m_pkthdr.csum_flags |= 3554 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3555 m_new->m_pkthdr.csum_data = 0xffff; 3556 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3557 rxr->hn_csum_tcp++; 3558 else 3559 rxr->hn_csum_udp++; 3560 } 3561 3562 /* 3563 * XXX 3564 * As of this write (Oct 28th, 2016), host side will turn 3565 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3566 * the do_lro setting here is actually _not_ accurate. We 3567 * depend on the RSS hash type check to reset do_lro. 3568 */ 3569 if ((*(rxr->rsc.csum_info) & 3570 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3571 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3572 do_lro = 1; 3573 } else { 3574 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3575 if (l3proto == ETHERTYPE_IP) { 3576 if (l4proto == IPPROTO_TCP) { 3577 if (do_csum && 3578 (rxr->hn_trust_hcsum & 3579 HN_TRUST_HCSUM_TCP)) { 3580 rxr->hn_csum_trusted++; 3581 m_new->m_pkthdr.csum_flags |= 3582 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3583 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3584 m_new->m_pkthdr.csum_data = 0xffff; 3585 } 3586 do_lro = 1; 3587 } else if (l4proto == IPPROTO_UDP) { 3588 if (do_csum && 3589 (rxr->hn_trust_hcsum & 3590 HN_TRUST_HCSUM_UDP)) { 3591 rxr->hn_csum_trusted++; 3592 m_new->m_pkthdr.csum_flags |= 3593 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3594 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3595 m_new->m_pkthdr.csum_data = 0xffff; 3596 } 3597 } else if (l4proto != IPPROTO_DONE && do_csum && 3598 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3599 rxr->hn_csum_trusted++; 3600 m_new->m_pkthdr.csum_flags |= 3601 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3602 } 3603 } 3604 } 3605 3606 if (rxr->rsc.vlan_info != NULL) { 3607 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3608 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3609 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3610 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3611 m_new->m_flags |= M_VLANTAG; 3612 } 3613 3614 /* 3615 * If VF is activated (tranparent/non-transparent mode does not 3616 * matter here). 3617 * 3618 * - Disable LRO 3619 * 3620 * hn(4) will only receive broadcast packets, multicast packets, 3621 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3622 * packet types. 
3623 * 3624 * For non-transparent, we definitely _cannot_ enable LRO at 3625 * all, since the LRO flush will use hn(4) as the receiving 3626 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3627 */ 3628 if (is_vf) 3629 do_lro = 0; 3630 3631 /* 3632 * If VF is activated (tranparent/non-transparent mode does not 3633 * matter here), do _not_ mess with unsupported hash types or 3634 * functions. 3635 */ 3636 if (rxr->rsc.hash_info != NULL) { 3637 rxr->hn_rss_pkts++; 3638 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3639 if (!is_vf) 3640 hash_type = M_HASHTYPE_OPAQUE_HASH; 3641 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3642 NDIS_HASH_FUNCTION_TOEPLITZ) { 3643 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3644 rxr->hn_mbuf_hash); 3645 3646 /* 3647 * NOTE: 3648 * do_lro is resetted, if the hash types are not TCP 3649 * related. See the comment in the above csum_flags 3650 * setup section. 3651 */ 3652 switch (type) { 3653 case NDIS_HASH_IPV4: 3654 hash_type = M_HASHTYPE_RSS_IPV4; 3655 do_lro = 0; 3656 break; 3657 3658 case NDIS_HASH_TCP_IPV4: 3659 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3660 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3661 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3662 3663 if (is_vf) 3664 def_htype = M_HASHTYPE_NONE; 3665 3666 /* 3667 * UDP 4-tuple hash is delivered as 3668 * TCP 4-tuple hash. 3669 */ 3670 if (l3proto == ETHERTYPE_MAX) { 3671 hn_rxpkt_proto(m_new, 3672 &l3proto, &l4proto); 3673 } 3674 if (l3proto == ETHERTYPE_IP) { 3675 if (l4proto == IPPROTO_UDP && 3676 (rxr->hn_mbuf_hash & 3677 NDIS_HASH_UDP_IPV4_X)) { 3678 hash_type = 3679 M_HASHTYPE_RSS_UDP_IPV4; 3680 do_lro = 0; 3681 } else if (l4proto != 3682 IPPROTO_TCP) { 3683 hash_type = def_htype; 3684 do_lro = 0; 3685 } 3686 } else { 3687 hash_type = def_htype; 3688 do_lro = 0; 3689 } 3690 } 3691 break; 3692 3693 case NDIS_HASH_IPV6: 3694 hash_type = M_HASHTYPE_RSS_IPV6; 3695 do_lro = 0; 3696 break; 3697 3698 case NDIS_HASH_IPV6_EX: 3699 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3700 do_lro = 0; 3701 break; 3702 3703 case NDIS_HASH_TCP_IPV6: 3704 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3705 break; 3706 3707 case NDIS_HASH_TCP_IPV6_EX: 3708 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3709 break; 3710 } 3711 } 3712 } else if (!is_vf) { 3713 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3714 hash_type = M_HASHTYPE_OPAQUE; 3715 } 3716 M_HASHTYPE_SET(m_new, hash_type); 3717 3718 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3719 if (hn_ifp != ifp) { 3720 const struct ether_header *eh; 3721 3722 /* 3723 * Non-transparent mode VF is activated. 3724 */ 3725 3726 /* 3727 * Allow tapping on hn(4). 3728 */ 3729 ETHER_BPF_MTAP(hn_ifp, m_new); 3730 3731 /* 3732 * Update hn(4)'s stats. 3733 */ 3734 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3735 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3736 /* Checked at the beginning of this function. */ 3737 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3738 eh = mtod(m_new, struct ether_header *); 3739 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3740 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3741 } 3742 rxr->hn_pkts++; 3743 3744 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { 3745 #if defined(INET) || defined(INET6) 3746 struct lro_ctrl *lro = &rxr->hn_lro; 3747 3748 if (lro->lro_cnt) { 3749 rxr->hn_lro_tried++; 3750 if (hn_lro_rx(lro, m_new) == 0) { 3751 /* DONE! 
*/ 3752 return 0; 3753 } 3754 } 3755 #endif 3756 } 3757 if_input(ifp, m_new); 3758 3759 return (0); 3760 } 3761 3762 static int 3763 hn_ioctl(if_t ifp, u_long cmd, caddr_t data) 3764 { 3765 struct hn_softc *sc = if_getsoftc(ifp); 3766 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3767 if_t vf_ifp; 3768 int mask, error = 0; 3769 struct ifrsskey *ifrk; 3770 struct ifrsshash *ifrh; 3771 uint32_t mtu; 3772 3773 switch (cmd) { 3774 case SIOCSIFMTU: 3775 if (ifr->ifr_mtu > HN_MTU_MAX) { 3776 error = EINVAL; 3777 break; 3778 } 3779 3780 HN_LOCK(sc); 3781 3782 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3783 HN_UNLOCK(sc); 3784 break; 3785 } 3786 3787 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3788 /* Can't change MTU */ 3789 HN_UNLOCK(sc); 3790 error = EOPNOTSUPP; 3791 break; 3792 } 3793 3794 if (if_getmtu(ifp) == ifr->ifr_mtu) { 3795 HN_UNLOCK(sc); 3796 break; 3797 } 3798 3799 if (hn_xpnt_vf_isready(sc)) { 3800 vf_ifp = sc->hn_vf_ifp; 3801 ifr_vf = *ifr; 3802 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), 3803 sizeof(ifr_vf.ifr_name)); 3804 error = ifhwioctl(SIOCSIFMTU,vf_ifp, 3805 (caddr_t)&ifr_vf, curthread); 3806 if (error) { 3807 HN_UNLOCK(sc); 3808 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3809 if_name(vf_ifp), ifr->ifr_mtu, error); 3810 break; 3811 } 3812 } 3813 3814 /* 3815 * Suspend this interface before the synthetic parts 3816 * are ripped. 3817 */ 3818 hn_suspend(sc); 3819 3820 /* 3821 * Detach the synthetics parts, i.e. NVS and RNDIS. 3822 */ 3823 hn_synth_detach(sc); 3824 3825 /* 3826 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3827 * with the new MTU setting. 3828 */ 3829 error = hn_synth_attach(sc, ifr->ifr_mtu); 3830 if (error) { 3831 HN_UNLOCK(sc); 3832 break; 3833 } 3834 3835 error = hn_rndis_get_mtu(sc, &mtu); 3836 if (error) 3837 mtu = ifr->ifr_mtu; 3838 else if (bootverbose) 3839 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3840 3841 /* 3842 * Commit the requested MTU, after the synthetic parts 3843 * have been successfully attached. 3844 */ 3845 if (mtu >= ifr->ifr_mtu) { 3846 mtu = ifr->ifr_mtu; 3847 } else { 3848 if_printf(ifp, "fixup mtu %d -> %u\n", 3849 ifr->ifr_mtu, mtu); 3850 } 3851 if_setmtu(ifp, mtu); 3852 3853 /* 3854 * Synthetic parts' reattach may change the chimney 3855 * sending size; update it. 3856 */ 3857 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3858 hn_set_chim_size(sc, sc->hn_chim_szmax); 3859 3860 /* 3861 * Make sure that various parameters based on MTU are 3862 * still valid, after the MTU change. 3863 */ 3864 hn_mtu_change_fixup(sc); 3865 3866 /* 3867 * All done! Resume the interface now. 3868 */ 3869 hn_resume(sc); 3870 3871 if ((sc->hn_flags & HN_FLAG_RXVF) || 3872 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3873 /* 3874 * Since we have reattached the NVS part, 3875 * change the datapath to VF again; in case 3876 * that it is lost, after the NVS was detached. 3877 */ 3878 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3879 } 3880 3881 HN_UNLOCK(sc); 3882 break; 3883 3884 case SIOCSIFFLAGS: 3885 HN_LOCK(sc); 3886 3887 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3888 HN_UNLOCK(sc); 3889 break; 3890 } 3891 3892 if (hn_xpnt_vf_isready(sc)) 3893 hn_xpnt_vf_saveifflags(sc); 3894 3895 if (if_getflags(ifp) & IFF_UP) { 3896 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3897 /* 3898 * Caller meight hold mutex, e.g. 3899 * bpf; use busy-wait for the RNDIS 3900 * reply. 
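 *
 * The HN_NO_SLEEPING()/HN_SLEEPING_OK() pair
 * brackets the RNDIS exchange so that
 * hn_rxfilter_config() polls for the reply
 * instead of sleeping with a caller-held lock
 * outstanding.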
3901 */ 3902 HN_NO_SLEEPING(sc); 3903 hn_rxfilter_config(sc); 3904 HN_SLEEPING_OK(sc); 3905 3906 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3907 error = hn_xpnt_vf_iocsetflags(sc); 3908 } else { 3909 hn_init_locked(sc); 3910 } 3911 } else { 3912 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 3913 hn_stop(sc, false); 3914 } 3915 sc->hn_if_flags = if_getflags(ifp); 3916 3917 HN_UNLOCK(sc); 3918 break; 3919 3920 case SIOCSIFCAP: 3921 HN_LOCK(sc); 3922 3923 if (hn_xpnt_vf_isready(sc)) { 3924 ifr_vf = *ifr; 3925 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), 3926 sizeof(ifr_vf.ifr_name)); 3927 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3928 HN_UNLOCK(sc); 3929 break; 3930 } 3931 3932 /* 3933 * Fix up requested capabilities w/ supported capabilities, 3934 * since the supported capabilities could have been changed. 3935 */ 3936 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ 3937 if_getcapenable(ifp); 3938 3939 if (mask & IFCAP_TXCSUM) { 3940 if_togglecapenable(ifp, IFCAP_TXCSUM); 3941 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 3942 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); 3943 else 3944 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); 3945 } 3946 if (mask & IFCAP_TXCSUM_IPV6) { 3947 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); 3948 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 3949 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); 3950 else 3951 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); 3952 } 3953 3954 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3955 if (mask & IFCAP_RXCSUM) 3956 if_togglecapenable(ifp, IFCAP_RXCSUM); 3957 #ifdef foo 3958 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3959 if (mask & IFCAP_RXCSUM_IPV6) 3960 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); 3961 #endif 3962 3963 if (mask & IFCAP_LRO) 3964 if_togglecapenable(ifp, IFCAP_LRO); 3965 3966 if (mask & IFCAP_TSO4) { 3967 if_togglecapenable(ifp, IFCAP_TSO4); 3968 if (if_getcapenable(ifp) & IFCAP_TSO4) 3969 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 3970 else 3971 if_sethwassistbits(ifp, 0, CSUM_IP_TSO); 3972 } 3973 if (mask & IFCAP_TSO6) { 3974 if_togglecapenable(ifp, IFCAP_TSO6); 3975 if (if_getcapenable(ifp) & IFCAP_TSO6) 3976 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 3977 else 3978 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); 3979 } 3980 3981 HN_UNLOCK(sc); 3982 break; 3983 3984 case SIOCADDMULTI: 3985 case SIOCDELMULTI: 3986 HN_LOCK(sc); 3987 3988 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3989 HN_UNLOCK(sc); 3990 break; 3991 } 3992 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3993 /* 3994 * Multicast uses mutex; use busy-wait for 3995 * the RNDIS reply. 3996 */ 3997 HN_NO_SLEEPING(sc); 3998 hn_rxfilter_config(sc); 3999 HN_SLEEPING_OK(sc); 4000 } 4001 4002 /* XXX vlan(4) style mcast addr maintenance */ 4003 if (hn_xpnt_vf_isready(sc)) { 4004 int old_if_flags; 4005 4006 old_if_flags = if_getflags(sc->hn_vf_ifp); 4007 hn_xpnt_vf_saveifflags(sc); 4008 4009 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 4010 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & 4011 IFF_ALLMULTI)) 4012 error = hn_xpnt_vf_iocsetflags(sc); 4013 } 4014 4015 HN_UNLOCK(sc); 4016 break; 4017 4018 case SIOCSIFMEDIA: 4019 case SIOCGIFMEDIA: 4020 HN_LOCK(sc); 4021 if (hn_xpnt_vf_isready(sc)) { 4022 /* 4023 * SIOCGIFMEDIA expects ifmediareq, so don't 4024 * create and pass ifr_vf to the VF here; just 4025 * replace the ifr_name. 
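 *
 * The request is handed to the VF through ifhwioctl()
 * with ifr_name temporarily rewritten to the VF's
 * name; the original name is restored before
 * returning the result to the caller.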
4026 */ 4027 vf_ifp = sc->hn_vf_ifp; 4028 strlcpy(ifr->ifr_name, if_name(vf_ifp), 4029 sizeof(ifr->ifr_name)); 4030 error = ifhwioctl(cmd, vf_ifp, data, curthread); 4031 /* Restore the ifr_name. */ 4032 strlcpy(ifr->ifr_name, if_name(ifp), 4033 sizeof(ifr->ifr_name)); 4034 HN_UNLOCK(sc); 4035 break; 4036 } 4037 HN_UNLOCK(sc); 4038 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4039 break; 4040 4041 case SIOCGIFRSSHASH: 4042 ifrh = (struct ifrsshash *)data; 4043 HN_LOCK(sc); 4044 if (sc->hn_rx_ring_inuse == 1) { 4045 HN_UNLOCK(sc); 4046 ifrh->ifrh_func = RSS_FUNC_NONE; 4047 ifrh->ifrh_types = 0; 4048 break; 4049 } 4050 4051 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4052 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4053 else 4054 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4055 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4056 HN_UNLOCK(sc); 4057 break; 4058 4059 case SIOCGIFRSSKEY: 4060 ifrk = (struct ifrsskey *)data; 4061 HN_LOCK(sc); 4062 if (sc->hn_rx_ring_inuse == 1) { 4063 HN_UNLOCK(sc); 4064 ifrk->ifrk_func = RSS_FUNC_NONE; 4065 ifrk->ifrk_keylen = 0; 4066 break; 4067 } 4068 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4069 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4070 else 4071 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4072 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4073 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4074 NDIS_HASH_KEYSIZE_TOEPLITZ); 4075 HN_UNLOCK(sc); 4076 break; 4077 4078 default: 4079 error = ether_ioctl(ifp, cmd, data); 4080 break; 4081 } 4082 return (error); 4083 } 4084 4085 static void 4086 hn_stop(struct hn_softc *sc, bool detaching) 4087 { 4088 if_t ifp = sc->hn_ifp; 4089 int i; 4090 4091 HN_LOCK_ASSERT(sc); 4092 4093 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4094 ("synthetic parts were not attached")); 4095 4096 /* Clear RUNNING bit ASAP. */ 4097 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 4098 4099 /* Disable polling. */ 4100 hn_polling(sc, 0); 4101 4102 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4103 KASSERT(sc->hn_vf_ifp != NULL, 4104 ("%s: VF is not attached", if_name(ifp))); 4105 4106 /* Mark transparent mode VF as disabled. */ 4107 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4108 4109 /* 4110 * NOTE: 4111 * Datapath setting must happen _before_ bringing 4112 * the VF down. 4113 */ 4114 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4115 4116 /* 4117 * Bring the VF down. 4118 */ 4119 hn_xpnt_vf_saveifflags(sc); 4120 if_setflagbits(ifp, 0, IFF_UP); 4121 hn_xpnt_vf_iocsetflags(sc); 4122 } 4123 4124 /* Suspend data transfers. */ 4125 hn_suspend_data(sc); 4126 4127 /* Clear OACTIVE bit. */ 4128 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4129 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4130 sc->hn_tx_ring[i].hn_oactive = 0; 4131 4132 /* 4133 * If the non-transparent mode VF is active, make sure 4134 * that the RX filter still allows packet reception. 4135 */ 4136 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4137 hn_rxfilter_config(sc); 4138 } 4139 4140 static void 4141 hn_init_locked(struct hn_softc *sc) 4142 { 4143 if_t ifp = sc->hn_ifp; 4144 int i; 4145 4146 HN_LOCK_ASSERT(sc); 4147 4148 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4149 return; 4150 4151 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 4152 return; 4153 4154 /* Configure RX filter */ 4155 hn_rxfilter_config(sc); 4156 4157 /* Clear OACTIVE bit. */ 4158 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4159 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4160 sc->hn_tx_ring[i].hn_oactive = 0; 4161 4162 /* Clear TX 'suspended' bit. 
*/ 4163 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4164 4165 if (hn_xpnt_vf_isready(sc)) { 4166 /* Initialize transparent VF. */ 4167 hn_xpnt_vf_init(sc); 4168 } 4169 4170 /* Everything is ready; unleash! */ 4171 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); 4172 4173 /* Re-enable polling if requested. */ 4174 if (sc->hn_pollhz > 0) 4175 hn_polling(sc, sc->hn_pollhz); 4176 } 4177 4178 static void 4179 hn_init(void *xsc) 4180 { 4181 struct hn_softc *sc = xsc; 4182 4183 HN_LOCK(sc); 4184 hn_init_locked(sc); 4185 HN_UNLOCK(sc); 4186 } 4187 4188 static int 4189 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4190 { 4191 struct hn_softc *sc = arg1; 4192 unsigned int lenlim; 4193 int error; 4194 4195 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4196 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4197 if (error || req->newptr == NULL) 4198 return error; 4199 4200 HN_LOCK(sc); 4201 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4202 lenlim > TCP_LRO_LENGTH_MAX) { 4203 HN_UNLOCK(sc); 4204 return EINVAL; 4205 } 4206 hn_set_lro_lenlim(sc, lenlim); 4207 HN_UNLOCK(sc); 4208 4209 return 0; 4210 } 4211 4212 static int 4213 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4214 { 4215 struct hn_softc *sc = arg1; 4216 int ackcnt, error, i; 4217 4218 /* 4219 * lro_ackcnt_lim is append count limit, 4220 * +1 to turn it into aggregation limit. 4221 */ 4222 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4223 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4224 if (error || req->newptr == NULL) 4225 return error; 4226 4227 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4228 return EINVAL; 4229 4230 /* 4231 * Convert aggregation limit back to append 4232 * count limit. 4233 */ 4234 --ackcnt; 4235 HN_LOCK(sc); 4236 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4237 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4238 HN_UNLOCK(sc); 4239 return 0; 4240 } 4241 4242 static int 4243 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4244 { 4245 struct hn_softc *sc = arg1; 4246 int hcsum = arg2; 4247 int on, error, i; 4248 4249 on = 0; 4250 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4251 on = 1; 4252 4253 error = sysctl_handle_int(oidp, &on, 0, req); 4254 if (error || req->newptr == NULL) 4255 return error; 4256 4257 HN_LOCK(sc); 4258 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4259 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4260 4261 if (on) 4262 rxr->hn_trust_hcsum |= hcsum; 4263 else 4264 rxr->hn_trust_hcsum &= ~hcsum; 4265 } 4266 HN_UNLOCK(sc); 4267 return 0; 4268 } 4269 4270 static int 4271 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4272 { 4273 struct hn_softc *sc = arg1; 4274 int chim_size, error; 4275 4276 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4277 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4278 if (error || req->newptr == NULL) 4279 return error; 4280 4281 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4282 return EINVAL; 4283 4284 HN_LOCK(sc); 4285 hn_set_chim_size(sc, chim_size); 4286 HN_UNLOCK(sc); 4287 return 0; 4288 } 4289 4290 static int 4291 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4292 { 4293 struct hn_softc *sc = arg1; 4294 int ofs = arg2, i, error; 4295 struct hn_rx_ring *rxr; 4296 uint64_t stat; 4297 4298 stat = 0; 4299 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4300 rxr = &sc->hn_rx_ring[i]; 4301 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4302 } 4303 4304 error = sysctl_handle_64(oidp, &stat, 0, req); 4305 if (error || req->newptr == NULL) 4306 return error; 4307 4308 /* Zero out this stat. 
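 * Any successful write through this sysctl clears the per-ring
 * counters that were just summed above; e.g. (with a hypothetical
 * unit number) "sysctl dev.hn.0.lro_queued=0" resets the aggregate
 * LRO-queued counter on every RX ring.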
*/ 4309 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4310 rxr = &sc->hn_rx_ring[i]; 4311 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4312 } 4313 return 0; 4314 } 4315 4316 static int 4317 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4318 { 4319 struct hn_softc *sc = arg1; 4320 int ofs = arg2, i, error; 4321 struct hn_rx_ring *rxr; 4322 u_long stat; 4323 4324 stat = 0; 4325 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4326 rxr = &sc->hn_rx_ring[i]; 4327 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4328 } 4329 4330 error = sysctl_handle_long(oidp, &stat, 0, req); 4331 if (error || req->newptr == NULL) 4332 return error; 4333 4334 /* Zero out this stat. */ 4335 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4336 rxr = &sc->hn_rx_ring[i]; 4337 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4338 } 4339 return 0; 4340 } 4341 4342 static int 4343 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4344 { 4345 struct hn_softc *sc = arg1; 4346 int ofs = arg2, i, error; 4347 struct hn_tx_ring *txr; 4348 u_long stat; 4349 4350 stat = 0; 4351 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4352 txr = &sc->hn_tx_ring[i]; 4353 stat += *((u_long *)((uint8_t *)txr + ofs)); 4354 } 4355 4356 error = sysctl_handle_long(oidp, &stat, 0, req); 4357 if (error || req->newptr == NULL) 4358 return error; 4359 4360 /* Zero out this stat. */ 4361 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4362 txr = &sc->hn_tx_ring[i]; 4363 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4364 } 4365 return 0; 4366 } 4367 4368 static int 4369 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4370 { 4371 struct hn_softc *sc = arg1; 4372 int ofs = arg2, i, error, conf; 4373 struct hn_tx_ring *txr; 4374 4375 txr = &sc->hn_tx_ring[0]; 4376 conf = *((int *)((uint8_t *)txr + ofs)); 4377 4378 error = sysctl_handle_int(oidp, &conf, 0, req); 4379 if (error || req->newptr == NULL) 4380 return error; 4381 4382 HN_LOCK(sc); 4383 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4384 txr = &sc->hn_tx_ring[i]; 4385 *((int *)((uint8_t *)txr + ofs)) = conf; 4386 } 4387 HN_UNLOCK(sc); 4388 4389 return 0; 4390 } 4391 4392 static int 4393 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4394 { 4395 struct hn_softc *sc = arg1; 4396 int error, size; 4397 4398 size = sc->hn_agg_size; 4399 error = sysctl_handle_int(oidp, &size, 0, req); 4400 if (error || req->newptr == NULL) 4401 return (error); 4402 4403 HN_LOCK(sc); 4404 sc->hn_agg_size = size; 4405 hn_set_txagg(sc); 4406 HN_UNLOCK(sc); 4407 4408 return (0); 4409 } 4410 4411 static int 4412 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4413 { 4414 struct hn_softc *sc = arg1; 4415 int error, pkts; 4416 4417 pkts = sc->hn_agg_pkts; 4418 error = sysctl_handle_int(oidp, &pkts, 0, req); 4419 if (error || req->newptr == NULL) 4420 return (error); 4421 4422 HN_LOCK(sc); 4423 sc->hn_agg_pkts = pkts; 4424 hn_set_txagg(sc); 4425 HN_UNLOCK(sc); 4426 4427 return (0); 4428 } 4429 4430 static int 4431 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4432 { 4433 struct hn_softc *sc = arg1; 4434 int pkts; 4435 4436 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4437 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4438 } 4439 4440 static int 4441 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4442 { 4443 struct hn_softc *sc = arg1; 4444 int align; 4445 4446 align = sc->hn_tx_ring[0].hn_agg_align; 4447 return (sysctl_handle_int(oidp, &align, 0, req)); 4448 } 4449 4450 static void 4451 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4452 { 4453 if (pollhz == 0) 4454 vmbus_chan_poll_disable(chan); 4455 else 4456 vmbus_chan_poll_enable(chan, pollhz); 4457 } 4458 4459 static void 4460 
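/*
 * Propagate the requested polling rate to the primary channel and to
 * every sub-channel currently in use; pollhz == 0 switches the
 * channels back to interrupt-driven operation.
 */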
hn_polling(struct hn_softc *sc, u_int pollhz) 4461 { 4462 int nsubch = sc->hn_rx_ring_inuse - 1; 4463 4464 HN_LOCK_ASSERT(sc); 4465 4466 if (nsubch > 0) { 4467 struct vmbus_channel **subch; 4468 int i; 4469 4470 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4471 for (i = 0; i < nsubch; ++i) 4472 hn_chan_polling(subch[i], pollhz); 4473 vmbus_subchan_rel(subch, nsubch); 4474 } 4475 hn_chan_polling(sc->hn_prichan, pollhz); 4476 } 4477 4478 static int 4479 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4480 { 4481 struct hn_softc *sc = arg1; 4482 int pollhz, error; 4483 4484 pollhz = sc->hn_pollhz; 4485 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4486 if (error || req->newptr == NULL) 4487 return (error); 4488 4489 if (pollhz != 0 && 4490 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4491 return (EINVAL); 4492 4493 HN_LOCK(sc); 4494 if (sc->hn_pollhz != pollhz) { 4495 sc->hn_pollhz = pollhz; 4496 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && 4497 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4498 hn_polling(sc, sc->hn_pollhz); 4499 } 4500 HN_UNLOCK(sc); 4501 4502 return (0); 4503 } 4504 4505 static int 4506 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4507 { 4508 struct hn_softc *sc = arg1; 4509 char verstr[16]; 4510 4511 snprintf(verstr, sizeof(verstr), "%u.%u", 4512 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4513 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4514 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4515 } 4516 4517 static int 4518 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4519 { 4520 struct hn_softc *sc = arg1; 4521 char caps_str[128]; 4522 uint32_t caps; 4523 4524 HN_LOCK(sc); 4525 caps = sc->hn_caps; 4526 HN_UNLOCK(sc); 4527 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4528 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4529 } 4530 4531 static int 4532 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4533 { 4534 struct hn_softc *sc = arg1; 4535 char assist_str[128]; 4536 uint32_t hwassist; 4537 4538 HN_LOCK(sc); 4539 hwassist = if_gethwassist(sc->hn_ifp); 4540 HN_UNLOCK(sc); 4541 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4542 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4543 } 4544 4545 static int 4546 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4547 { 4548 struct hn_softc *sc = arg1; 4549 char filter_str[128]; 4550 uint32_t filter; 4551 4552 HN_LOCK(sc); 4553 filter = sc->hn_rx_filter; 4554 HN_UNLOCK(sc); 4555 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4556 NDIS_PACKET_TYPES); 4557 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4558 } 4559 4560 static int 4561 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4562 { 4563 struct hn_softc *sc = arg1; 4564 uint32_t mtu; 4565 int error; 4566 HN_LOCK(sc); 4567 error = hn_rndis_get_mtu(sc, &mtu); 4568 if (error) { 4569 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4570 goto back; 4571 } 4572 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4573 if (error || req->newptr == NULL) 4574 goto back; 4575 4576 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4577 if (error) 4578 goto back; 4579 error = hn_rndis_reconf_offload(sc, mtu); 4580 back: 4581 HN_UNLOCK(sc); 4582 return (error); 4583 } 4584 #ifndef RSS 4585 4586 static int 4587 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4588 { 4589 struct hn_softc *sc = arg1; 4590 int error; 4591 4592 HN_LOCK(sc); 4593 4594 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4595 if (error || req->newptr == NULL) 
4596 goto back; 4597 4598 if ((sc->hn_flags & HN_FLAG_RXVF) || 4599 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4600 /* 4601 * RSS key is synchronized w/ VF's, don't allow users 4602 * to change it. 4603 */ 4604 error = EBUSY; 4605 goto back; 4606 } 4607 4608 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4609 if (error) 4610 goto back; 4611 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4612 4613 if (sc->hn_rx_ring_inuse > 1) { 4614 error = hn_rss_reconfig(sc); 4615 } else { 4616 /* Not RSS capable, at least for now; just save the RSS key. */ 4617 error = 0; 4618 } 4619 back: 4620 HN_UNLOCK(sc); 4621 return (error); 4622 } 4623 4624 static int 4625 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4626 { 4627 struct hn_softc *sc = arg1; 4628 int error; 4629 4630 HN_LOCK(sc); 4631 4632 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4633 if (error || req->newptr == NULL) 4634 goto back; 4635 4636 /* 4637 * Don't allow RSS indirect table change, if this interface is not 4638 * RSS capable currently. 4639 */ 4640 if (sc->hn_rx_ring_inuse == 1) { 4641 error = EOPNOTSUPP; 4642 goto back; 4643 } 4644 4645 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4646 if (error) 4647 goto back; 4648 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4649 4650 hn_rss_ind_fixup(sc); 4651 error = hn_rss_reconfig(sc); 4652 back: 4653 HN_UNLOCK(sc); 4654 return (error); 4655 } 4656 4657 #endif /* !RSS */ 4658 4659 static int 4660 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4661 { 4662 struct hn_softc *sc = arg1; 4663 char hash_str[128]; 4664 uint32_t hash; 4665 4666 HN_LOCK(sc); 4667 hash = sc->hn_rss_hash; 4668 HN_UNLOCK(sc); 4669 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4670 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4671 } 4672 4673 static int 4674 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4675 { 4676 struct hn_softc *sc = arg1; 4677 char hash_str[128]; 4678 uint32_t hash; 4679 4680 HN_LOCK(sc); 4681 hash = sc->hn_rss_hcap; 4682 HN_UNLOCK(sc); 4683 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4684 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4685 } 4686 4687 static int 4688 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4689 { 4690 struct hn_softc *sc = arg1; 4691 char hash_str[128]; 4692 uint32_t hash; 4693 4694 HN_LOCK(sc); 4695 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4696 HN_UNLOCK(sc); 4697 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4698 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4699 } 4700 4701 static int 4702 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4703 { 4704 struct hn_softc *sc = arg1; 4705 char vf_name[IFNAMSIZ + 1]; 4706 if_t vf_ifp; 4707 4708 HN_LOCK(sc); 4709 vf_name[0] = '\0'; 4710 vf_ifp = sc->hn_vf_ifp; 4711 if (vf_ifp != NULL) 4712 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4713 HN_UNLOCK(sc); 4714 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4715 } 4716 4717 static int 4718 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4719 { 4720 struct hn_softc *sc = arg1; 4721 char vf_name[IFNAMSIZ + 1]; 4722 if_t vf_ifp; 4723 4724 HN_LOCK(sc); 4725 vf_name[0] = '\0'; 4726 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4727 if (vf_ifp != NULL) 4728 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4729 HN_UNLOCK(sc); 4730 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4731 } 4732 4733 static int 4734 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4735 { 4736 struct rm_priotracker pt; 4737 struct sbuf *sb; 4738 
int error, i; 4739 bool first; 4740 4741 error = sysctl_wire_old_buffer(req, 0); 4742 if (error != 0) 4743 return (error); 4744 4745 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4746 if (sb == NULL) 4747 return (ENOMEM); 4748 4749 rm_rlock(&hn_vfmap_lock, &pt); 4750 4751 first = true; 4752 for (i = 0; i < hn_vfmap_size; ++i) { 4753 struct epoch_tracker et; 4754 if_t ifp; 4755 4756 if (hn_vfmap[i] == NULL) 4757 continue; 4758 4759 NET_EPOCH_ENTER(et); 4760 ifp = ifnet_byindex(i); 4761 if (ifp != NULL) { 4762 if (first) 4763 sbuf_printf(sb, "%s", if_name(ifp)); 4764 else 4765 sbuf_printf(sb, " %s", if_name(ifp)); 4766 first = false; 4767 } 4768 NET_EPOCH_EXIT(et); 4769 } 4770 4771 rm_runlock(&hn_vfmap_lock, &pt); 4772 4773 error = sbuf_finish(sb); 4774 sbuf_delete(sb); 4775 return (error); 4776 } 4777 4778 static int 4779 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4780 { 4781 struct rm_priotracker pt; 4782 struct sbuf *sb; 4783 int error, i; 4784 bool first; 4785 4786 error = sysctl_wire_old_buffer(req, 0); 4787 if (error != 0) 4788 return (error); 4789 4790 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4791 if (sb == NULL) 4792 return (ENOMEM); 4793 4794 rm_rlock(&hn_vfmap_lock, &pt); 4795 4796 first = true; 4797 for (i = 0; i < hn_vfmap_size; ++i) { 4798 struct epoch_tracker et; 4799 if_t ifp, hn_ifp; 4800 4801 hn_ifp = hn_vfmap[i]; 4802 if (hn_ifp == NULL) 4803 continue; 4804 4805 NET_EPOCH_ENTER(et); 4806 ifp = ifnet_byindex(i); 4807 if (ifp != NULL) { 4808 if (first) { 4809 sbuf_printf(sb, "%s:%s", if_name(ifp), 4810 if_name(hn_ifp)); 4811 } else { 4812 sbuf_printf(sb, " %s:%s", if_name(ifp), 4813 if_name(hn_ifp)); 4814 } 4815 first = false; 4816 } 4817 NET_EPOCH_EXIT(et); 4818 } 4819 4820 rm_runlock(&hn_vfmap_lock, &pt); 4821 4822 error = sbuf_finish(sb); 4823 sbuf_delete(sb); 4824 return (error); 4825 } 4826 4827 static int 4828 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4829 { 4830 struct hn_softc *sc = arg1; 4831 int error, onoff = 0; 4832 4833 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4834 onoff = 1; 4835 error = sysctl_handle_int(oidp, &onoff, 0, req); 4836 if (error || req->newptr == NULL) 4837 return (error); 4838 4839 HN_LOCK(sc); 4840 /* NOTE: hn_vf_lock for hn_transmit() */ 4841 rm_wlock(&sc->hn_vf_lock); 4842 if (onoff) 4843 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4844 else 4845 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4846 rm_wunlock(&sc->hn_vf_lock); 4847 HN_UNLOCK(sc); 4848 4849 return (0); 4850 } 4851 4852 static int 4853 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4854 { 4855 struct hn_softc *sc = arg1; 4856 int enabled = 0; 4857 4858 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4859 enabled = 1; 4860 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4861 } 4862 4863 static int 4864 hn_check_iplen(const struct mbuf *m, int hoff) 4865 { 4866 const struct ip *ip; 4867 int len, iphlen, iplen; 4868 const struct tcphdr *th; 4869 int thoff; /* TCP data offset */ 4870 4871 len = hoff + sizeof(struct ip); 4872 4873 /* The packet must be at least the size of an IP header. */ 4874 if (m->m_pkthdr.len < len) 4875 return IPPROTO_DONE; 4876 4877 /* The fixed IP header must reside completely in the first mbuf. */ 4878 if (m->m_len < len) 4879 return IPPROTO_DONE; 4880 4881 ip = mtodo(m, hoff); 4882 4883 /* Bound check the packet's stated IP header length. */ 4884 iphlen = ip->ip_hl << 2; 4885 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4886 return IPPROTO_DONE; 4887 4888 /* The full IP header must reside completely in the one mbuf. 
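 * (iphlen was computed from ip_hl above and is already known to be
 * at least sizeof(struct ip), so this check only has to make sure
 * the first mbuf also covers any IP options.)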
 */
4889	if (m->m_len < hoff + iphlen)
4890		return IPPROTO_DONE;
4891
4892	iplen = ntohs(ip->ip_len);
4893
4894	/*
4895	 * Check that the amount of data in the buffers is at least
4896	 * as much as the IP header would have us expect.
4897	 */
4898	if (m->m_pkthdr.len < hoff + iplen)
4899		return IPPROTO_DONE;
4900
4901	/*
4902	 * Ignore IP fragments.
4903	 */
4904	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4905		return IPPROTO_DONE;
4906
4907	/*
4908	 * The TCP/IP or UDP/IP header must be entirely contained within
4909	 * the first fragment of a packet.
4910	 */
4911	switch (ip->ip_p) {
4912	case IPPROTO_TCP:
4913		if (iplen < iphlen + sizeof(struct tcphdr))
4914			return IPPROTO_DONE;
4915		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4916			return IPPROTO_DONE;
4917		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4918		thoff = th->th_off << 2;
4919		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4920			return IPPROTO_DONE;
4921		if (m->m_len < hoff + iphlen + thoff)
4922			return IPPROTO_DONE;
4923		break;
4924	case IPPROTO_UDP:
4925		if (iplen < iphlen + sizeof(struct udphdr))
4926			return IPPROTO_DONE;
4927		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4928			return IPPROTO_DONE;
4929		break;
4930	default:
4931		if (iplen < iphlen)
4932			return IPPROTO_DONE;
4933		break;
4934	}
4935	return ip->ip_p;
4936 }
4937
4938 static void
4939 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4940 {
4941	const struct ether_header *eh;
4942	uint16_t etype;
4943	int hoff;
4944
4945	hoff = sizeof(*eh);
4946	/* Checked at the beginning of this function. */
4947	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4948
4949	eh = mtod(m_new, const struct ether_header *);
4950	etype = ntohs(eh->ether_type);
4951	if (etype == ETHERTYPE_VLAN) {
4952		const struct ether_vlan_header *evl;
4953
4954		hoff = sizeof(*evl);
4955		if (m_new->m_len < hoff)
4956			return;
4957		evl = mtod(m_new, const struct ether_vlan_header *);
4958		etype = ntohs(evl->evl_proto);
4959	}
4960	*l3proto = etype;
4961
4962	if (etype == ETHERTYPE_IP)
4963		*l4proto = hn_check_iplen(m_new, hoff);
4964	else
4965		*l4proto = IPPROTO_DONE;
4966 }
4967
4968 static int
4969 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4970 {
4971	struct sysctl_oid_list *child;
4972	struct sysctl_ctx_list *ctx;
4973	device_t dev = sc->hn_dev;
4974 #if defined(INET) || defined(INET6)
4975	int lroent_cnt;
4976 #endif
4977	int i;
4978
4979	/*
4980	 * Create RXBUF for reception.
4981	 *
4982	 * NOTE:
4983	 * - It is shared by all channels.
4984	 * - A large enough buffer is allocated; certain versions of NVS
4985	 *   may further limit the usable space.
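	 * - Each RX ring's hn_rxbuf pointer set below aliases this single
	 *   allocation; it is released in hn_destroy_rx_data(), unless the
	 *   host still references it (HN_FLAG_RXBUF_REF).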
4986 */ 4987 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4988 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4989 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4990 if (sc->hn_rxbuf == NULL) { 4991 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4992 return (ENOMEM); 4993 } 4994 4995 sc->hn_rx_ring_cnt = ring_cnt; 4996 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4997 4998 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4999 M_DEVBUF, M_WAITOK | M_ZERO); 5000 5001 #if defined(INET) || defined(INET6) 5002 lroent_cnt = hn_lro_entry_count; 5003 if (lroent_cnt < TCP_LRO_ENTRIES) 5004 lroent_cnt = TCP_LRO_ENTRIES; 5005 if (bootverbose) 5006 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 5007 #endif /* INET || INET6 */ 5008 5009 ctx = device_get_sysctl_ctx(dev); 5010 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5011 5012 /* Create dev.hn.UNIT.rx sysctl tree */ 5013 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5014 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5015 5016 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5017 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5018 5019 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 5020 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 5021 &rxr->hn_br_dma, BUS_DMA_WAITOK); 5022 if (rxr->hn_br == NULL) { 5023 device_printf(dev, "allocate bufring failed\n"); 5024 return (ENOMEM); 5025 } 5026 5027 if (hn_trust_hosttcp) 5028 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5029 if (hn_trust_hostudp) 5030 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5031 if (hn_trust_hostip) 5032 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5033 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5034 rxr->hn_ifp = sc->hn_ifp; 5035 if (i < sc->hn_tx_ring_cnt) 5036 rxr->hn_txr = &sc->hn_tx_ring[i]; 5037 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5038 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5039 rxr->hn_rx_idx = i; 5040 rxr->hn_rxbuf = sc->hn_rxbuf; 5041 5042 /* 5043 * Initialize LRO. 
5044 */ 5045 #if defined(INET) || defined(INET6) 5046 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5047 hn_lro_mbufq_depth); 5048 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5049 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5050 #endif /* INET || INET6 */ 5051 5052 if (sc->hn_rx_sysctl_tree != NULL) { 5053 char name[16]; 5054 5055 /* 5056 * Create per RX ring sysctl tree: 5057 * dev.hn.UNIT.rx.RINGID 5058 */ 5059 snprintf(name, sizeof(name), "%d", i); 5060 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5061 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5062 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5063 5064 if (rxr->hn_rx_sysctl_tree != NULL) { 5065 SYSCTL_ADD_ULONG(ctx, 5066 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5067 OID_AUTO, "packets", 5068 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5069 "# of packets received"); 5070 SYSCTL_ADD_ULONG(ctx, 5071 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5072 OID_AUTO, "rss_pkts", 5073 CTLFLAG_RW | CTLFLAG_STATS, 5074 &rxr->hn_rss_pkts, 5075 "# of packets w/ RSS info received"); 5076 SYSCTL_ADD_ULONG(ctx, 5077 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5078 OID_AUTO, "rsc_pkts", 5079 CTLFLAG_RW | CTLFLAG_STATS, 5080 &rxr->hn_rsc_pkts, 5081 "# of RSC packets received"); 5082 SYSCTL_ADD_ULONG(ctx, 5083 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5084 OID_AUTO, "rsc_drop", 5085 CTLFLAG_RW | CTLFLAG_STATS, 5086 &rxr->hn_rsc_drop, 5087 "# of RSC fragments dropped"); 5088 SYSCTL_ADD_INT(ctx, 5089 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5090 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5091 &rxr->hn_pktbuf_len, 0, 5092 "Temporary channel packet buffer length"); 5093 } 5094 } 5095 } 5096 5097 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5098 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5099 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5100 hn_rx_stat_u64_sysctl, 5101 "LU", "LRO queued"); 5102 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5103 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5104 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5105 hn_rx_stat_u64_sysctl, 5106 "LU", "LRO flushed"); 5107 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5108 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5109 __offsetof(struct hn_rx_ring, hn_lro_tried), 5110 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5112 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5113 hn_lro_lenlim_sysctl, "IU", 5114 "Max # of data bytes to be aggregated by LRO"); 5115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5116 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5117 hn_lro_ackcnt_sysctl, "I", 5118 "Max # of ACKs to be aggregated by LRO"); 5119 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5121 hn_trust_hcsum_sysctl, "I", 5122 "Trust tcp segment verification on host side, " 5123 "when csum info is missing"); 5124 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5125 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5126 hn_trust_hcsum_sysctl, "I", 5127 "Trust udp datagram verification on host side, " 5128 "when csum info is missing"); 5129 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5130 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5131 hn_trust_hcsum_sysctl, "I", 5132 "Trust ip packet verification on host side, " 5133 "when csum info is missing"); 5134 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5135 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5136 __offsetof(struct hn_rx_ring, hn_csum_ip), 5137 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5138 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5139 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5140 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5141 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5142 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5143 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5144 __offsetof(struct hn_rx_ring, hn_csum_udp), 5145 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5146 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5147 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5148 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5149 hn_rx_stat_ulong_sysctl, "LU", 5150 "# of packets that we trust host's csum verification"); 5151 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5152 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5153 __offsetof(struct hn_rx_ring, hn_small_pkts), 5154 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5155 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5156 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5157 __offsetof(struct hn_rx_ring, hn_ack_failed), 5158 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5159 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5160 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5161 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5162 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5163 5164 return (0); 5165 } 5166 5167 static void 5168 hn_destroy_rx_data(struct hn_softc *sc) 5169 { 5170 int i; 5171 5172 if (sc->hn_rxbuf != NULL) { 5173 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5174 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5175 else 5176 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5177 sc->hn_rxbuf = NULL; 5178 } 5179 5180 if (sc->hn_rx_ring_cnt == 0) 5181 return; 5182 5183 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5184 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5185 5186 if (rxr->hn_br == NULL) 5187 continue; 5188 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5189 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5190 } else { 5191 device_printf(sc->hn_dev, 5192 "%dth channel bufring is referenced", i); 5193 } 5194 rxr->hn_br = NULL; 5195 5196 #if defined(INET) || defined(INET6) 5197 tcp_lro_free(&rxr->hn_lro); 5198 #endif 5199 free(rxr->hn_pktbuf, M_DEVBUF); 5200 } 5201 free(sc->hn_rx_ring, M_DEVBUF); 5202 sc->hn_rx_ring = NULL; 5203 5204 sc->hn_rx_ring_cnt = 0; 5205 sc->hn_rx_ring_inuse = 0; 5206 } 5207 5208 static int 5209 hn_tx_ring_create(struct hn_softc *sc, int id) 5210 { 5211 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5212 device_t dev = sc->hn_dev; 5213 bus_dma_tag_t parent_dtag; 5214 int error, i; 5215 5216 txr->hn_sc = sc; 5217 txr->hn_tx_idx = id; 5218 5219 #ifndef HN_USE_TXDESC_BUFRING 5220 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5221 #endif 5222 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5223 5224 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5225 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5226 M_DEVBUF, M_WAITOK | M_ZERO); 5227 #ifndef HN_USE_TXDESC_BUFRING 5228 SLIST_INIT(&txr->hn_txlist); 5229 #else 5230 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5231 M_WAITOK, &txr->hn_tx_lock); 5232 #endif 5233 5234 if (hn_tx_taskq_mode == 
HN_TX_TASKQ_M_EVTTQ) { 5235 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5236 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5237 } else { 5238 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5239 } 5240 5241 #ifdef HN_IFSTART_SUPPORT 5242 if (hn_use_if_start) { 5243 txr->hn_txeof = hn_start_txeof; 5244 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5245 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5246 } else 5247 #endif 5248 { 5249 int br_depth; 5250 5251 txr->hn_txeof = hn_xmit_txeof; 5252 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5253 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5254 5255 br_depth = hn_get_txswq_depth(txr); 5256 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5257 M_WAITOK, &txr->hn_tx_lock); 5258 } 5259 5260 txr->hn_direct_tx_size = hn_direct_tx_size; 5261 5262 /* 5263 * Always schedule transmission instead of trying to do direct 5264 * transmission. This one gives the best performance so far. 5265 */ 5266 txr->hn_sched_tx = 1; 5267 5268 parent_dtag = bus_get_dma_tag(dev); 5269 5270 /* DMA tag for RNDIS packet messages. */ 5271 error = bus_dma_tag_create(parent_dtag, /* parent */ 5272 HN_RNDIS_PKT_ALIGN, /* alignment */ 5273 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5274 BUS_SPACE_MAXADDR, /* lowaddr */ 5275 BUS_SPACE_MAXADDR, /* highaddr */ 5276 NULL, NULL, /* filter, filterarg */ 5277 HN_RNDIS_PKT_LEN, /* maxsize */ 5278 1, /* nsegments */ 5279 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5280 0, /* flags */ 5281 NULL, /* lockfunc */ 5282 NULL, /* lockfuncarg */ 5283 &txr->hn_tx_rndis_dtag); 5284 if (error) { 5285 device_printf(dev, "failed to create rndis dmatag\n"); 5286 return error; 5287 } 5288 5289 /* DMA tag for data. */ 5290 error = bus_dma_tag_create(parent_dtag, /* parent */ 5291 1, /* alignment */ 5292 HN_TX_DATA_BOUNDARY, /* boundary */ 5293 BUS_SPACE_MAXADDR, /* lowaddr */ 5294 BUS_SPACE_MAXADDR, /* highaddr */ 5295 NULL, NULL, /* filter, filterarg */ 5296 HN_TX_DATA_MAXSIZE, /* maxsize */ 5297 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5298 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5299 0, /* flags */ 5300 NULL, /* lockfunc */ 5301 NULL, /* lockfuncarg */ 5302 &txr->hn_tx_data_dtag); 5303 if (error) { 5304 device_printf(dev, "failed to create data dmatag\n"); 5305 return error; 5306 } 5307 5308 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5309 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5310 5311 txd->txr = txr; 5312 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5313 STAILQ_INIT(&txd->agg_list); 5314 5315 /* 5316 * Allocate and load RNDIS packet message. 5317 */ 5318 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5319 (void **)&txd->rndis_pkt, 5320 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5321 &txd->rndis_pkt_dmap); 5322 if (error) { 5323 device_printf(dev, 5324 "failed to allocate rndis_packet_msg, %d\n", i); 5325 return error; 5326 } 5327 5328 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5329 txd->rndis_pkt_dmap, 5330 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5331 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5332 BUS_DMA_NOWAIT); 5333 if (error) { 5334 device_printf(dev, 5335 "failed to load rndis_packet_msg, %d\n", i); 5336 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5337 txd->rndis_pkt, txd->rndis_pkt_dmap); 5338 return error; 5339 } 5340 5341 /* DMA map for TX data. 
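 * If creating the data DMA map fails, the RNDIS packet DMA map and
 * memory allocated above are unloaded and freed before returning, so
 * no partially constructed txdesc is left behind.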
*/ 5342 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5343 &txd->data_dmap); 5344 if (error) { 5345 device_printf(dev, 5346 "failed to allocate tx data dmamap\n"); 5347 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5348 txd->rndis_pkt_dmap); 5349 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5350 txd->rndis_pkt, txd->rndis_pkt_dmap); 5351 return error; 5352 } 5353 5354 /* All set, put it to list */ 5355 txd->flags |= HN_TXD_FLAG_ONLIST; 5356 #ifndef HN_USE_TXDESC_BUFRING 5357 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5358 #else 5359 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5360 #endif 5361 } 5362 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5363 5364 if (sc->hn_tx_sysctl_tree != NULL) { 5365 struct sysctl_oid_list *child; 5366 struct sysctl_ctx_list *ctx; 5367 char name[16]; 5368 5369 /* 5370 * Create per TX ring sysctl tree: 5371 * dev.hn.UNIT.tx.RINGID 5372 */ 5373 ctx = device_get_sysctl_ctx(dev); 5374 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5375 5376 snprintf(name, sizeof(name), "%d", id); 5377 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5378 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5379 5380 if (txr->hn_tx_sysctl_tree != NULL) { 5381 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5382 5383 #ifdef HN_DEBUG 5384 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5385 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5386 "# of available TX descs"); 5387 #endif 5388 #ifdef HN_IFSTART_SUPPORT 5389 if (!hn_use_if_start) 5390 #endif 5391 { 5392 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5393 CTLFLAG_RD, &txr->hn_oactive, 0, 5394 "over active"); 5395 } 5396 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5397 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5398 "# of packets transmitted"); 5399 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5400 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5401 "# of sends"); 5402 } 5403 } 5404 5405 return 0; 5406 } 5407 5408 static void 5409 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5410 { 5411 struct hn_tx_ring *txr = txd->txr; 5412 5413 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5414 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5415 5416 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5417 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5418 txd->rndis_pkt_dmap); 5419 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5420 } 5421 5422 static void 5423 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5424 { 5425 5426 KASSERT(txd->refs == 0 || txd->refs == 1, 5427 ("invalid txd refs %d", txd->refs)); 5428 5429 /* Aggregated txds will be freed by their aggregating txd. */ 5430 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5431 int freed __diagused; 5432 5433 freed = hn_txdesc_put(txr, txd); 5434 KASSERT(freed, ("can't free txdesc")); 5435 } 5436 } 5437 5438 static void 5439 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5440 { 5441 int i; 5442 5443 if (txr->hn_txdesc == NULL) 5444 return; 5445 5446 /* 5447 * NOTE: 5448 * Because the freeing of aggregated txds will be deferred 5449 * to the aggregating txd, two passes are used here: 5450 * - The first pass GCes any pending txds. This GC is necessary, 5451 * since if the channels are revoked, hypervisor will not 5452 * deliver send-done for all pending txds. 5453 * - The second pass frees the busdma stuffs, i.e. after all txds 5454 * were freed. 
5455 */ 5456 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5457 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5458 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5459 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5460 5461 if (txr->hn_tx_data_dtag != NULL) 5462 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5463 if (txr->hn_tx_rndis_dtag != NULL) 5464 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5465 5466 #ifdef HN_USE_TXDESC_BUFRING 5467 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5468 #endif 5469 5470 free(txr->hn_txdesc, M_DEVBUF); 5471 txr->hn_txdesc = NULL; 5472 5473 if (txr->hn_mbuf_br != NULL) 5474 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5475 5476 #ifndef HN_USE_TXDESC_BUFRING 5477 mtx_destroy(&txr->hn_txlist_spin); 5478 #endif 5479 mtx_destroy(&txr->hn_tx_lock); 5480 } 5481 5482 static int 5483 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5484 { 5485 struct sysctl_oid_list *child; 5486 struct sysctl_ctx_list *ctx; 5487 int i; 5488 5489 /* 5490 * Create TXBUF for chimney sending. 5491 * 5492 * NOTE: It is shared by all channels. 5493 */ 5494 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5495 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5496 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5497 if (sc->hn_chim == NULL) { 5498 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5499 return (ENOMEM); 5500 } 5501 5502 sc->hn_tx_ring_cnt = ring_cnt; 5503 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5504 5505 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5506 M_DEVBUF, M_WAITOK | M_ZERO); 5507 5508 ctx = device_get_sysctl_ctx(sc->hn_dev); 5509 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5510 5511 /* Create dev.hn.UNIT.tx sysctl tree */ 5512 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5513 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5514 5515 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5516 int error; 5517 5518 error = hn_tx_ring_create(sc, i); 5519 if (error) 5520 return error; 5521 } 5522 5523 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5524 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5525 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5526 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5527 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5528 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5529 __offsetof(struct hn_tx_ring, hn_send_failed), 5530 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5531 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5532 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5533 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5534 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5535 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5536 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5537 __offsetof(struct hn_tx_ring, hn_flush_failed), 5538 hn_tx_stat_ulong_sysctl, "LU", 5539 "# of packet transmission aggregation flush failure"); 5540 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5541 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5542 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5543 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5544 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5545 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5546 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5547 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5548 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5549 CTLTYPE_ULONG 
| CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5550 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5551 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5552 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5553 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5554 "# of total TX descs"); 5555 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5556 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5557 "Chimney send packet size upper boundary"); 5558 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5559 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5560 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5561 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5562 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5563 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5564 hn_tx_conf_int_sysctl, "I", 5565 "Size of the packet for direct transmission"); 5566 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5567 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5568 __offsetof(struct hn_tx_ring, hn_sched_tx), 5569 hn_tx_conf_int_sysctl, "I", 5570 "Always schedule transmission " 5571 "instead of doing direct transmission"); 5572 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5573 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5574 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5575 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5576 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5577 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5578 "Applied packet transmission aggregation size"); 5579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5580 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5581 hn_txagg_pktmax_sysctl, "I", 5582 "Applied packet transmission aggregation packets"); 5583 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5584 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5585 hn_txagg_align_sysctl, "I", 5586 "Applied packet transmission aggregation alignment"); 5587 5588 return 0; 5589 } 5590 5591 static void 5592 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5593 { 5594 int i; 5595 5596 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5597 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5598 } 5599 5600 static void 5601 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5602 { 5603 if_t ifp = sc->hn_ifp; 5604 u_int hw_tsomax; 5605 int tso_minlen; 5606 5607 HN_LOCK_ASSERT(sc); 5608 5609 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5610 return; 5611 5612 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5613 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5614 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5615 5616 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5617 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5618 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5619 5620 if (tso_maxlen < tso_minlen) 5621 tso_maxlen = tso_minlen; 5622 else if (tso_maxlen > IP_MAXPACKET) 5623 tso_maxlen = IP_MAXPACKET; 5624 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5625 tso_maxlen = sc->hn_ndis_tso_szmax; 5626 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5627 5628 if (hn_xpnt_vf_isready(sc)) { 5629 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) 5630 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); 5631 } 5632 if_sethwtsomax(ifp, hw_tsomax); 5633 if (bootverbose) 5634 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); 5635 } 5636 5637 static void 5638 hn_fixup_tx_data(struct hn_softc *sc) 5639 { 5640 uint64_t csum_assist; 5641 int i; 5642 5643 hn_set_chim_size(sc, sc->hn_chim_szmax); 5644 
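	/*
	 * NOTE:
	 * The chimney (send buffer) packet size limit defaults to the
	 * negotiated maximum (hn_chim_szmax); the hn_tx_chimney_size
	 * tunable below may only lower it, and the limit can later be
	 * adjusted through the dev.hn.UNIT.tx_chimney_size sysctl.
	 */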
if (hn_tx_chimney_size > 0 && 5645 hn_tx_chimney_size < sc->hn_chim_szmax) 5646 hn_set_chim_size(sc, hn_tx_chimney_size); 5647 5648 csum_assist = 0; 5649 if (sc->hn_caps & HN_CAP_IPCS) 5650 csum_assist |= CSUM_IP; 5651 if (sc->hn_caps & HN_CAP_TCP4CS) 5652 csum_assist |= CSUM_IP_TCP; 5653 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5654 csum_assist |= CSUM_IP_UDP; 5655 if (sc->hn_caps & HN_CAP_TCP6CS) 5656 csum_assist |= CSUM_IP6_TCP; 5657 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5658 csum_assist |= CSUM_IP6_UDP; 5659 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5660 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5661 5662 if (sc->hn_caps & HN_CAP_HASHVAL) { 5663 /* 5664 * Support HASHVAL pktinfo on TX path. 5665 */ 5666 if (bootverbose) 5667 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5668 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5669 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5670 } 5671 } 5672 5673 static void 5674 hn_fixup_rx_data(struct hn_softc *sc) 5675 { 5676 5677 if (sc->hn_caps & HN_CAP_UDPHASH) { 5678 int i; 5679 5680 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5681 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5682 } 5683 } 5684 5685 static void 5686 hn_destroy_tx_data(struct hn_softc *sc) 5687 { 5688 int i; 5689 5690 if (sc->hn_chim != NULL) { 5691 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5692 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5693 } else { 5694 device_printf(sc->hn_dev, 5695 "chimney sending buffer is referenced"); 5696 } 5697 sc->hn_chim = NULL; 5698 } 5699 5700 if (sc->hn_tx_ring_cnt == 0) 5701 return; 5702 5703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5704 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5705 5706 free(sc->hn_tx_ring, M_DEVBUF); 5707 sc->hn_tx_ring = NULL; 5708 5709 sc->hn_tx_ring_cnt = 0; 5710 sc->hn_tx_ring_inuse = 0; 5711 } 5712 5713 #ifdef HN_IFSTART_SUPPORT 5714 5715 static void 5716 hn_start_taskfunc(void *xtxr, int pending __unused) 5717 { 5718 struct hn_tx_ring *txr = xtxr; 5719 5720 mtx_lock(&txr->hn_tx_lock); 5721 hn_start_locked(txr, 0); 5722 mtx_unlock(&txr->hn_tx_lock); 5723 } 5724 5725 static int 5726 hn_start_locked(struct hn_tx_ring *txr, int len) 5727 { 5728 struct hn_softc *sc = txr->hn_sc; 5729 if_t ifp = sc->hn_ifp; 5730 int sched = 0; 5731 5732 KASSERT(hn_use_if_start, 5733 ("hn_start_locked is called, when if_start is disabled")); 5734 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5735 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5736 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5737 5738 if (__predict_false(txr->hn_suspended)) 5739 return (0); 5740 5741 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5742 IFF_DRV_RUNNING) 5743 return (0); 5744 5745 while (!if_sendq_empty(ifp)) { 5746 struct hn_txdesc *txd; 5747 struct mbuf *m_head; 5748 int error; 5749 5750 m_head = if_dequeue(ifp); 5751 if (m_head == NULL) 5752 break; 5753 5754 if (len > 0 && m_head->m_pkthdr.len > len) { 5755 /* 5756 * This sending could be time consuming; let callers 5757 * dispatch this packet sending (and sending of any 5758 * following up packets) to tx taskqueue. 
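 * Returning sched=1 makes the callers (hn_start() and
 * hn_start_txeof()) enqueue hn_tx_task, so the remaining
 * packets are drained from the taskqueue context instead.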
5759 */ 5760 if_sendq_prepend(ifp, m_head); 5761 sched = 1; 5762 break; 5763 } 5764 5765 #if defined(INET6) || defined(INET) 5766 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5767 m_head = hn_tso_fixup(m_head); 5768 if (__predict_false(m_head == NULL)) { 5769 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5770 continue; 5771 } 5772 } else if (m_head->m_pkthdr.csum_flags & 5773 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5774 m_head = hn_set_hlen(m_head); 5775 if (__predict_false(m_head == NULL)) { 5776 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5777 continue; 5778 } 5779 } 5780 #endif 5781 5782 txd = hn_txdesc_get(txr); 5783 if (txd == NULL) { 5784 txr->hn_no_txdescs++; 5785 if_sendq_prepend(ifp, m_head); 5786 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); 5787 break; 5788 } 5789 5790 error = hn_encap(ifp, txr, txd, &m_head); 5791 if (error) { 5792 /* Both txd and m_head are freed */ 5793 KASSERT(txr->hn_agg_txd == NULL, 5794 ("encap failed w/ pending aggregating txdesc")); 5795 continue; 5796 } 5797 5798 if (txr->hn_agg_pktleft == 0) { 5799 if (txr->hn_agg_txd != NULL) { 5800 KASSERT(m_head == NULL, 5801 ("pending mbuf for aggregating txdesc")); 5802 error = hn_flush_txagg(ifp, txr); 5803 if (__predict_false(error)) { 5804 if_setdrvflagbits(ifp, 5805 IFF_DRV_OACTIVE, 0); 5806 break; 5807 } 5808 } else { 5809 KASSERT(m_head != NULL, ("mbuf was freed")); 5810 error = hn_txpkt(ifp, txr, txd); 5811 if (__predict_false(error)) { 5812 /* txd is freed, but m_head is not */ 5813 if_sendq_prepend(ifp, m_head); 5814 if_setdrvflagbits(ifp, 5815 IFF_DRV_OACTIVE, 0); 5816 break; 5817 } 5818 } 5819 } 5820 #ifdef INVARIANTS 5821 else { 5822 KASSERT(txr->hn_agg_txd != NULL, 5823 ("no aggregating txdesc")); 5824 KASSERT(m_head == NULL, 5825 ("pending mbuf for aggregating txdesc")); 5826 } 5827 #endif 5828 } 5829 5830 /* Flush pending aggerated transmission. */ 5831 if (txr->hn_agg_txd != NULL) 5832 hn_flush_txagg(ifp, txr); 5833 return (sched); 5834 } 5835 5836 static void 5837 hn_start(if_t ifp) 5838 { 5839 struct hn_softc *sc = if_getsoftc(ifp); 5840 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5841 5842 if (txr->hn_sched_tx) 5843 goto do_sched; 5844 5845 if (mtx_trylock(&txr->hn_tx_lock)) { 5846 int sched; 5847 5848 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5849 mtx_unlock(&txr->hn_tx_lock); 5850 if (!sched) 5851 return; 5852 } 5853 do_sched: 5854 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5855 } 5856 5857 static void 5858 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5859 { 5860 struct hn_tx_ring *txr = xtxr; 5861 5862 mtx_lock(&txr->hn_tx_lock); 5863 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); 5864 hn_start_locked(txr, 0); 5865 mtx_unlock(&txr->hn_tx_lock); 5866 } 5867 5868 static void 5869 hn_start_txeof(struct hn_tx_ring *txr) 5870 { 5871 struct hn_softc *sc = txr->hn_sc; 5872 if_t ifp = sc->hn_ifp; 5873 5874 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5875 5876 if (txr->hn_sched_tx) 5877 goto do_sched; 5878 5879 if (mtx_trylock(&txr->hn_tx_lock)) { 5880 int sched; 5881 5882 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5883 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5884 mtx_unlock(&txr->hn_tx_lock); 5885 if (sched) { 5886 taskqueue_enqueue(txr->hn_tx_taskq, 5887 &txr->hn_tx_task); 5888 } 5889 } else { 5890 do_sched: 5891 /* 5892 * Release the OACTIVE earlier, with the hope, that 5893 * others could catch up. The task will clear the 5894 * flag again with the hn_tx_lock to avoid possible 5895 * races. 
5896 */ 5897 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5898 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5899 } 5900 } 5901 5902 #endif /* HN_IFSTART_SUPPORT */ 5903 5904 static int 5905 hn_xmit(struct hn_tx_ring *txr, int len) 5906 { 5907 struct hn_softc *sc = txr->hn_sc; 5908 if_t ifp = sc->hn_ifp; 5909 struct mbuf *m_head; 5910 int sched = 0; 5911 5912 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5913 #ifdef HN_IFSTART_SUPPORT 5914 KASSERT(hn_use_if_start == 0, 5915 ("hn_xmit is called, when if_start is enabled")); 5916 #endif 5917 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5918 5919 if (__predict_false(txr->hn_suspended)) 5920 return (0); 5921 5922 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5923 return (0); 5924 5925 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5926 struct hn_txdesc *txd; 5927 int error; 5928 5929 if (len > 0 && m_head->m_pkthdr.len > len) { 5930 /* 5931 * This sending could be time consuming; let callers 5932 * dispatch this packet sending (and sending of any 5933 * following up packets) to tx taskqueue. 5934 */ 5935 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5936 sched = 1; 5937 break; 5938 } 5939 5940 txd = hn_txdesc_get(txr); 5941 if (txd == NULL) { 5942 txr->hn_no_txdescs++; 5943 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5944 txr->hn_oactive = 1; 5945 break; 5946 } 5947 5948 error = hn_encap(ifp, txr, txd, &m_head); 5949 if (error) { 5950 /* Both txd and m_head are freed; discard */ 5951 KASSERT(txr->hn_agg_txd == NULL, 5952 ("encap failed w/ pending aggregating txdesc")); 5953 drbr_advance(ifp, txr->hn_mbuf_br); 5954 continue; 5955 } 5956 5957 if (txr->hn_agg_pktleft == 0) { 5958 if (txr->hn_agg_txd != NULL) { 5959 KASSERT(m_head == NULL, 5960 ("pending mbuf for aggregating txdesc")); 5961 error = hn_flush_txagg(ifp, txr); 5962 if (__predict_false(error)) { 5963 txr->hn_oactive = 1; 5964 break; 5965 } 5966 } else { 5967 KASSERT(m_head != NULL, ("mbuf was freed")); 5968 error = hn_txpkt(ifp, txr, txd); 5969 if (__predict_false(error)) { 5970 /* txd is freed, but m_head is not */ 5971 drbr_putback(ifp, txr->hn_mbuf_br, 5972 m_head); 5973 txr->hn_oactive = 1; 5974 break; 5975 } 5976 } 5977 } 5978 #ifdef INVARIANTS 5979 else { 5980 KASSERT(txr->hn_agg_txd != NULL, 5981 ("no aggregating txdesc")); 5982 KASSERT(m_head == NULL, 5983 ("pending mbuf for aggregating txdesc")); 5984 } 5985 #endif 5986 5987 /* Sent */ 5988 drbr_advance(ifp, txr->hn_mbuf_br); 5989 } 5990 5991 /* Flush pending aggerated transmission. */ 5992 if (txr->hn_agg_txd != NULL) 5993 hn_flush_txagg(ifp, txr); 5994 return (sched); 5995 } 5996 5997 static int 5998 hn_transmit(if_t ifp, struct mbuf *m) 5999 { 6000 struct hn_softc *sc = if_getsoftc(ifp); 6001 struct hn_tx_ring *txr; 6002 int error, idx = 0; 6003 6004 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6005 struct rm_priotracker pt; 6006 6007 rm_rlock(&sc->hn_vf_lock, &pt); 6008 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6009 struct mbuf *m_bpf = NULL; 6010 int obytes, omcast; 6011 6012 obytes = m->m_pkthdr.len; 6013 omcast = (m->m_flags & M_MCAST) != 0; 6014 6015 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6016 if (bpf_peers_present(if_getbpf(ifp))) { 6017 m_bpf = m_copypacket(m, M_NOWAIT); 6018 if (m_bpf == NULL) { 6019 /* 6020 * Failed to grab a shallow 6021 * copy; tap now. 
6022 */ 6023 ETHER_BPF_MTAP(ifp, m); 6024 } 6025 } 6026 } else { 6027 ETHER_BPF_MTAP(ifp, m); 6028 } 6029 6030 error = if_transmit(sc->hn_vf_ifp, m); 6031 rm_runlock(&sc->hn_vf_lock, &pt); 6032 6033 if (m_bpf != NULL) { 6034 if (!error) 6035 ETHER_BPF_MTAP(ifp, m_bpf); 6036 m_freem(m_bpf); 6037 } 6038 6039 if (error == ENOBUFS) { 6040 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6041 } else if (error) { 6042 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6043 } else { 6044 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6045 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6046 if (omcast) { 6047 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6048 omcast); 6049 } 6050 } 6051 return (error); 6052 } 6053 rm_runlock(&sc->hn_vf_lock, &pt); 6054 } 6055 6056 #if defined(INET6) || defined(INET) 6057 /* 6058 * Perform TSO packet header fixup or get l2/l3 header length now, 6059 * since packet headers should be cache-hot. 6060 */ 6061 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6062 m = hn_tso_fixup(m); 6063 if (__predict_false(m == NULL)) { 6064 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6065 return EIO; 6066 } 6067 } else if (m->m_pkthdr.csum_flags & 6068 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6069 m = hn_set_hlen(m); 6070 if (__predict_false(m == NULL)) { 6071 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6072 return EIO; 6073 } 6074 } 6075 #endif 6076 6077 /* 6078 * Select the TX ring based on flowid 6079 */ 6080 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6081 #ifdef RSS 6082 uint32_t bid; 6083 6084 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6085 &bid) == 0) 6086 idx = bid % sc->hn_tx_ring_inuse; 6087 else 6088 #endif 6089 { 6090 #if defined(INET6) || defined(INET) 6091 int tcpsyn = 0; 6092 6093 if (m->m_pkthdr.len < 128 && 6094 (m->m_pkthdr.csum_flags & 6095 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6096 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6097 m = hn_check_tcpsyn(m, &tcpsyn); 6098 if (__predict_false(m == NULL)) { 6099 if_inc_counter(ifp, 6100 IFCOUNTER_OERRORS, 1); 6101 return (EIO); 6102 } 6103 } 6104 #else 6105 const int tcpsyn = 0; 6106 #endif 6107 if (tcpsyn) 6108 idx = 0; 6109 else 6110 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6111 } 6112 } 6113 txr = &sc->hn_tx_ring[idx]; 6114 6115 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6116 if (error) { 6117 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6118 return error; 6119 } 6120 6121 if (txr->hn_oactive) 6122 return 0; 6123 6124 if (txr->hn_sched_tx) 6125 goto do_sched; 6126 6127 if (mtx_trylock(&txr->hn_tx_lock)) { 6128 int sched; 6129 6130 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6131 mtx_unlock(&txr->hn_tx_lock); 6132 if (!sched) 6133 return 0; 6134 } 6135 do_sched: 6136 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6137 return 0; 6138 } 6139 6140 static void 6141 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6142 { 6143 struct mbuf *m; 6144 6145 mtx_lock(&txr->hn_tx_lock); 6146 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6147 m_freem(m); 6148 mtx_unlock(&txr->hn_tx_lock); 6149 } 6150 6151 static void 6152 hn_xmit_qflush(if_t ifp) 6153 { 6154 struct hn_softc *sc = if_getsoftc(ifp); 6155 struct rm_priotracker pt; 6156 int i; 6157 6158 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6159 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6160 if_qflush(ifp); 6161 6162 rm_rlock(&sc->hn_vf_lock, &pt); 6163 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6164 if_qflush(sc->hn_vf_ifp); 6165 rm_runlock(&sc->hn_vf_lock, &pt); 6166 } 6167 6168 static void 6169 hn_xmit_txeof(struct hn_tx_ring *txr) 6170 { 6171 6172 if 
static void
hn_xmit_txeof(struct hn_tx_ring *txr)
{

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		txr->hn_oactive = 0;
		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (sched) {
			taskqueue_enqueue(txr->hn_tx_taskq,
			    &txr->hn_tx_task);
		}
	} else {
do_sched:
		/*
		 * Release oactive earlier, in the hope that others
		 * can catch up.  The task will clear oactive again
		 * with the hn_tx_lock held to avoid possible races.
		 */
		txr->hn_oactive = 0;
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_xmit_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	txr->hn_oactive = 0;
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static int
hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct vmbus_chan_br cbr;
	struct hn_rx_ring *rxr;
	struct hn_tx_ring *txr = NULL;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Link this channel to RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
	rxr->hn_chan = chan;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel.
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}

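/*
 * Counterpart of hn_chan_attach(): unlink the channel from its RX/TX ring
 * and close it.  Closing does not destroy the channel itself.
 */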
static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from the RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}

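/*
 * Ask the host for sub-channels.  The request is bounded by the number of
 * RX rings offered through the RNDIS RSS capabilities query; on return
 * *nsubch holds the number of sub-channels actually allocated (0 means
 * only the primary channel will be used).
 */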
static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
	 */
	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
	if (error) {
		/* No RSS; this is benign. */
		*nsubch = 0;
		return (0);
	}
	if (bootverbose) {
		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
		    rxr_cnt, nchan);
	}

	if (nchan > rxr_cnt)
		nchan = rxr_cnt;
	if (nchan == 1) {
		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
		*nsubch = 0;
		return (0);
	}

	/*
	 * Allocate sub-channels from NVS.
	 */
	*nsubch = nchan - 1;
	error = hn_nvs_alloc_subchans(sc, nsubch);
	if (error || *nsubch == 0) {
		/* Failed to allocate sub-channels. */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Wait for all sub-channels to become ready before moving on.
	 */
	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
	vmbus_subchan_rel(subchans, *nsubch);
	return (0);
}

static bool
hn_synth_attachable(const struct hn_softc *sc)
{
	int i;

	if (sc->hn_flags & HN_FLAG_ERRORS)
		return (false);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
			return (false);
	}
	return (true);
}

/*
 * Make sure that the RX filter is zero after the successful
 * RNDIS initialization.
 *
 * NOTE:
 * Under certain conditions on certain versions of Hyper-V,
 * the RNDIS rxfilter is _not_ zero on the hypervisor side
 * after the successful RNDIS initialization, which breaks
 * the assumption of any following code (well, it breaks the
 * RNDIS API contract actually).  Clear the RNDIS rxfilter
 * explicitly, drain packets sneaking through, and drain the
 * interrupt taskqueues scheduled due to the stealth packets.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}

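/*
 * Attach the synthetic parts: the primary channel, NVS, RNDIS, any
 * sub-channels, and the RSS key/indirect table.  On failure, everything
 * that was attached is detached again and the old capabilities are
 * restored.
 */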
static int
hn_synth_attach(struct hn_softc *sc, int mtu)
{
#define ATTACHED_NVS		0x0002
#define ATTACHED_RNDIS		0x0004

	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int error, nsubch, nchan = 1, i, rndis_inited;
	uint32_t old_caps, attached = 0;

	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
	    ("synthetic parts were attached"));

	if (!hn_synth_attachable(sc))
		return (ENXIO);

	/* Save capabilities for later verification. */
	old_caps = sc->hn_caps;
	sc->hn_caps = 0;

	/* Clear RSS stuff. */
	sc->hn_rss_ind_size = 0;
	sc->hn_rss_hash = 0;
	sc->hn_rss_hcap = 0;

	/*
	 * Attach the primary channel _before_ attaching NVS and RNDIS.
	 */
	error = hn_chan_attach(sc, sc->hn_prichan);
	if (error)
		goto failed;

	/*
	 * Attach NVS.
	 */
	error = hn_nvs_attach(sc, mtu);
	if (error)
		goto failed;
	attached |= ATTACHED_NVS;

	/*
	 * Attach RNDIS _after_ NVS is attached.
	 */
	error = hn_rndis_attach(sc, mtu, &rndis_inited);
	if (rndis_inited)
		attached |= ATTACHED_RNDIS;
	if (error)
		goto failed;

	/*
	 * Make sure capabilities are not changed.
	 */
	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
		    old_caps, sc->hn_caps);
		error = ENXIO;
		goto failed;
	}

	/*
	 * Allocate sub-channels for multi-TX/RX rings.
	 *
	 * NOTE:
	 * The # of RX rings that can be used is equivalent to the # of
	 * channels to be requested.
	 */
	nsubch = sc->hn_rx_ring_cnt - 1;
	error = hn_synth_alloc_subchans(sc, &nsubch);
	if (error)
		goto failed;
	/* NOTE: _Full_ synthetic parts detach is required now. */
	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;

	/*
	 * Set the # of TX/RX rings that could be used according to
	 * the # of channels that NVS offered.
	 */
	nchan = nsubch + 1;
	hn_set_ring_inuse(sc, nchan);
	if (nchan == 1) {
		/* Only the primary channel can be used; done */
		goto back;
	}

	/*
	 * Attach the sub-channels.
	 *
	 * NOTE: hn_set_ring_inuse() _must_ have been called.
	 */
	error = hn_attach_subchans(sc);
	if (error)
		goto failed;

	/*
	 * Configure RSS key and indirect table _after_ all sub-channels
	 * are attached.
	 */
	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
		/*
		 * RSS key is not set yet; set it to the default RSS key.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "setup default RSS key\n");
#ifdef RSS
		rss_getkey(rss->rss_key);
#else
		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
#endif
		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	}

	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
		/*
		 * RSS indirect table is not set yet; set it up in round-
		 * robin fashion.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
			uint32_t subidx;

#ifdef RSS
			subidx = rss_get_indirection_to_bucket(i);
#else
			subidx = i;
#endif
			rss->rss_ind[i] = subidx % nchan;
		}
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * # of usable channels may be changed, so we have to
		 * make sure that all entries in RSS indirect table
		 * are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	sc->hn_rss_hash = sc->hn_rss_hcap;
	if ((sc->hn_flags & HN_FLAG_RXVF) ||
	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
		/* NOTE: Don't reconfigure RSS; will do immediately. */
		hn_vf_rss_fixup(sc, false);
	}
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fixup transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	hn_rndis_init_fixat(sc, nchan);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_rndis_init_fixat(sc, nchan);
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS) {
			hn_rndis_init_fixat(sc, nchan);
			hn_rndis_detach(sc);
		}
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}

/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(), before
 * this function gets called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
		/*
		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
		 */
		int error;

		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
		    sc->hn_rxbuf_gpadl);
		if (error) {
			if_printf(sc->hn_ifp,
			    "rxbuf gpadl disconn failed: %d\n", error);
			sc->hn_flags |= HN_FLAG_RXBUF_REF;
		}
		sc->hn_rxbuf_gpadl = 0;
	}

	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
		/*
		 * Host is post-Win2016, disconnect chimney sending buffer from
		 * primary channel here.
		 */
		int error;

		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
		    sc->hn_chim_gpadl);
		if (error) {
			if_printf(sc->hn_ifp,
			    "chim gpadl disconn failed: %d\n", error);
			sc->hn_flags |= HN_FLAG_CHIM_REF;
		}
		sc->hn_chim_gpadl = 0;
	}
	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

#ifdef RSS
	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
		    rss_getnumbuckets());
	}
#endif

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

static void
hn_disable_rx(struct hn_softc *sc)
{

	/*
	 * Disable RX by clearing RX filter forcefully.
	 */
	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);
}

/*
 * NOTE:
 * RX/TX _must_ have been suspended/disabled, before this function
 * is called.
 */
static void
hn_drain_rxtx(struct hn_softc *sc, int nchan)
{
	struct vmbus_channel **subch = NULL;
	int nsubch;

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = nchan - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		int i;

		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);
}

static void
hn_suspend_data(struct hn_softc *sc)
{
	struct hn_tx_ring *txr;
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-done, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX.
	 */
	hn_disable_rx(sc);

	/*
	 * Drain RX/TX.
	 */
	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}

static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device is receiving packets, so the data path of the
	 * synthetic device must be suspended.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

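/*
 * Counterpart of hn_suspend_data(): reprogram the RX filter, clear the
 * suspend flag on all TX rings, flush drbrs of rings that fell out of
 * use, and kick the txeof tasks so any stale oactive state is cleared.
 */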
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which is more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device has to receive packets, so the data path of the
	 * synthetic device must be resumed.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_resume_data(sc);

	/*
	 * Don't resume link status change if VF is attached/activated.
	 * - In the non-transparent VF mode, the synthetic device marks
	 *   link down until the VF is deactivated; i.e. VF is down.
	 * - In transparent VF mode, VF's media status is used until
	 *   the VF is detached.
	 */
	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
		hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
		hn_polling(sc, sc->hn_pollhz);
}

static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
	case RNDIS_STATUS_LINK_SPEED_CHANGE:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}

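/*
 * Walk the RNDIS per-packet-info list and record pointers to the fields
 * of interest (VLAN, RX checksum, hash value/info, packet-info id) in
 * *info.  Returns EINVAL if the per-packet-info list is malformed.
 */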
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		if (pi->rm_internal == 1) {
			switch (pi->rm_type) {
			case NDIS_PKTINFO_IT_PKTINFO_ID:
				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
					return (EINVAL);
				info->pktinfo_id =
				    (const struct packet_info_id *)data;
				mask |= HN_RXINFO_PKTINFO_ID;
				break;

			default:
				goto next;
			}
		} else {
			switch (pi->rm_type) {
			case NDIS_PKTINFO_TYPE_VLAN:
				if (__predict_false(dlen
				    < NDIS_VLAN_INFO_SIZE))
					return (EINVAL);
				info->vlan_info = (const uint32_t *)data;
				mask |= HN_RXINFO_VLAN;
				break;

			case NDIS_PKTINFO_TYPE_CSUM:
				if (__predict_false(dlen
				    < NDIS_RXCSUM_INFO_SIZE))
					return (EINVAL);
				info->csum_info = (const uint32_t *)data;
				mask |= HN_RXINFO_CSUM;
				break;

			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
				if (__predict_false(dlen
				    < HN_NDIS_HASH_VALUE_SIZE))
					return (EINVAL);
				info->hash_value = (const uint32_t *)data;
				mask |= HN_RXINFO_HASHVAL;
				break;

			case HN_NDIS_PKTINFO_TYPE_HASHINF:
				if (__predict_false(dlen
				    < HN_NDIS_HASH_INFO_SIZE))
					return (EINVAL);
				info->hash_info = (const uint32_t *)data;
				mask |= HN_RXINFO_HASHINF;
				break;

			default:
				goto next;
			}
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = NULL;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

static __inline void
hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
    uint32_t len, struct hn_rxinfo *info)
{
	uint32_t cnt = rxr->rsc.cnt;

	if (cnt) {
		rxr->rsc.pktlen += len;
	} else {
		rxr->rsc.vlan_info = info->vlan_info;
		rxr->rsc.csum_info = info->csum_info;
		rxr->rsc.hash_info = info->hash_info;
		rxr->rsc.hash_value = info->hash_value;
		rxr->rsc.pktlen = len;
	}

	rxr->rsc.frag_data[cnt] = data;
	rxr->rsc.frag_len[cnt] = len;
	rxr->rsc.cnt++;
}

static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;
	bool rsc_more = false;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = NULL;
	info.csum_info = NULL;
	info.hash_info = NULL;
	info.pktinfo_id = NULL;

	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
7354 */ 7355 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7356 pktinfo_len, &info); 7357 if (__predict_false(error)) { 7358 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7359 "pktinfo\n"); 7360 return; 7361 } 7362 } 7363 7364 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7365 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7366 "data overflow, msglen %u, data abs %d len %d\n", 7367 pkt->rm_len, data_off, data_len); 7368 return; 7369 } 7370 7371 /* Identify RSC fragments, drop invalid packets */ 7372 if ((info.pktinfo_id != NULL) && 7373 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7374 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7375 rxr->rsc.cnt = 0; 7376 rxr->hn_rsc_pkts++; 7377 } else if (rxr->rsc.cnt == 0) 7378 goto drop; 7379 7380 rsc_more = true; 7381 7382 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7383 rsc_more = false; 7384 7385 if (rsc_more && rxr->rsc.is_last) 7386 goto drop; 7387 } else { 7388 rxr->rsc.cnt = 0; 7389 } 7390 7391 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7392 goto drop; 7393 7394 /* Store data in per rx ring structure */ 7395 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7396 data_len, &info); 7397 7398 if (rsc_more) 7399 return; 7400 7401 hn_rxpkt(rxr); 7402 rxr->rsc.cnt = 0; 7403 return; 7404 drop: 7405 rxr->hn_rsc_drop++; 7406 return; 7407 } 7408 7409 static __inline void 7410 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7411 { 7412 const struct rndis_msghdr *hdr; 7413 7414 if (__predict_false(dlen < sizeof(*hdr))) { 7415 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7416 return; 7417 } 7418 hdr = data; 7419 7420 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7421 /* Hot data path. */ 7422 hn_rndis_rx_data(rxr, data, dlen); 7423 /* Done! */ 7424 return; 7425 } 7426 7427 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7428 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); 7429 else 7430 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); 7431 } 7432 7433 static void 7434 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7435 { 7436 const struct hn_nvs_hdr *hdr; 7437 7438 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7439 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7440 return; 7441 } 7442 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7443 7444 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7445 /* Useless; ignore */ 7446 return; 7447 } 7448 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7449 } 7450 7451 static void 7452 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7453 const struct vmbus_chanpkt_hdr *pkt) 7454 { 7455 struct hn_nvs_sendctx *sndc; 7456 7457 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7458 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7459 VMBUS_CHANPKT_DATALEN(pkt)); 7460 /* 7461 * NOTE: 7462 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7463 * its callback. 
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	struct epoch_tracker et;
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	NET_EPOCH_ENTER(et);
	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}

		rxr->rsc.is_last = (i == (count - 1));
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}
	NET_EPOCH_EXIT(et);

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}

static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}

static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}

static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routing "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);

static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}

	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);

	counter_u64_free(hn_udpcs_fixup);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);