1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 #include "opt_hn.h" 57 #include "opt_inet6.h" 58 #include "opt_inet.h" 59 #include "opt_rss.h" 60 61 #include <sys/param.h> 62 #include <sys/systm.h> 63 #include <sys/bus.h> 64 #include <sys/counter.h> 65 #include <sys/kernel.h> 66 #include <sys/limits.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/module.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/proc.h> 73 #include <sys/rmlock.h> 74 #include <sys/sbuf.h> 75 #include <sys/sched.h> 76 #include <sys/smp.h> 77 #include <sys/socket.h> 78 #include <sys/sockio.h> 79 #include <sys/sx.h> 80 #include <sys/sysctl.h> 81 #include <sys/taskqueue.h> 82 #include <sys/buf_ring.h> 83 #include <sys/eventhandler.h> 84 #include <sys/epoch.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_extern.h> 88 #include <vm/pmap.h> 89 90 #include <machine/atomic.h> 91 #include <machine/in_cksum.h> 92 93 #include <net/bpf.h> 94 #include <net/ethernet.h> 95 #include <net/if.h> 96 #include <net/if_dl.h> 97 #include <net/if_media.h> 98 #include <net/if_types.h> 99 #include <net/if_var.h> 100 #include <net/rndis.h> 101 #ifdef RSS 102 #include <net/rss_config.h> 103 #endif 104 105 #include <netinet/in_systm.h> 106 #include <netinet/in.h> 107 #include <netinet/ip.h> 108 #include <netinet/ip6.h> 109 #include <netinet/tcp.h> 110 #include <netinet/tcp_lro.h> 111 #include <netinet/udp.h> 112 113 #include <dev/hyperv/include/hyperv.h> 114 #include <dev/hyperv/include/hyperv_busdma.h> 115 #include <dev/hyperv/include/vmbus.h> 116 #include <dev/hyperv/include/vmbus_xact.h> 117 118 #include <dev/hyperv/netvsc/ndis.h> 119 #include <dev/hyperv/netvsc/if_hnreg.h> 120 #include <dev/hyperv/netvsc/if_hnvar.h> 121 #include <dev/hyperv/netvsc/hn_nvs.h> 122 #include <dev/hyperv/netvsc/hn_rndis.h> 123 124 #include "vmbus_if.h" 125 126 #define HN_IFSTART_SUPPORT 127 128 #define HN_RING_CNT_DEF_MAX 8 129 130 #define HN_VFMAP_SIZE_DEF 8 131 132 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 133 134 /* YYY should get it from the underlying channel */ 135 #define HN_TX_DESC_CNT 512 136 137 #define HN_RNDIS_PKT_LEN \ 138 (sizeof(struct rndis_packet_msg) + \ 139 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 141 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 142 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 143 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 144 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 145 146 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 147 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 148 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 149 /* -1 for RNDIS packet message */ 150 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 151 152 #define HN_DIRECT_TX_SIZE_DEF 128 153 154 #define HN_EARLY_TXEOF_THRESH 8 155 156 #define HN_PKTBUF_LEN_DEF (16 * 1024) 157 158 #define HN_LROENT_CNT_DEF 128 159 160 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 161 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 162 /* YYY 2*MTU is a bit rough, but should be good enough. 
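 * For example, with the standard 1500 byte Ethernet MTU the floor below
 * works out to 2 * 1500 = 3000 bytes, while the defaults above are
 * 25 * 1500 = 37500 bytes (HN_LRO_LENLIM_DEF) and 12 * 1500 = 18000 bytes
 * (HN_LRO_LENLIM_MULTIRX_DEF).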
*/ 163 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) 164 165 #define HN_LRO_ACKCNT_DEF 1 166 167 #define HN_LOCK_INIT(sc) \ 168 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 169 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 170 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 171 #define HN_LOCK(sc) \ 172 do { \ 173 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 174 /* Relinquish cpu to avoid deadlock */ \ 175 sched_relinquish(curthread); \ 176 DELAY(1000); \ 177 } \ 178 } while (0) 179 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 180 181 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 182 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 183 #define HN_CSUM_IP_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 185 #define HN_CSUM_IP6_HWASSIST(sc) \ 186 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 187 188 #define HN_PKTSIZE_MIN(align) \ 189 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 190 HN_RNDIS_PKT_LEN, (align)) 191 #define HN_PKTSIZE(m, align) \ 192 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 193 194 #ifdef RSS 195 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 196 #else 197 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 198 #endif 199 200 struct hn_txdesc { 201 #ifndef HN_USE_TXDESC_BUFRING 202 SLIST_ENTRY(hn_txdesc) link; 203 #endif 204 STAILQ_ENTRY(hn_txdesc) agg_link; 205 206 /* Aggregated txdescs, in sending order. */ 207 STAILQ_HEAD(, hn_txdesc) agg_list; 208 209 /* The oldest packet, if transmission aggregation happens. */ 210 struct mbuf *m; 211 struct hn_tx_ring *txr; 212 int refs; 213 uint32_t flags; /* HN_TXD_FLAG_ */ 214 struct hn_nvs_sendctx send_ctx; 215 uint32_t chim_index; 216 int chim_size; 217 218 bus_dmamap_t data_dmap; 219 220 bus_addr_t rndis_pkt_paddr; 221 struct rndis_packet_msg *rndis_pkt; 222 bus_dmamap_t rndis_pkt_dmap; 223 }; 224 225 #define HN_TXD_FLAG_ONLIST 0x0001 226 #define HN_TXD_FLAG_DMAMAP 0x0002 227 #define HN_TXD_FLAG_ONAGG 0x0004 228 229 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 230 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 231 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 232 233 struct packet_info_id { 234 uint8_t ver; 235 uint8_t flag; 236 uint16_t pkt_id; 237 }; 238 239 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 240 241 242 struct hn_rxinfo { 243 const uint32_t *vlan_info; 244 const uint32_t *csum_info; 245 const uint32_t *hash_info; 246 const uint32_t *hash_value; 247 const struct packet_info_id *pktinfo_id; 248 }; 249 250 struct hn_rxvf_setarg { 251 struct hn_rx_ring *rxr; 252 if_t vf_ifp; 253 }; 254 255 #define HN_RXINFO_VLAN 0x0001 256 #define HN_RXINFO_CSUM 0x0002 257 #define HN_RXINFO_HASHINF 0x0004 258 #define HN_RXINFO_HASHVAL 0x0008 259 #define HN_RXINFO_PKTINFO_ID 0x0010 260 #define HN_RXINFO_ALL \ 261 (HN_RXINFO_VLAN | \ 262 HN_RXINFO_CSUM | \ 263 HN_RXINFO_HASHINF | \ 264 HN_RXINFO_HASHVAL | \ 265 HN_RXINFO_PKTINFO_ID) 266 267 static int hn_probe(device_t); 268 static int hn_attach(device_t); 269 static int hn_detach(device_t); 270 static int hn_shutdown(device_t); 271 static void hn_chan_callback(struct vmbus_channel *, 272 void *); 273 274 static void hn_init(void *); 275 static int hn_ioctl(if_t, u_long, caddr_t); 276 #ifdef HN_IFSTART_SUPPORT 277 static void hn_start(if_t); 278 #endif 279 static int hn_transmit(if_t, struct mbuf *); 280 static void hn_xmit_qflush(if_t); 281 static int hn_ifmedia_upd(if_t); 282 static void hn_ifmedia_sts(if_t, 283 struct 
ifmediareq *); 284 285 static void hn_ifnet_event(void *, if_t, int); 286 static void hn_ifaddr_event(void *, if_t); 287 static void hn_ifnet_attevent(void *, if_t); 288 static void hn_ifnet_detevent(void *, if_t); 289 static void hn_ifnet_lnkevent(void *, if_t, int); 290 291 static bool hn_ismyvf(const struct hn_softc *, 292 const if_t); 293 static void hn_rxvf_change(struct hn_softc *, 294 if_t, bool); 295 static void hn_rxvf_set(struct hn_softc *, if_t); 296 static void hn_rxvf_set_task(void *, int); 297 static void hn_xpnt_vf_input(if_t, struct mbuf *); 298 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 299 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 300 struct ifreq *); 301 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 302 static bool hn_xpnt_vf_isready(struct hn_softc *); 303 static void hn_xpnt_vf_setready(struct hn_softc *); 304 static void hn_xpnt_vf_init_taskfunc(void *, int); 305 static void hn_xpnt_vf_init(struct hn_softc *); 306 static void hn_xpnt_vf_setenable(struct hn_softc *); 307 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 308 static void hn_vf_rss_fixup(struct hn_softc *, bool); 309 static void hn_vf_rss_restore(struct hn_softc *); 310 311 static int hn_rndis_rxinfo(const void *, int, 312 struct hn_rxinfo *); 313 static void hn_rndis_rx_data(struct hn_rx_ring *, 314 const void *, int); 315 static void hn_rndis_rx_status(struct hn_softc *, 316 const void *, int); 317 static void hn_rndis_init_fixat(struct hn_softc *, int); 318 319 static void hn_nvs_handle_notify(struct hn_softc *, 320 const struct vmbus_chanpkt_hdr *); 321 static void hn_nvs_handle_comp(struct hn_softc *, 322 struct vmbus_channel *, 323 const struct vmbus_chanpkt_hdr *); 324 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 325 struct vmbus_channel *, 326 const struct vmbus_chanpkt_hdr *); 327 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 328 struct vmbus_channel *, uint64_t); 329 330 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 331 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 342 #ifndef RSS 343 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 345 #endif 346 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int 
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 360 361 static void hn_stop(struct hn_softc *, bool); 362 static void hn_init_locked(struct hn_softc *); 363 static int hn_chan_attach(struct hn_softc *, 364 struct vmbus_channel *); 365 static void hn_chan_detach(struct hn_softc *, 366 struct vmbus_channel *); 367 static int hn_attach_subchans(struct hn_softc *); 368 static void hn_detach_allchans(struct hn_softc *); 369 static void hn_chan_rollup(struct hn_rx_ring *, 370 struct hn_tx_ring *); 371 static void hn_set_ring_inuse(struct hn_softc *, int); 372 static int hn_synth_attach(struct hn_softc *, int); 373 static void hn_synth_detach(struct hn_softc *); 374 static int hn_synth_alloc_subchans(struct hn_softc *, 375 int *); 376 static bool hn_synth_attachable(const struct hn_softc *); 377 static void hn_suspend(struct hn_softc *); 378 static void hn_suspend_data(struct hn_softc *); 379 static void hn_suspend_mgmt(struct hn_softc *); 380 static void hn_resume(struct hn_softc *); 381 static void hn_resume_data(struct hn_softc *); 382 static void hn_resume_mgmt(struct hn_softc *); 383 static void hn_suspend_mgmt_taskfunc(void *, int); 384 static void hn_chan_drain(struct hn_softc *, 385 struct vmbus_channel *); 386 static void hn_disable_rx(struct hn_softc *); 387 static void hn_drain_rxtx(struct hn_softc *, int); 388 static void hn_polling(struct hn_softc *, u_int); 389 static void hn_chan_polling(struct vmbus_channel *, u_int); 390 static void hn_mtu_change_fixup(struct hn_softc *); 391 392 static void hn_update_link_status(struct hn_softc *); 393 static void hn_change_network(struct hn_softc *); 394 static void hn_link_taskfunc(void *, int); 395 static void hn_netchg_init_taskfunc(void *, int); 396 static void hn_netchg_status_taskfunc(void *, int); 397 static void hn_link_status(struct hn_softc *); 398 399 static int hn_create_rx_data(struct hn_softc *, int); 400 static void hn_destroy_rx_data(struct hn_softc *); 401 static int hn_check_iplen(const struct mbuf *, int); 402 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 403 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 404 static int hn_rxfilter_config(struct hn_softc *); 405 static int hn_rss_reconfig(struct hn_softc *); 406 static void hn_rss_ind_fixup(struct hn_softc *); 407 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 408 static int hn_rxpkt(struct hn_rx_ring *); 409 static uint32_t hn_rss_type_fromndis(uint32_t); 410 static uint32_t hn_rss_type_tondis(uint32_t); 411 412 static int hn_tx_ring_create(struct hn_softc *, int); 413 static void hn_tx_ring_destroy(struct hn_tx_ring *); 414 static int hn_create_tx_data(struct hn_softc *, int); 415 static void hn_fixup_tx_data(struct hn_softc *); 416 static void hn_fixup_rx_data(struct hn_softc *); 417 static void hn_destroy_tx_data(struct hn_softc *); 418 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 419 static void hn_txdesc_gc(struct hn_tx_ring *, 420 struct hn_txdesc *); 421 static int hn_encap(if_t, struct hn_tx_ring *, 422 struct hn_txdesc *, struct mbuf **); 423 static int hn_txpkt(if_t, struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static void hn_set_chim_size(struct hn_softc *, int); 426 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 427 static bool hn_tx_ring_pending(struct hn_tx_ring *); 428 static void hn_tx_ring_qflush(struct hn_tx_ring *); 429 static void hn_resume_tx(struct hn_softc *, int); 430 static void hn_set_txagg(struct hn_softc *); 431 static void *hn_try_txagg(if_t, 432 struct hn_tx_ring *, struct 
hn_txdesc *, 433 int); 434 static int hn_get_txswq_depth(const struct hn_tx_ring *); 435 static void hn_txpkt_done(struct hn_nvs_sendctx *, 436 struct hn_softc *, struct vmbus_channel *, 437 const void *, int); 438 static int hn_txpkt_sglist(struct hn_tx_ring *, 439 struct hn_txdesc *); 440 static int hn_txpkt_chim(struct hn_tx_ring *, 441 struct hn_txdesc *); 442 static int hn_xmit(struct hn_tx_ring *, int); 443 static void hn_xmit_taskfunc(void *, int); 444 static void hn_xmit_txeof(struct hn_tx_ring *); 445 static void hn_xmit_txeof_taskfunc(void *, int); 446 #ifdef HN_IFSTART_SUPPORT 447 static int hn_start_locked(struct hn_tx_ring *, int); 448 static void hn_start_taskfunc(void *, int); 449 static void hn_start_txeof(struct hn_tx_ring *); 450 static void hn_start_txeof_taskfunc(void *, int); 451 #endif 452 453 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 454 455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 456 "Hyper-V network interface"); 457 458 /* Trust tcp segment verification on host side. */ 459 static int hn_trust_hosttcp = 1; 460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 461 &hn_trust_hosttcp, 0, 462 "Trust tcp segment verification on host side, " 463 "when csum info is missing (global setting)"); 464 465 /* Trust udp datagrams verification on host side. */ 466 static int hn_trust_hostudp = 1; 467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 468 &hn_trust_hostudp, 0, 469 "Trust udp datagram verification on host side, " 470 "when csum info is missing (global setting)"); 471 472 /* Trust ip packets verification on host side. */ 473 static int hn_trust_hostip = 1; 474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 475 &hn_trust_hostip, 0, 476 "Trust ip packet verification on host side, " 477 "when csum info is missing (global setting)"); 478 479 /* 480 * Offload UDP/IPv4 checksum. 481 */ 482 static int hn_enable_udp4cs = 1; 483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 484 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 485 486 /* 487 * Offload UDP/IPv6 checksum. 488 */ 489 static int hn_enable_udp6cs = 1; 490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 491 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 492 493 /* Stats. */ 494 static counter_u64_t hn_udpcs_fixup; 495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 496 &hn_udpcs_fixup, "# of UDP checksum fixup"); 497 498 /* 499 * See hn_set_hlen(). 500 * 501 * This value is for Azure. For Hyper-V, set this above 502 * 65536 to disable UDP datagram checksum fixup. 
503 */ 504 static int hn_udpcs_fixup_mtu = 1420; 505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 506 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 507 508 /* Limit TSO burst size */ 509 static int hn_tso_maxlen = IP_MAXPACKET; 510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 511 &hn_tso_maxlen, 0, "TSO burst limit"); 512 513 /* Limit chimney send size */ 514 static int hn_tx_chimney_size = 0; 515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 516 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 517 518 /* Limit the size of packet for direct transmission */ 519 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 521 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 522 523 /* # of LRO entries per RX ring */ 524 #if defined(INET) || defined(INET6) 525 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 527 &hn_lro_entry_count, 0, "LRO entry count"); 528 #endif 529 530 static int hn_tx_taskq_cnt = 1; 531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 532 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 533 534 #define HN_TX_TASKQ_M_INDEP 0 535 #define HN_TX_TASKQ_M_GLOBAL 1 536 #define HN_TX_TASKQ_M_EVTTQ 2 537 538 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 540 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 541 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 542 543 #ifndef HN_USE_TXDESC_BUFRING 544 static int hn_use_txdesc_bufring = 0; 545 #else 546 static int hn_use_txdesc_bufring = 1; 547 #endif 548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 549 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 550 551 #ifdef HN_IFSTART_SUPPORT 552 /* Use ifnet.if_start instead of ifnet.if_transmit */ 553 static int hn_use_if_start = 0; 554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 555 &hn_use_if_start, 0, "Use if_start TX method"); 556 #endif 557 558 /* # of channels to use */ 559 static int hn_chan_cnt = 0; 560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 561 &hn_chan_cnt, 0, 562 "# of channels to use; each channel has one RX ring and one TX ring"); 563 564 /* # of transmit rings to use */ 565 static int hn_tx_ring_cnt = 0; 566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 567 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 568 569 /* Software TX ring deptch */ 570 static int hn_tx_swq_depth = 0; 571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 572 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 573 574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 575 static u_int hn_lro_mbufq_depth = 0; 576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 577 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 578 579 /* Packet transmission aggregation size limit */ 580 static int hn_tx_agg_size = -1; 581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 582 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 583 584 /* Packet transmission aggregation count limit */ 585 static int hn_tx_agg_pkts = -1; 586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 587 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 588 589 /* VF list */ 590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, 591 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 592 hn_vflist_sysctl, 
"A", 593 "VF list"); 594 595 /* VF mapping */ 596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, 597 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 598 hn_vfmap_sysctl, "A", 599 "VF mapping"); 600 601 /* Transparent VF */ 602 static int hn_xpnt_vf = 1; 603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 604 &hn_xpnt_vf, 0, "Transparent VF mod"); 605 606 /* Accurate BPF support for Transparent VF */ 607 static int hn_xpnt_vf_accbpf = 0; 608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 609 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 610 611 /* Extra wait for transparent VF attach routing; unit seconds. */ 612 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 614 &hn_xpnt_vf_attwait, 0, 615 "Extra wait for transparent VF attach routing; unit: seconds"); 616 617 static u_int hn_cpu_index; /* next CPU for channel */ 618 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 619 620 static struct rmlock hn_vfmap_lock; 621 static int hn_vfmap_size; 622 static if_t *hn_vfmap; 623 624 #ifndef RSS 625 static const uint8_t 626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 627 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 628 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 629 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 630 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 631 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 632 }; 633 #endif /* !RSS */ 634 635 static const struct hyperv_guid hn_guid = { 636 .hv_guid = { 637 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 638 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 639 }; 640 641 static device_method_t hn_methods[] = { 642 /* Device interface */ 643 DEVMETHOD(device_probe, hn_probe), 644 DEVMETHOD(device_attach, hn_attach), 645 DEVMETHOD(device_detach, hn_detach), 646 DEVMETHOD(device_shutdown, hn_shutdown), 647 DEVMETHOD_END 648 }; 649 650 static driver_t hn_driver = { 651 "hn", 652 hn_methods, 653 sizeof(struct hn_softc) 654 }; 655 656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0); 657 MODULE_VERSION(hn, 1); 658 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 659 660 static void 661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 662 { 663 int i; 664 665 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 666 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 667 } 668 669 static int 670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 671 { 672 673 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 674 txd->chim_size == 0, ("invalid rndis sglist txd")); 675 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 676 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 677 } 678 679 static int 680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 681 { 682 struct hn_nvs_rndis rndis; 683 684 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 685 txd->chim_size > 0, ("invalid rndis chim txd")); 686 687 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 688 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 689 rndis.nvs_chim_idx = txd->chim_index; 690 rndis.nvs_chim_sz = txd->chim_size; 691 692 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 693 &rndis, sizeof(rndis), &txd->send_ctx)); 694 } 695 696 static __inline uint32_t 697 hn_chim_alloc(struct hn_softc *sc) 698 { 699 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 700 u_long *bmap = sc->hn_chim_bmap; 701 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 702 703 for (i = 0; i < bmap_cnt; ++i) { 704 int idx; 705 706 idx = ffsl(~bmap[i]); 707 if (idx == 0) 708 continue; 
709 710 --idx; /* ffsl is 1-based */ 711 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 712 ("invalid i %d and idx %d", i, idx)); 713 714 if (atomic_testandset_long(&bmap[i], idx)) 715 continue; 716 717 ret = i * LONG_BIT + idx; 718 break; 719 } 720 return (ret); 721 } 722 723 static __inline void 724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 725 { 726 u_long mask; 727 uint32_t idx; 728 729 idx = chim_idx / LONG_BIT; 730 KASSERT(idx < sc->hn_chim_bmap_cnt, 731 ("invalid chimney index 0x%x", chim_idx)); 732 733 mask = 1UL << (chim_idx % LONG_BIT); 734 KASSERT(sc->hn_chim_bmap[idx] & mask, 735 ("index bitmap 0x%lx, chimney index %u, " 736 "bitmap idx %d, bitmask 0x%lx", 737 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 738 739 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 740 } 741 742 #if defined(INET6) || defined(INET) 743 744 #define PULLUP_HDR(m, len) \ 745 do { \ 746 if (__predict_false((m)->m_len < (len))) { \ 747 (m) = m_pullup((m), (len)); \ 748 if ((m) == NULL) \ 749 return (NULL); \ 750 } \ 751 } while (0) 752 753 /* 754 * NOTE: If this function failed, the m_head would be freed. 755 */ 756 static __inline struct mbuf * 757 hn_tso_fixup(struct mbuf *m_head) 758 { 759 struct ether_vlan_header *evl; 760 struct tcphdr *th; 761 int ehlen; 762 763 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 764 765 PULLUP_HDR(m_head, sizeof(*evl)); 766 evl = mtod(m_head, struct ether_vlan_header *); 767 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 768 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 769 else 770 ehlen = ETHER_HDR_LEN; 771 m_head->m_pkthdr.l2hlen = ehlen; 772 773 #ifdef INET 774 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 775 struct ip *ip; 776 int iphlen; 777 778 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 779 ip = mtodo(m_head, ehlen); 780 iphlen = ip->ip_hl << 2; 781 m_head->m_pkthdr.l3hlen = iphlen; 782 783 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 784 th = mtodo(m_head, ehlen + iphlen); 785 786 ip->ip_len = 0; 787 ip->ip_sum = 0; 788 th->th_sum = in_pseudo(ip->ip_src.s_addr, 789 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 790 } 791 #endif 792 #if defined(INET6) && defined(INET) 793 else 794 #endif 795 #ifdef INET6 796 { 797 struct ip6_hdr *ip6; 798 799 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 800 ip6 = mtodo(m_head, ehlen); 801 if (ip6->ip6_nxt != IPPROTO_TCP) { 802 m_freem(m_head); 803 return (NULL); 804 } 805 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 806 807 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 808 th = mtodo(m_head, ehlen + sizeof(*ip6)); 809 810 ip6->ip6_plen = 0; 811 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 812 } 813 #endif 814 return (m_head); 815 } 816 817 /* 818 * NOTE: If this function failed, the m_head would be freed. 
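 *
 * A typical caller therefore replaces its mbuf pointer with the return
 * value and bails out on NULL, e.g. (the error handling shown here is
 * only illustrative):
 *
 *	m_head = hn_set_hlen(m_head);
 *	if (m_head == NULL)
 *		return (ENOBUFS);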
819 */ 820 static __inline struct mbuf * 821 hn_set_hlen(struct mbuf *m_head) 822 { 823 const struct ether_vlan_header *evl; 824 int ehlen; 825 826 PULLUP_HDR(m_head, sizeof(*evl)); 827 evl = mtod(m_head, const struct ether_vlan_header *); 828 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 829 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 830 else 831 ehlen = ETHER_HDR_LEN; 832 m_head->m_pkthdr.l2hlen = ehlen; 833 834 #ifdef INET 835 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 836 const struct ip *ip; 837 int iphlen; 838 839 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 840 ip = mtodo(m_head, ehlen); 841 iphlen = ip->ip_hl << 2; 842 m_head->m_pkthdr.l3hlen = iphlen; 843 844 /* 845 * UDP checksum offload does not work in Azure, if the 846 * following conditions are met: 847 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 848 * - IP_DF is not set in the IP hdr. 849 * 850 * Fall back to software checksum for these UDP datagrams. 851 */ 852 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 853 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 854 (ntohs(ip->ip_off) & IP_DF) == 0) { 855 uint16_t off = ehlen + iphlen; 856 857 counter_u64_add(hn_udpcs_fixup, 1); 858 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 859 *(uint16_t *)(m_head->m_data + off + 860 m_head->m_pkthdr.csum_data) = in_cksum_skip( 861 m_head, m_head->m_pkthdr.len, off); 862 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 863 } 864 } 865 #endif 866 #if defined(INET6) && defined(INET) 867 else 868 #endif 869 #ifdef INET6 870 { 871 const struct ip6_hdr *ip6; 872 873 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 874 ip6 = mtodo(m_head, ehlen); 875 if (ip6->ip6_nxt != IPPROTO_TCP && 876 ip6->ip6_nxt != IPPROTO_UDP) { 877 m_freem(m_head); 878 return (NULL); 879 } 880 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 881 } 882 #endif 883 return (m_head); 884 } 885 886 /* 887 * NOTE: If this function failed, the m_head would be freed. 888 */ 889 static __inline struct mbuf * 890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 891 { 892 const struct tcphdr *th; 893 int ehlen, iphlen; 894 895 *tcpsyn = 0; 896 ehlen = m_head->m_pkthdr.l2hlen; 897 iphlen = m_head->m_pkthdr.l3hlen; 898 899 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 900 th = mtodo(m_head, ehlen + iphlen); 901 if (th->th_flags & TH_SYN) 902 *tcpsyn = 1; 903 return (m_head); 904 } 905 906 #undef PULLUP_HDR 907 908 #endif /* INET6 || INET */ 909 910 static int 911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 912 { 913 int error = 0; 914 915 HN_LOCK_ASSERT(sc); 916 917 if (sc->hn_rx_filter != filter) { 918 error = hn_rndis_set_rxfilter(sc, filter); 919 if (!error) 920 sc->hn_rx_filter = filter; 921 } 922 return (error); 923 } 924 925 static int 926 hn_rxfilter_config(struct hn_softc *sc) 927 { 928 if_t ifp = sc->hn_ifp; 929 uint32_t filter; 930 931 HN_LOCK_ASSERT(sc); 932 933 /* 934 * If the non-transparent mode VF is activated, we don't know how 935 * its RX filter is configured, so stick the synthetic device in 936 * the promiscuous mode.
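 *
 * Otherwise the filter is built from the interface flags; a typical
 * non-promiscuous configuration ends up with NDIS_PACKET_TYPE_DIRECTED |
 * NDIS_PACKET_TYPE_BROADCAST, plus NDIS_PACKET_TYPE_ALL_MULTICAST when
 * IFF_ALLMULTI is set or any multicast address is configured.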
937 */ 938 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 939 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 940 } else { 941 filter = NDIS_PACKET_TYPE_DIRECTED; 942 if (if_getflags(ifp) & IFF_BROADCAST) 943 filter |= NDIS_PACKET_TYPE_BROADCAST; 944 /* TODO: support multicast list */ 945 if ((if_getflags(ifp) & IFF_ALLMULTI) || 946 !if_maddr_empty(ifp)) 947 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 948 } 949 return (hn_set_rxfilter(sc, filter)); 950 } 951 952 static void 953 hn_set_txagg(struct hn_softc *sc) 954 { 955 uint32_t size, pkts; 956 int i; 957 958 /* 959 * Setup aggregation size. 960 */ 961 if (sc->hn_agg_size < 0) 962 size = UINT32_MAX; 963 else 964 size = sc->hn_agg_size; 965 966 if (sc->hn_rndis_agg_size < size) 967 size = sc->hn_rndis_agg_size; 968 969 /* NOTE: We only aggregate packets using chimney sending buffers. */ 970 if (size > (uint32_t)sc->hn_chim_szmax) 971 size = sc->hn_chim_szmax; 972 973 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 974 /* Disable */ 975 size = 0; 976 pkts = 0; 977 goto done; 978 } 979 980 /* NOTE: Type of the per TX ring setting is 'int'. */ 981 if (size > INT_MAX) 982 size = INT_MAX; 983 984 /* 985 * Setup aggregation packet count. 986 */ 987 if (sc->hn_agg_pkts < 0) 988 pkts = UINT32_MAX; 989 else 990 pkts = sc->hn_agg_pkts; 991 992 if (sc->hn_rndis_agg_pkts < pkts) 993 pkts = sc->hn_rndis_agg_pkts; 994 995 if (pkts <= 1) { 996 /* Disable */ 997 size = 0; 998 pkts = 0; 999 goto done; 1000 } 1001 1002 /* NOTE: Type of the per TX ring setting is 'short'. */ 1003 if (pkts > SHRT_MAX) 1004 pkts = SHRT_MAX; 1005 1006 done: 1007 /* NOTE: Type of the per TX ring setting is 'short'. */ 1008 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1009 /* Disable */ 1010 size = 0; 1011 pkts = 0; 1012 } 1013 1014 if (bootverbose) { 1015 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1016 size, pkts, sc->hn_rndis_agg_align); 1017 } 1018 1019 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1020 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1021 1022 mtx_lock(&txr->hn_tx_lock); 1023 txr->hn_agg_szmax = size; 1024 txr->hn_agg_pktmax = pkts; 1025 txr->hn_agg_align = sc->hn_rndis_agg_align; 1026 mtx_unlock(&txr->hn_tx_lock); 1027 } 1028 } 1029 1030 static int 1031 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1032 { 1033 1034 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1035 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1036 return txr->hn_txdesc_cnt; 1037 return hn_tx_swq_depth; 1038 } 1039 1040 static int 1041 hn_rss_reconfig(struct hn_softc *sc) 1042 { 1043 int error; 1044 1045 HN_LOCK_ASSERT(sc); 1046 1047 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1048 return (ENXIO); 1049 1050 /* 1051 * Disable RSS first. 1052 * 1053 * NOTE: 1054 * Direct reconfiguration by setting the UNCHG flags does 1055 * _not_ work properly. 1056 */ 1057 if (bootverbose) 1058 if_printf(sc->hn_ifp, "disable RSS\n"); 1059 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1060 if (error) { 1061 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1062 return (error); 1063 } 1064 1065 /* 1066 * Reenable the RSS w/ the updated RSS key or indirect 1067 * table. 
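 *
 * Callers refresh sc->hn_rss before invoking this routine; for example,
 * hn_vf_rss_fixup() copies the VF's Toeplitz key into sc->hn_rss.rss_key
 * and then calls hn_rss_reconfig() so the synthetic parts pick it up.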
1068 */ 1069 if (bootverbose) 1070 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1071 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1072 if (error) { 1073 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1074 return (error); 1075 } 1076 return (0); 1077 } 1078 1079 static void 1080 hn_rss_ind_fixup(struct hn_softc *sc) 1081 { 1082 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1083 int i, nchan; 1084 1085 nchan = sc->hn_rx_ring_inuse; 1086 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1087 1088 /* 1089 * Check indirect table to make sure that all channels in it 1090 * can be used. 1091 */ 1092 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1093 if (rss->rss_ind[i] >= nchan) { 1094 if_printf(sc->hn_ifp, 1095 "RSS indirect table %d fixup: %u -> %d\n", 1096 i, rss->rss_ind[i], nchan - 1); 1097 rss->rss_ind[i] = nchan - 1; 1098 } 1099 } 1100 } 1101 1102 static int 1103 hn_ifmedia_upd(if_t ifp __unused) 1104 { 1105 1106 return EOPNOTSUPP; 1107 } 1108 1109 static void 1110 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) 1111 { 1112 struct hn_softc *sc = if_getsoftc(ifp); 1113 1114 ifmr->ifm_status = IFM_AVALID; 1115 ifmr->ifm_active = IFM_ETHER; 1116 1117 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1118 ifmr->ifm_active |= IFM_NONE; 1119 return; 1120 } 1121 ifmr->ifm_status |= IFM_ACTIVE; 1122 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1123 } 1124 1125 static void 1126 hn_rxvf_set_task(void *xarg, int pending __unused) 1127 { 1128 struct hn_rxvf_setarg *arg = xarg; 1129 1130 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1131 } 1132 1133 static void 1134 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) 1135 { 1136 struct hn_rx_ring *rxr; 1137 struct hn_rxvf_setarg arg; 1138 struct task task; 1139 int i; 1140 1141 HN_LOCK_ASSERT(sc); 1142 1143 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1144 1145 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1146 rxr = &sc->hn_rx_ring[i]; 1147 1148 if (i < sc->hn_rx_ring_inuse) { 1149 arg.rxr = rxr; 1150 arg.vf_ifp = vf_ifp; 1151 vmbus_chan_run_task(rxr->hn_chan, &task); 1152 } else { 1153 rxr->hn_rxvf_ifp = vf_ifp; 1154 } 1155 } 1156 } 1157 1158 static bool 1159 hn_ismyvf(const struct hn_softc *sc, const if_t ifp) 1160 { 1161 if_t hn_ifp; 1162 1163 hn_ifp = sc->hn_ifp; 1164 1165 if (ifp == hn_ifp) 1166 return (false); 1167 1168 if (if_getalloctype(ifp) != IFT_ETHER) 1169 return (false); 1170 1171 /* Ignore lagg/vlan interfaces */ 1172 if (strcmp(if_getdname(ifp), "lagg") == 0 || 1173 strcmp(if_getdname(ifp), "vlan") == 0) 1174 return (false); 1175 1176 /* 1177 * During detach events if_getifaddr(ifp) might be NULL. 
1178 * Make sure the bcmp() below doesn't panic on that: 1179 */ 1180 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) 1181 return (false); 1182 1183 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) 1184 return (false); 1185 1186 return (true); 1187 } 1188 1189 static void 1190 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) 1191 { 1192 if_t hn_ifp; 1193 1194 HN_LOCK(sc); 1195 1196 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1197 goto out; 1198 1199 if (!hn_ismyvf(sc, ifp)) 1200 goto out; 1201 hn_ifp = sc->hn_ifp; 1202 1203 if (rxvf) { 1204 if (sc->hn_flags & HN_FLAG_RXVF) 1205 goto out; 1206 1207 sc->hn_flags |= HN_FLAG_RXVF; 1208 hn_rxfilter_config(sc); 1209 } else { 1210 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1211 goto out; 1212 1213 sc->hn_flags &= ~HN_FLAG_RXVF; 1214 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) 1215 hn_rxfilter_config(sc); 1216 else 1217 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1218 } 1219 1220 hn_nvs_set_datapath(sc, 1221 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1222 1223 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1224 1225 if (rxvf) { 1226 hn_vf_rss_fixup(sc, true); 1227 hn_suspend_mgmt(sc); 1228 sc->hn_link_flags &= 1229 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1230 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1231 } else { 1232 hn_vf_rss_restore(sc); 1233 hn_resume_mgmt(sc); 1234 } 1235 1236 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1237 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1238 1239 if (bootverbose) { 1240 if_printf(hn_ifp, "datapath is switched %s %s\n", 1241 rxvf ? "to" : "from", if_name(ifp)); 1242 } 1243 out: 1244 HN_UNLOCK(sc); 1245 } 1246 1247 static void 1248 hn_ifnet_event(void *arg, if_t ifp, int event) 1249 { 1250 1251 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1252 return; 1253 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1254 } 1255 1256 static void 1257 hn_ifaddr_event(void *arg, if_t ifp) 1258 { 1259 1260 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); 1261 } 1262 1263 static int 1264 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused) 1265 { 1266 if_t ifp, vf_ifp; 1267 1268 HN_LOCK_ASSERT(sc); 1269 ifp = sc->hn_ifp; 1270 vf_ifp = sc->hn_vf_ifp; 1271 1272 /* 1273 * Just sync up with VF's enabled capabilities. 1274 */ 1275 if_setcapenable(ifp, if_getcapenable(vf_ifp)); 1276 if_sethwassist(ifp, if_gethwassist(vf_ifp)); 1277 1278 return (0); 1279 } 1280 1281 static int 1282 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1283 { 1284 if_t vf_ifp; 1285 struct ifreq ifr; 1286 1287 HN_LOCK_ASSERT(sc); 1288 vf_ifp = sc->hn_vf_ifp; 1289 1290 memset(&ifr, 0, sizeof(ifr)); 1291 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1292 ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff; 1293 ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16; 1294 return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread)); 1295 } 1296 1297 static void 1298 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1299 { 1300 if_t ifp = sc->hn_ifp; 1301 int allmulti = 0; 1302 1303 HN_LOCK_ASSERT(sc); 1304 1305 /* XXX vlan(4) style mcast addr maintenance */ 1306 if (!if_maddr_empty(ifp)) 1307 allmulti = IFF_ALLMULTI; 1308 1309 /* Always set the VF's if_flags */ 1310 if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti); 1311 } 1312 1313 static void 1314 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m) 1315 { 1316 struct rm_priotracker pt; 1317 if_t hn_ifp = NULL; 1318 struct mbuf *mn; 1319 1320 /* 1321 * XXX racy, if hn(4) ever detached. 
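 *
 * The lookup below maps the VF's ifindex to the owning hn(4) ifnet
 * through the global hn_vfmap[] array under the hn_vfmap_lock read
 * lock; hn_ifnet_attevent() and hn_ifnet_detevent() maintain that map
 * under the corresponding write lock.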
1322 */ 1323 rm_rlock(&hn_vfmap_lock, &pt); 1324 if (if_getindex(vf_ifp) < hn_vfmap_size) 1325 hn_ifp = hn_vfmap[if_getindex(vf_ifp)]; 1326 rm_runlock(&hn_vfmap_lock, &pt); 1327 1328 if (hn_ifp != NULL) { 1329 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1330 /* 1331 * Allow tapping on the VF. 1332 */ 1333 ETHER_BPF_MTAP(vf_ifp, mn); 1334 1335 /* 1336 * Update VF stats. 1337 */ 1338 if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) { 1339 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1340 mn->m_pkthdr.len); 1341 } 1342 /* 1343 * XXX IFCOUNTER_IMCAST 1344 * This stat updating is kinda invasive, since it 1345 * requires two checks on the mbuf: the length check 1346 * and the ethernet header check. As of this writing, 1347 * all multicast packets go directly to hn(4), which 1348 * makes imcast stat updating in the VF a vain effort. 1349 */ 1350 1351 /* 1352 * Fix up rcvif and increase hn(4)'s ipackets. 1353 */ 1354 mn->m_pkthdr.rcvif = hn_ifp; 1355 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1356 } 1357 /* 1358 * Go through hn(4)'s if_input. 1359 */ 1360 if_input(hn_ifp, m); 1361 } else { 1362 /* 1363 * In the middle of the transition; free this 1364 * mbuf chain. 1365 */ 1366 while (m != NULL) { 1367 mn = m->m_nextpkt; 1368 m->m_nextpkt = NULL; 1369 m_freem(m); 1370 m = mn; 1371 } 1372 } 1373 } 1374 1375 static void 1376 hn_mtu_change_fixup(struct hn_softc *sc) 1377 { 1378 if_t ifp; 1379 1380 HN_LOCK_ASSERT(sc); 1381 ifp = sc->hn_ifp; 1382 1383 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); 1384 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1385 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1386 } 1387 1388 static uint32_t 1389 hn_rss_type_fromndis(uint32_t rss_hash) 1390 { 1391 uint32_t types = 0; 1392 1393 if (rss_hash & NDIS_HASH_IPV4) 1394 types |= RSS_TYPE_IPV4; 1395 if (rss_hash & NDIS_HASH_TCP_IPV4) 1396 types |= RSS_TYPE_TCP_IPV4; 1397 if (rss_hash & NDIS_HASH_IPV6) 1398 types |= RSS_TYPE_IPV6; 1399 if (rss_hash & NDIS_HASH_IPV6_EX) 1400 types |= RSS_TYPE_IPV6_EX; 1401 if (rss_hash & NDIS_HASH_TCP_IPV6) 1402 types |= RSS_TYPE_TCP_IPV6; 1403 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1404 types |= RSS_TYPE_TCP_IPV6_EX; 1405 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1406 types |= RSS_TYPE_UDP_IPV4; 1407 return (types); 1408 } 1409 1410 static uint32_t 1411 hn_rss_type_tondis(uint32_t types) 1412 { 1413 uint32_t rss_hash = 0; 1414 1415 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1416 ("UDP6 and UDP6EX are not supported")); 1417 1418 if (types & RSS_TYPE_IPV4) 1419 rss_hash |= NDIS_HASH_IPV4; 1420 if (types & RSS_TYPE_TCP_IPV4) 1421 rss_hash |= NDIS_HASH_TCP_IPV4; 1422 if (types & RSS_TYPE_IPV6) 1423 rss_hash |= NDIS_HASH_IPV6; 1424 if (types & RSS_TYPE_IPV6_EX) 1425 rss_hash |= NDIS_HASH_IPV6_EX; 1426 if (types & RSS_TYPE_TCP_IPV6) 1427 rss_hash |= NDIS_HASH_TCP_IPV6; 1428 if (types & RSS_TYPE_TCP_IPV6_EX) 1429 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1430 if (types & RSS_TYPE_UDP_IPV4) 1431 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1432 return (rss_hash); 1433 } 1434 1435 static void 1436 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1437 { 1438 int i; 1439 1440 HN_LOCK_ASSERT(sc); 1441 1442 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1443 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1444 } 1445 1446 static void 1447 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1448 { 1449 if_t ifp, vf_ifp; 1450 struct ifrsshash ifrh; 1451 struct ifrsskey ifrk; 1452 int error; 1453 uint32_t my_types, diff_types, mbuf_types = 0; 1454 1455 HN_LOCK_ASSERT(sc);
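	/*
	 * Outline of the fixup below: query the VF's Toeplitz key
	 * (SIOCGIFRSSKEY) and hash types (SIOCGIFRSSHASH), intersect the
	 * VF's types with our own capabilities (my_types =
	 * hn_rss_type_fromndis(sc->hn_rss_hcap) & ifrh.ifrh_types), stop
	 * mbuf hash delivery for conflicting types, and reprogram the
	 * synthetic RSS with the VF's key.
	 */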
1456 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1457 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1458 1459 if (sc->hn_rx_ring_inuse == 1) { 1460 /* No RSS on synthetic parts; done. */ 1461 return; 1462 } 1463 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1464 /* Synthetic parts do not support Toeplitz; done. */ 1465 return; 1466 } 1467 1468 ifp = sc->hn_ifp; 1469 vf_ifp = sc->hn_vf_ifp; 1470 1471 /* 1472 * Extract VF's RSS key. Only a 40-byte Toeplitz key is 1473 * supported. 1474 */ 1475 memset(&ifrk, 0, sizeof(ifrk)); 1476 strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name)); 1477 error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread); 1478 if (error) { 1479 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1480 if_name(vf_ifp), error); 1481 goto done; 1482 } 1483 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1484 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1485 if_name(vf_ifp), ifrk.ifrk_func); 1486 goto done; 1487 } 1488 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1489 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1490 if_name(vf_ifp), ifrk.ifrk_keylen); 1491 goto done; 1492 } 1493 1494 /* 1495 * Extract VF's RSS hash. Only Toeplitz is supported. 1496 */ 1497 memset(&ifrh, 0, sizeof(ifrh)); 1498 strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name)); 1499 error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread); 1500 if (error) { 1501 if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n", 1502 if_name(vf_ifp), error); 1503 goto done; 1504 } 1505 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1506 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1507 if_name(vf_ifp), ifrh.ifrh_func); 1508 goto done; 1509 } 1510 1511 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1512 if ((ifrh.ifrh_types & my_types) == 0) { 1513 /* This disables RSS; ignore it. */ 1514 if_printf(ifp, "%s intersection of RSS types failed. " 1515 "VF %#x, mine %#x\n", if_name(vf_ifp), 1516 ifrh.ifrh_types, my_types); 1517 goto done; 1518 } 1519 1520 diff_types = my_types ^ ifrh.ifrh_types; 1521 my_types &= ifrh.ifrh_types; 1522 mbuf_types = my_types; 1523 1524 /* 1525 * Detect RSS hash value/type conflicts. 1526 * 1527 * NOTE: 1528 * We don't disable the hash type, but stop delivering the hash 1529 * value/type through mbufs on the RX path. 1530 * 1531 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1532 * hash is delivered with type of TCP_IPV4. This means if 1533 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1534 * least to hn_mbuf_hash. However, given that _all_ of the 1535 * NICs implement TCP_IPV4, this will _not_ impose any issues 1536 * here. 1537 */ 1538 if ((my_types & RSS_TYPE_IPV4) && 1539 (diff_types & ifrh.ifrh_types & 1540 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1541 /* Conflict; disable IPV4 hash type/value delivery. */ 1542 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1543 mbuf_types &= ~RSS_TYPE_IPV4; 1544 } 1545 if ((my_types & RSS_TYPE_IPV6) && 1546 (diff_types & ifrh.ifrh_types & 1547 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1548 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1549 RSS_TYPE_IPV6_EX))) { 1550 /* Conflict; disable IPV6 hash type/value delivery.
*/ 1551 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1552 mbuf_types &= ~RSS_TYPE_IPV6; 1553 } 1554 if ((my_types & RSS_TYPE_IPV6_EX) && 1555 (diff_types & ifrh.ifrh_types & 1556 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1557 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1558 RSS_TYPE_IPV6))) { 1559 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1560 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1561 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1562 } 1563 if ((my_types & RSS_TYPE_TCP_IPV6) && 1564 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1565 /* Conflict; disable TCP_IPV6 hash type/value delivery. */ 1566 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1567 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1568 } 1569 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1570 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1571 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1572 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1573 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1574 } 1575 if ((my_types & RSS_TYPE_UDP_IPV6) && 1576 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1577 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1578 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1579 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1580 } 1581 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1582 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1583 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1584 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1585 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1586 } 1587 1588 /* 1589 * Indirect table does not matter. 1590 */ 1591 1592 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1593 hn_rss_type_tondis(my_types); 1594 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1595 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1596 1597 if (reconf) { 1598 error = hn_rss_reconfig(sc); 1599 if (error) { 1600 /* XXX roll-back? */ 1601 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1602 /* XXX keep going. */ 1603 } 1604 } 1605 done: 1606 /* Hash deliverability for mbufs. */ 1607 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1608 } 1609 1610 static void 1611 hn_vf_rss_restore(struct hn_softc *sc) 1612 { 1613 1614 HN_LOCK_ASSERT(sc); 1615 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1616 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1617 1618 if (sc->hn_rx_ring_inuse == 1) 1619 goto done; 1620 1621 /* 1622 * Restore hash types. Key does _not_ matter. 1623 */ 1624 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1625 int error; 1626 1627 sc->hn_rss_hash = sc->hn_rss_hcap; 1628 error = hn_rss_reconfig(sc); 1629 if (error) { 1630 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1631 error); 1632 /* XXX keep going. */ 1633 } 1634 } 1635 done: 1636 /* Hash deliverability for mbufs. */ 1637 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1638 } 1639 1640 static void 1641 hn_xpnt_vf_setready(struct hn_softc *sc) 1642 { 1643 if_t ifp, vf_ifp; 1644 struct ifreq ifr; 1645 1646 HN_LOCK_ASSERT(sc); 1647 ifp = sc->hn_ifp; 1648 vf_ifp = sc->hn_vf_ifp; 1649 1650 /* 1651 * Mark the VF ready. 1652 */ 1653 sc->hn_vf_rdytick = 0; 1654 1655 /* 1656 * Save information for restoration. 
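 *
 * These snapshots are consumed by hn_ifnet_detevent(), which restores
 * the capabilities, TSO limits and if_hwassist once the VF goes away,
 * provided the VF had actually reached the ready state
 * (hn_vf_rdytick == 0).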
1657 */ 1658 sc->hn_saved_caps = if_getcapabilities(ifp); 1659 sc->hn_saved_tsomax = if_gethwtsomax(ifp); 1660 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); 1661 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); 1662 sc->hn_saved_capenable = if_getcapenable(ifp); 1663 sc->hn_saved_hwassist = if_gethwassist(ifp); 1664 1665 /* 1666 * Intersect supported/enabled capabilities. 1667 * 1668 * NOTE: 1669 * if_hwassist is not changed here. 1670 */ 1671 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); 1672 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); 1673 1674 /* 1675 * Fix TSO settings. 1676 */ 1677 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) 1678 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); 1679 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) 1680 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); 1681 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) 1682 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); 1683 1684 /* 1685 * Change VF's enabled capabilities. 1686 */ 1687 memset(&ifr, 0, sizeof(ifr)); 1688 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1689 ifr.ifr_reqcap = if_getcapenable(ifp); 1690 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1691 1692 if (if_getmtu(ifp) != ETHERMTU) { 1693 int error; 1694 1695 /* 1696 * Change VF's MTU. 1697 */ 1698 memset(&ifr, 0, sizeof(ifr)); 1699 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1700 ifr.ifr_mtu = if_getmtu(ifp); 1701 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); 1702 if (error) { 1703 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1704 if_name(vf_ifp), if_getmtu(ifp)); 1705 if (if_getmtu(ifp) > ETHERMTU) { 1706 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1707 1708 /* 1709 * XXX 1710 * No need to adjust the synthetic parts' MTU; 1711 * failure of the adjustment will cause us 1712 * infinite headache. 1713 */ 1714 if_setmtu(ifp, ETHERMTU); 1715 hn_mtu_change_fixup(sc); 1716 } 1717 } 1718 } 1719 } 1720 1721 static bool 1722 hn_xpnt_vf_isready(struct hn_softc *sc) 1723 { 1724 1725 HN_LOCK_ASSERT(sc); 1726 1727 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1728 return (false); 1729 1730 if (sc->hn_vf_rdytick == 0) 1731 return (true); 1732 1733 if (sc->hn_vf_rdytick > ticks) 1734 return (false); 1735 1736 /* Mark VF as ready. 
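 *
 * hn_vf_rdytick was armed by hn_ifnet_attevent() as
 * ticks + hn_xpnt_vf_attwait * hz, so with the default 2 second wait
 * (HN_XPNT_VF_ATTWAIT_MIN) this point is reached roughly two seconds
 * after the VF attached; a value of 0 means the VF is already ready.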
*/ 1737 hn_xpnt_vf_setready(sc); 1738 return (true); 1739 } 1740 1741 static void 1742 hn_xpnt_vf_setenable(struct hn_softc *sc) 1743 { 1744 int i; 1745 1746 HN_LOCK_ASSERT(sc); 1747 1748 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1749 rm_wlock(&sc->hn_vf_lock); 1750 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1751 rm_wunlock(&sc->hn_vf_lock); 1752 1753 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1754 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1755 } 1756 1757 static void 1758 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1759 { 1760 int i; 1761 1762 HN_LOCK_ASSERT(sc); 1763 1764 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1765 rm_wlock(&sc->hn_vf_lock); 1766 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1767 if (clear_vf) 1768 sc->hn_vf_ifp = NULL; 1769 rm_wunlock(&sc->hn_vf_lock); 1770 1771 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1772 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1773 } 1774 1775 static void 1776 hn_xpnt_vf_init(struct hn_softc *sc) 1777 { 1778 int error; 1779 1780 HN_LOCK_ASSERT(sc); 1781 1782 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1783 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1784 1785 if (bootverbose) { 1786 if_printf(sc->hn_ifp, "try bringing up %s\n", 1787 if_name(sc->hn_vf_ifp)); 1788 } 1789 1790 /* 1791 * Bring the VF up. 1792 */ 1793 hn_xpnt_vf_saveifflags(sc); 1794 if_setflagbits(sc->hn_ifp, IFF_UP, 0); 1795 error = hn_xpnt_vf_iocsetflags(sc); 1796 if (error) { 1797 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1798 if_name(sc->hn_vf_ifp), error); 1799 return; 1800 } 1801 1802 /* 1803 * NOTE: 1804 * Datapath setting must happen _after_ bringing the VF up. 1805 */ 1806 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1807 1808 /* 1809 * NOTE: 1810 * Fixup RSS related bits _after_ the VF is brought up, since 1811 * many VFs generate RSS key during it's initialization. 1812 */ 1813 hn_vf_rss_fixup(sc, true); 1814 1815 /* Mark transparent mode VF as enabled. */ 1816 hn_xpnt_vf_setenable(sc); 1817 } 1818 1819 static void 1820 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1821 { 1822 struct hn_softc *sc = xsc; 1823 1824 HN_LOCK(sc); 1825 1826 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1827 goto done; 1828 if (sc->hn_vf_ifp == NULL) 1829 goto done; 1830 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1831 goto done; 1832 1833 if (sc->hn_vf_rdytick != 0) { 1834 /* Mark VF as ready. */ 1835 hn_xpnt_vf_setready(sc); 1836 } 1837 1838 if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) { 1839 /* 1840 * Delayed VF initialization. 1841 */ 1842 if (bootverbose) { 1843 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1844 if_name(sc->hn_vf_ifp)); 1845 } 1846 hn_xpnt_vf_init(sc); 1847 } 1848 done: 1849 HN_UNLOCK(sc); 1850 } 1851 1852 static void 1853 hn_ifnet_attevent(void *xsc, if_t ifp) 1854 { 1855 struct hn_softc *sc = xsc; 1856 1857 HN_LOCK(sc); 1858 1859 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1860 goto done; 1861 1862 if (!hn_ismyvf(sc, ifp)) 1863 goto done; 1864 1865 if (sc->hn_vf_ifp != NULL) { 1866 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1867 if_name(sc->hn_vf_ifp)); 1868 goto done; 1869 } 1870 1871 if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) { 1872 /* 1873 * ifnet.if_start is _not_ supported by transparent 1874 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 
	 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));

		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (if_getindex(ifp) >= hn_vfmap_size) {
		if_t *newmap;
		int newsize;

		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap, sizeof(if_t) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     if_name(ifp), if_getindex(ifp),
	     if_name(hn_vfmap[if_getindex(ifp)])));
	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = if_getinputfn(ifp);
		if_setinputfn(ifp, hn_xpnt_vf_input);

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, if_t ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    if_name(sc->hn_ifp)));
		if_setinputfn(ifp, sc->hn_vf_input);
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore the settings that were
			 * saved from hn_ifp when the VF became ready.
			 */
			if_setcapabilities(sc->hn_ifp, sc->hn_saved_caps);

			if_sethwtsomax(sc->hn_ifp, sc->hn_saved_tsomax);
			if_sethwtsomaxsegcount(sc->hn_ifp,
			    sc->hn_saved_tsosegcnt);
			if_sethwtsomaxsegsize(sc->hn_ifp,
			    sc->hn_saved_tsosegsz);

			if_setcapenable(sc->hn_ifp, sc->hn_saved_capenable);
			if_sethwassist(sc->hn_ifp, sc->hn_saved_hwassist);
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
2002 */ 2003 hn_resume_mgmt(sc); 2004 } 2005 } 2006 2007 /* Mark transparent mode VF as disabled. */ 2008 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2009 2010 rm_wlock(&hn_vfmap_lock); 2011 2012 KASSERT(if_getindex(ifp) < hn_vfmap_size, 2013 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); 2014 if (hn_vfmap[if_getindex(ifp)] != NULL) { 2015 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, 2016 ("%s: ifindex %d was mapped to %s", 2017 if_name(ifp), if_getindex(ifp), 2018 if_name(hn_vfmap[if_getindex(ifp)]))); 2019 hn_vfmap[if_getindex(ifp)] = NULL; 2020 } 2021 2022 rm_wunlock(&hn_vfmap_lock); 2023 done: 2024 HN_UNLOCK(sc); 2025 } 2026 2027 static void 2028 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) 2029 { 2030 struct hn_softc *sc = xsc; 2031 2032 if (sc->hn_vf_ifp == ifp) 2033 if_link_state_change(sc->hn_ifp, link_state); 2034 } 2035 2036 static int 2037 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) 2038 { 2039 struct hn_softc *sc = arg1; 2040 unsigned int tsomax; 2041 int error; 2042 2043 tsomax = if_gethwtsomax(sc->hn_ifp); 2044 error = sysctl_handle_int(oidp, &tsomax, 0, req); 2045 return error; 2046 } 2047 2048 static int 2049 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) 2050 { 2051 struct hn_softc *sc = arg1; 2052 unsigned int tsomaxsegcnt; 2053 int error; 2054 2055 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); 2056 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); 2057 return error; 2058 } 2059 2060 static int 2061 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) 2062 { 2063 struct hn_softc *sc = arg1; 2064 unsigned int tsomaxsegsz; 2065 int error; 2066 2067 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); 2068 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); 2069 return error; 2070 } 2071 2072 static int 2073 hn_probe(device_t dev) 2074 { 2075 2076 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2077 device_set_desc(dev, "Hyper-V Network Interface"); 2078 return BUS_PROBE_DEFAULT; 2079 } 2080 return ENXIO; 2081 } 2082 2083 static int 2084 hn_attach(device_t dev) 2085 { 2086 struct hn_softc *sc = device_get_softc(dev); 2087 struct sysctl_oid_list *child; 2088 struct sysctl_ctx_list *ctx; 2089 uint8_t eaddr[ETHER_ADDR_LEN]; 2090 if_t ifp = NULL; 2091 int error, ring_cnt, tx_ring_cnt; 2092 uint32_t mtu; 2093 2094 sc->hn_dev = dev; 2095 sc->hn_prichan = vmbus_get_channel(dev); 2096 HN_LOCK_INIT(sc); 2097 rm_init(&sc->hn_vf_lock, "hnvf"); 2098 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2099 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2100 2101 /* 2102 * Initialize these tunables once. 2103 */ 2104 sc->hn_agg_size = hn_tx_agg_size; 2105 sc->hn_agg_pkts = hn_tx_agg_pkts; 2106 2107 /* 2108 * Setup taskqueue for transmission. 2109 */ 2110 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2111 int i; 2112 2113 sc->hn_tx_taskqs = 2114 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2115 M_DEVBUF, M_WAITOK); 2116 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2117 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2118 M_WAITOK, taskqueue_thread_enqueue, 2119 &sc->hn_tx_taskqs[i]); 2120 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2121 "%s tx%d", device_get_nameunit(dev), i); 2122 } 2123 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2124 sc->hn_tx_taskqs = hn_tx_taskque; 2125 } 2126 2127 /* 2128 * Setup taskqueue for mangement tasks, e.g. link status. 
2129 */ 2130 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2131 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2132 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2133 device_get_nameunit(dev)); 2134 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2135 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2136 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2137 hn_netchg_status_taskfunc, sc); 2138 2139 if (hn_xpnt_vf) { 2140 /* 2141 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2142 */ 2143 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2144 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2145 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2146 device_get_nameunit(dev)); 2147 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2148 hn_xpnt_vf_init_taskfunc, sc); 2149 } 2150 2151 /* 2152 * Allocate ifnet and setup its name earlier, so that if_printf 2153 * can be used by functions, which will be called after 2154 * ether_ifattach(). 2155 */ 2156 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2157 if_setsoftc(ifp, sc); 2158 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2159 2160 /* 2161 * Initialize ifmedia earlier so that it can be unconditionally 2162 * destroyed, if error happened later on. 2163 */ 2164 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2165 2166 /* 2167 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2168 * to use (tx_ring_cnt). 2169 * 2170 * NOTE: 2171 * The # of RX rings to use is same as the # of channels to use. 2172 */ 2173 ring_cnt = hn_chan_cnt; 2174 if (ring_cnt <= 0) { 2175 /* Default */ 2176 ring_cnt = mp_ncpus; 2177 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2178 ring_cnt = HN_RING_CNT_DEF_MAX; 2179 } else if (ring_cnt > mp_ncpus) { 2180 ring_cnt = mp_ncpus; 2181 } 2182 #ifdef RSS 2183 if (ring_cnt > rss_getnumbuckets()) 2184 ring_cnt = rss_getnumbuckets(); 2185 #endif 2186 2187 tx_ring_cnt = hn_tx_ring_cnt; 2188 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2189 tx_ring_cnt = ring_cnt; 2190 #ifdef HN_IFSTART_SUPPORT 2191 if (hn_use_if_start) { 2192 /* ifnet.if_start only needs one TX ring. */ 2193 tx_ring_cnt = 1; 2194 } 2195 #endif 2196 2197 /* 2198 * Set the leader CPU for channels. 2199 */ 2200 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2201 2202 /* 2203 * Create enough TX/RX rings, even if only limited number of 2204 * channels can be allocated. 2205 */ 2206 error = hn_create_tx_data(sc, tx_ring_cnt); 2207 if (error) 2208 goto failed; 2209 error = hn_create_rx_data(sc, ring_cnt); 2210 if (error) 2211 goto failed; 2212 2213 /* 2214 * Create transaction context for NVS and RNDIS transactions. 2215 */ 2216 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2217 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2218 if (sc->hn_xact == NULL) { 2219 error = ENXIO; 2220 goto failed; 2221 } 2222 2223 /* 2224 * Install orphan handler for the revocation of this device's 2225 * primary channel. 2226 * 2227 * NOTE: 2228 * The processing order is critical here: 2229 * Install the orphan handler, _before_ testing whether this 2230 * device's primary channel has been revoked or not. 2231 */ 2232 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2233 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2234 error = ENXIO; 2235 goto failed; 2236 } 2237 2238 /* 2239 * Attach the synthetic parts, i.e. NVS and RNDIS. 
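The attach is done with the standard Ethernet MTU here; a later SIOCSIFMTU detaches and re-attaches the synthetic parts with the requested MTU (see hn_ioctl()).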
2240 */ 2241 error = hn_synth_attach(sc, ETHERMTU); 2242 if (error) 2243 goto failed; 2244 2245 error = hn_rndis_get_eaddr(sc, eaddr); 2246 if (error) 2247 goto failed; 2248 2249 error = hn_rndis_get_mtu(sc, &mtu); 2250 if (error) 2251 mtu = ETHERMTU; 2252 else if (bootverbose) 2253 device_printf(dev, "RNDIS mtu %u\n", mtu); 2254 2255 if (sc->hn_rx_ring_inuse > 1) { 2256 /* 2257 * Reduce TCP segment aggregation limit for multiple 2258 * RX rings to increase ACK timeliness. 2259 */ 2260 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2261 } 2262 2263 /* 2264 * Fixup TX/RX stuffs after synthetic parts are attached. 2265 */ 2266 hn_fixup_tx_data(sc); 2267 hn_fixup_rx_data(sc); 2268 2269 ctx = device_get_sysctl_ctx(dev); 2270 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2271 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2272 &sc->hn_nvs_ver, 0, "NVS version"); 2273 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2274 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2275 hn_ndis_version_sysctl, "A", "NDIS version"); 2276 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2277 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2278 hn_caps_sysctl, "A", "capabilities"); 2279 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2280 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2281 hn_hwassist_sysctl, "A", "hwassist"); 2282 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", 2283 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, 2284 "IU", "max TSO size"); 2285 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", 2286 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, 2287 "IU", "max # of TSO segments"); 2288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", 2289 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, 2290 "IU", "max size of TSO segment"); 2291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2292 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2293 hn_rxfilter_sysctl, "A", "rxfilter"); 2294 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2295 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2296 hn_rss_hash_sysctl, "A", "RSS hash"); 2297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2298 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2299 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2301 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2302 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2303 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2304 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2305 #ifndef RSS 2306 /* 2307 * Don't allow RSS key/indirect table changes, if RSS is defined. 
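When the kernel is built with the RSS option, the key and the indirection table are governed by the network stack, so the two writable knobs below are not exposed.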
2308 */ 2309 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2310 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2311 hn_rss_key_sysctl, "IU", "RSS key"); 2312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2313 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2314 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2315 #endif 2316 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2317 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2318 "RNDIS offered packet transmission aggregation size limit"); 2319 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2320 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2321 "RNDIS offered packet transmission aggregation count limit"); 2322 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2323 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2324 "RNDIS packet transmission aggregation alignment"); 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2326 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2327 hn_txagg_size_sysctl, "I", 2328 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2330 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2331 hn_txagg_pkts_sysctl, "I", 2332 "Packet transmission aggregation packets, " 2333 "0 -- disable, -1 -- auto"); 2334 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2335 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2336 hn_polling_sysctl, "I", 2337 "Polling frequency: [100,1000000], 0 disable polling"); 2338 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2339 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2340 hn_vf_sysctl, "A", "Virtual Function's name"); 2341 if (!hn_xpnt_vf) { 2342 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2343 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2344 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2345 } else { 2346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2347 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2348 hn_xpnt_vf_enabled_sysctl, "I", 2349 "Transparent VF enabled"); 2350 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2351 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2352 hn_xpnt_vf_accbpf_sysctl, "I", 2353 "Accurate BPF for transparent VF"); 2354 } 2355 2356 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2357 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2358 "switch to rsc"); 2359 2360 /* 2361 * Setup the ifmedia, which has been initialized earlier. 2362 */ 2363 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2364 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2365 /* XXX ifmedia_set really should do this for us */ 2366 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2367 2368 /* 2369 * Setup the ifnet for this interface. 2370 */ 2371 2372 if_setbaudrate(ifp, IF_Gbps(10)); 2373 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); 2374 if_setioctlfn(ifp, hn_ioctl); 2375 if_setinitfn(ifp, hn_init); 2376 #ifdef HN_IFSTART_SUPPORT 2377 if (hn_use_if_start) { 2378 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2379 2380 if_setstartfn(ifp, hn_start); 2381 if_setsendqlen(ifp, qdepth); 2382 if_setsendqready(ifp); 2383 } else 2384 #endif 2385 { 2386 if_settransmitfn(ifp, hn_transmit); 2387 if_setqflushfn(ifp, hn_xmit_qflush); 2388 } 2389 2390 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); 2391 #ifdef foo 2392 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2393 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); 2394 #endif 2395 if (sc->hn_caps & HN_CAP_VLAN) { 2396 /* XXX not sure about VLAN_MTU. */ 2397 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); 2398 } 2399 2400 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); 2401 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) 2402 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); 2403 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) 2404 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); 2405 if (sc->hn_caps & HN_CAP_TSO4) { 2406 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); 2407 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 2408 } 2409 if (sc->hn_caps & HN_CAP_TSO6) { 2410 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); 2411 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 2412 } 2413 2414 /* Enable all available capabilities by default. */ 2415 if_setcapenable(ifp, if_getcapabilities(ifp)); 2416 2417 /* 2418 * Disable IPv6 TSO and TXCSUM by default, they still can 2419 * be enabled through SIOCSIFCAP. 2420 */ 2421 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); 2422 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); 2423 2424 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { 2425 /* 2426 * Lock hn_set_tso_maxsize() to simplify its 2427 * internal logic. 2428 */ 2429 HN_LOCK(sc); 2430 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2431 HN_UNLOCK(sc); 2432 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); 2433 if_sethwtsomaxsegsize(ifp, PAGE_SIZE); 2434 } 2435 2436 ether_ifattach(ifp, eaddr); 2437 2438 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2439 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2440 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); 2441 } 2442 if (mtu < ETHERMTU) { 2443 2444 if_setmtu(ifp, mtu); 2445 } 2446 2447 /* Inform the upper layer about the long frame support. */ 2448 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); 2449 2450 /* 2451 * Kick off link status check. 2452 */ 2453 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2454 hn_update_link_status(sc); 2455 2456 if (!hn_xpnt_vf) { 2457 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2458 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2459 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2460 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2461 } else { 2462 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2463 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2464 } 2465 2466 /* 2467 * NOTE: 2468 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2469 * since interface's LLADDR is needed; interface LLADDR is not 2470 * available when ifnet_arrival event is triggered. 2471 */ 2472 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2473 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2474 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2475 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2476 2477 return (0); 2478 failed: 2479 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2480 hn_synth_detach(sc); 2481 hn_detach(dev); 2482 return (error); 2483 } 2484 2485 static int 2486 hn_detach(device_t dev) 2487 { 2488 struct hn_softc *sc = device_get_softc(dev); 2489 if_t ifp = sc->hn_ifp, vf_ifp; 2490 2491 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2492 /* 2493 * In case that the vmbus missed the orphan handler 2494 * installation. 
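Orphan the pending transactions ourselves, so the teardown below does not block waiting for replies on a revoked channel.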
2495 */ 2496 vmbus_xact_ctx_orphan(sc->hn_xact); 2497 } 2498 2499 if (sc->hn_ifaddr_evthand != NULL) 2500 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2501 if (sc->hn_ifnet_evthand != NULL) 2502 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2503 if (sc->hn_ifnet_atthand != NULL) { 2504 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2505 sc->hn_ifnet_atthand); 2506 } 2507 if (sc->hn_ifnet_dethand != NULL) { 2508 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2509 sc->hn_ifnet_dethand); 2510 } 2511 if (sc->hn_ifnet_lnkhand != NULL) 2512 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2513 2514 vf_ifp = sc->hn_vf_ifp; 2515 __compiler_membar(); 2516 if (vf_ifp != NULL) 2517 hn_ifnet_detevent(sc, vf_ifp); 2518 2519 if (device_is_attached(dev)) { 2520 HN_LOCK(sc); 2521 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2522 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 2523 hn_stop(sc, true); 2524 /* 2525 * NOTE: 2526 * hn_stop() only suspends data, so managment 2527 * stuffs have to be suspended manually here. 2528 */ 2529 hn_suspend_mgmt(sc); 2530 hn_synth_detach(sc); 2531 } 2532 HN_UNLOCK(sc); 2533 ether_ifdetach(ifp); 2534 } 2535 2536 ifmedia_removeall(&sc->hn_media); 2537 hn_destroy_rx_data(sc); 2538 hn_destroy_tx_data(sc); 2539 2540 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2541 int i; 2542 2543 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2544 taskqueue_free(sc->hn_tx_taskqs[i]); 2545 free(sc->hn_tx_taskqs, M_DEVBUF); 2546 } 2547 taskqueue_free(sc->hn_mgmt_taskq0); 2548 if (sc->hn_vf_taskq != NULL) 2549 taskqueue_free(sc->hn_vf_taskq); 2550 2551 if (sc->hn_xact != NULL) { 2552 /* 2553 * Uninstall the orphan handler _before_ the xact is 2554 * destructed. 2555 */ 2556 vmbus_chan_unset_orphan(sc->hn_prichan); 2557 vmbus_xact_ctx_destroy(sc->hn_xact); 2558 } 2559 2560 if_free(ifp); 2561 2562 HN_LOCK_DESTROY(sc); 2563 rm_destroy(&sc->hn_vf_lock); 2564 return (0); 2565 } 2566 2567 static int 2568 hn_shutdown(device_t dev) 2569 { 2570 2571 return (0); 2572 } 2573 2574 static void 2575 hn_link_status(struct hn_softc *sc) 2576 { 2577 uint32_t link_status; 2578 int error; 2579 2580 error = hn_rndis_get_linkstatus(sc, &link_status); 2581 if (error) { 2582 /* XXX what to do? */ 2583 return; 2584 } 2585 2586 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2587 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2588 else 2589 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2590 if_link_state_change(sc->hn_ifp, 2591 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2592 LINK_STATE_UP : LINK_STATE_DOWN); 2593 } 2594 2595 static void 2596 hn_link_taskfunc(void *xsc, int pending __unused) 2597 { 2598 struct hn_softc *sc = xsc; 2599 2600 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2601 return; 2602 hn_link_status(sc); 2603 } 2604 2605 static void 2606 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2607 { 2608 struct hn_softc *sc = xsc; 2609 2610 /* Prevent any link status checks from running. */ 2611 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2612 2613 /* 2614 * Fake up a [link down --> link up] state change; 5 seconds 2615 * delay is used, which closely simulates miibus reaction 2616 * upon link down event. 
2617 */ 2618 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2619 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2620 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2621 &sc->hn_netchg_status, 5 * hz); 2622 } 2623 2624 static void 2625 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2626 { 2627 struct hn_softc *sc = xsc; 2628 2629 /* Re-allow link status checks. */ 2630 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2631 hn_link_status(sc); 2632 } 2633 2634 static void 2635 hn_update_link_status(struct hn_softc *sc) 2636 { 2637 2638 if (sc->hn_mgmt_taskq != NULL) 2639 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2640 } 2641 2642 static void 2643 hn_change_network(struct hn_softc *sc) 2644 { 2645 2646 if (sc->hn_mgmt_taskq != NULL) 2647 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2648 } 2649 2650 static __inline int 2651 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2652 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2653 { 2654 struct mbuf *m = *m_head; 2655 int error; 2656 2657 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2658 2659 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2660 m, segs, nsegs, BUS_DMA_NOWAIT); 2661 if (error == EFBIG) { 2662 struct mbuf *m_new; 2663 2664 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2665 if (m_new == NULL) 2666 return ENOBUFS; 2667 else 2668 *m_head = m = m_new; 2669 txr->hn_tx_collapsed++; 2670 2671 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2672 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2673 } 2674 if (!error) { 2675 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2676 BUS_DMASYNC_PREWRITE); 2677 txd->flags |= HN_TXD_FLAG_DMAMAP; 2678 } 2679 return error; 2680 } 2681 2682 static __inline int 2683 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2684 { 2685 2686 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2687 ("put an onlist txd %#x", txd->flags)); 2688 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2689 ("put an onagg txd %#x", txd->flags)); 2690 2691 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2692 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2693 return 0; 2694 2695 if (!STAILQ_EMPTY(&txd->agg_list)) { 2696 struct hn_txdesc *tmp_txd; 2697 2698 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2699 int freed __diagused; 2700 2701 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2702 ("resursive aggregation on aggregated txdesc")); 2703 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2704 ("not aggregated txdesc")); 2705 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2706 ("aggregated txdesc uses dmamap")); 2707 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2708 ("aggregated txdesc consumes " 2709 "chimney sending buffer")); 2710 KASSERT(tmp_txd->chim_size == 0, 2711 ("aggregated txdesc has non-zero " 2712 "chimney sending size")); 2713 2714 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2715 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2716 freed = hn_txdesc_put(txr, tmp_txd); 2717 KASSERT(freed, ("failed to free aggregated txdesc")); 2718 } 2719 } 2720 2721 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2722 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2723 ("chim txd uses dmamap")); 2724 hn_chim_free(txr->hn_sc, txd->chim_index); 2725 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2726 txd->chim_size = 0; 2727 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2728 bus_dmamap_sync(txr->hn_tx_data_dtag, 2729 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2730 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2731 txd->data_dmap); 2732 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2733 } 2734 2735 if (txd->m != NULL) { 2736 m_freem(txd->m); 2737 txd->m = NULL; 2738 } 2739 2740 txd->flags |= HN_TXD_FLAG_ONLIST; 2741 #ifndef HN_USE_TXDESC_BUFRING 2742 mtx_lock_spin(&txr->hn_txlist_spin); 2743 KASSERT(txr->hn_txdesc_avail >= 0 && 2744 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2745 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2746 txr->hn_txdesc_avail++; 2747 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2748 mtx_unlock_spin(&txr->hn_txlist_spin); 2749 #else /* HN_USE_TXDESC_BUFRING */ 2750 #ifdef HN_DEBUG 2751 atomic_add_int(&txr->hn_txdesc_avail, 1); 2752 #endif 2753 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2754 #endif /* !HN_USE_TXDESC_BUFRING */ 2755 2756 return 1; 2757 } 2758 2759 static __inline struct hn_txdesc * 2760 hn_txdesc_get(struct hn_tx_ring *txr) 2761 { 2762 struct hn_txdesc *txd; 2763 2764 #ifndef HN_USE_TXDESC_BUFRING 2765 mtx_lock_spin(&txr->hn_txlist_spin); 2766 txd = SLIST_FIRST(&txr->hn_txlist); 2767 if (txd != NULL) { 2768 KASSERT(txr->hn_txdesc_avail > 0, 2769 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2770 txr->hn_txdesc_avail--; 2771 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2772 } 2773 mtx_unlock_spin(&txr->hn_txlist_spin); 2774 #else 2775 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2776 #endif 2777 2778 if (txd != NULL) { 2779 #ifdef HN_USE_TXDESC_BUFRING 2780 #ifdef HN_DEBUG 2781 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2782 #endif 2783 #endif /* HN_USE_TXDESC_BUFRING */ 2784 KASSERT(txd->m == NULL && txd->refs == 0 && 2785 STAILQ_EMPTY(&txd->agg_list) && 2786 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2787 txd->chim_size == 0 && 2788 (txd->flags & HN_TXD_FLAG_ONLIST) && 2789 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2790 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2791 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2792 txd->refs = 1; 2793 } 2794 return txd; 2795 } 2796 2797 static __inline void 2798 hn_txdesc_hold(struct hn_txdesc *txd) 2799 { 2800 2801 /* 0->1 transition will never work */ 2802 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2803 atomic_add_int(&txd->refs, 1); 2804 } 2805 2806 static __inline void 2807 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2808 { 2809 2810 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2811 ("recursive aggregation on aggregating txdesc")); 2812 2813 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2814 ("already aggregated")); 2815 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2816 ("recursive aggregation on to-be-aggregated txdesc")); 2817 2818 txd->flags |= HN_TXD_FLAG_ONAGG; 2819 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2820 } 2821 2822 static bool 2823 hn_tx_ring_pending(struct hn_tx_ring *txr) 2824 { 2825 bool pending = false; 2826 2827 #ifndef HN_USE_TXDESC_BUFRING 2828 mtx_lock_spin(&txr->hn_txlist_spin); 2829 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2830 pending = true; 2831 mtx_unlock_spin(&txr->hn_txlist_spin); 2832 #else 2833 if (!buf_ring_full(txr->hn_txdesc_br)) 2834 pending = true; 2835 #endif 2836 return (pending); 2837 } 2838 2839 static __inline void 2840 hn_txeof(struct hn_tx_ring *txr) 2841 { 2842 txr->hn_has_txeof = 0; 2843 txr->hn_txeof(txr); 2844 } 2845 2846 static void 2847 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2848 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2849 { 2850 struct hn_txdesc *txd = sndc->hn_cbarg; 2851 struct 
hn_tx_ring *txr; 2852 2853 txr = txd->txr; 2854 KASSERT(txr->hn_chan == chan, 2855 ("channel mismatch, on chan%u, should be chan%u", 2856 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2857 2858 txr->hn_has_txeof = 1; 2859 hn_txdesc_put(txr, txd); 2860 2861 ++txr->hn_txdone_cnt; 2862 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2863 txr->hn_txdone_cnt = 0; 2864 if (txr->hn_oactive) 2865 hn_txeof(txr); 2866 } 2867 } 2868 2869 static void 2870 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2871 { 2872 #if defined(INET) || defined(INET6) 2873 struct epoch_tracker et; 2874 2875 NET_EPOCH_ENTER(et); 2876 tcp_lro_flush_all(&rxr->hn_lro); 2877 NET_EPOCH_EXIT(et); 2878 #endif 2879 2880 /* 2881 * NOTE: 2882 * 'txr' could be NULL, if multiple channels and 2883 * ifnet.if_start method are enabled. 2884 */ 2885 if (txr == NULL || !txr->hn_has_txeof) 2886 return; 2887 2888 txr->hn_txdone_cnt = 0; 2889 hn_txeof(txr); 2890 } 2891 2892 static __inline uint32_t 2893 hn_rndis_pktmsg_offset(uint32_t ofs) 2894 { 2895 2896 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2897 ("invalid RNDIS packet msg offset %u", ofs)); 2898 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2899 } 2900 2901 static __inline void * 2902 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2903 size_t pi_dlen, uint32_t pi_type) 2904 { 2905 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2906 struct rndis_pktinfo *pi; 2907 2908 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2909 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2910 2911 /* 2912 * Per-packet-info does not move; it only grows. 2913 * 2914 * NOTE: 2915 * rm_pktinfooffset in this phase counts from the beginning 2916 * of rndis_packet_msg. 2917 */ 2918 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2919 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2920 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2921 pkt->rm_pktinfolen); 2922 pkt->rm_pktinfolen += pi_size; 2923 2924 pi->rm_size = pi_size; 2925 pi->rm_type = pi_type; 2926 pi->rm_internal = 0; 2927 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2928 2929 return (pi->rm_data); 2930 } 2931 2932 static __inline int 2933 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) 2934 { 2935 struct hn_txdesc *txd; 2936 struct mbuf *m; 2937 int error, pkts; 2938 2939 txd = txr->hn_agg_txd; 2940 KASSERT(txd != NULL, ("no aggregate txdesc")); 2941 2942 /* 2943 * Since hn_txpkt() will reset this temporary stat, save 2944 * it now, so that oerrors can be updated properly, if 2945 * hn_txpkt() ever fails. 2946 */ 2947 pkts = txr->hn_stat_pkts; 2948 2949 /* 2950 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2951 * failure, save it for later freeing, if hn_txpkt() ever 2952 * fails. 2953 */ 2954 m = txd->m; 2955 error = hn_txpkt(ifp, txr, txd); 2956 if (__predict_false(error)) { 2957 /* txd is freed, but m is not. */ 2958 m_freem(m); 2959 2960 txr->hn_flush_failed++; 2961 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2962 } 2963 2964 /* Reset all aggregation states. 
*/ 2965 txr->hn_agg_txd = NULL; 2966 txr->hn_agg_szleft = 0; 2967 txr->hn_agg_pktleft = 0; 2968 txr->hn_agg_prevpkt = NULL; 2969 2970 return (error); 2971 } 2972 2973 static void * 2974 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2975 int pktsize) 2976 { 2977 void *chim; 2978 2979 if (txr->hn_agg_txd != NULL) { 2980 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2981 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2982 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2983 int olen; 2984 2985 /* 2986 * Update the previous RNDIS packet's total length, 2987 * it can be increased due to the mandatory alignment 2988 * padding for this RNDIS packet. And update the 2989 * aggregating txdesc's chimney sending buffer size 2990 * accordingly. 2991 * 2992 * XXX 2993 * Zero-out the padding, as required by the RNDIS spec. 2994 */ 2995 olen = pkt->rm_len; 2996 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2997 agg_txd->chim_size += pkt->rm_len - olen; 2998 2999 /* Link this txdesc to the parent. */ 3000 hn_txdesc_agg(agg_txd, txd); 3001 3002 chim = (uint8_t *)pkt + pkt->rm_len; 3003 /* Save the current packet for later fixup. */ 3004 txr->hn_agg_prevpkt = chim; 3005 3006 txr->hn_agg_pktleft--; 3007 txr->hn_agg_szleft -= pktsize; 3008 if (txr->hn_agg_szleft <= 3009 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3010 /* 3011 * Probably can't aggregate more packets, 3012 * flush this aggregating txdesc proactively. 3013 */ 3014 txr->hn_agg_pktleft = 0; 3015 } 3016 /* Done! */ 3017 return (chim); 3018 } 3019 hn_flush_txagg(ifp, txr); 3020 } 3021 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3022 3023 txr->hn_tx_chimney_tried++; 3024 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3025 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3026 return (NULL); 3027 txr->hn_tx_chimney++; 3028 3029 chim = txr->hn_sc->hn_chim + 3030 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3031 3032 if (txr->hn_agg_pktmax > 1 && 3033 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3034 txr->hn_agg_txd = txd; 3035 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3036 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3037 txr->hn_agg_prevpkt = chim; 3038 } 3039 return (chim); 3040 } 3041 3042 /* 3043 * NOTE: 3044 * If this function fails, then both txd and m_head0 will be freed. 3045 */ 3046 static int 3047 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3048 struct mbuf **m_head0) 3049 { 3050 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3051 int error, nsegs, i; 3052 struct mbuf *m_head = *m_head0; 3053 struct rndis_packet_msg *pkt; 3054 uint32_t *pi_data; 3055 void *chim = NULL; 3056 int pkt_hlen, pkt_size; 3057 3058 pkt = txd->rndis_pkt; 3059 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3060 if (pkt_size < txr->hn_chim_size) { 3061 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3062 if (chim != NULL) 3063 pkt = chim; 3064 } else { 3065 if (txr->hn_agg_txd != NULL) 3066 hn_flush_txagg(ifp, txr); 3067 } 3068 3069 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3070 pkt->rm_len = m_head->m_pkthdr.len; 3071 pkt->rm_dataoffset = 0; 3072 pkt->rm_datalen = m_head->m_pkthdr.len; 3073 pkt->rm_oobdataoffset = 0; 3074 pkt->rm_oobdatalen = 0; 3075 pkt->rm_oobdataelements = 0; 3076 pkt->rm_pktinfooffset = sizeof(*pkt); 3077 pkt->rm_pktinfolen = 0; 3078 pkt->rm_vchandle = 0; 3079 pkt->rm_reserved = 0; 3080 3081 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3082 /* 3083 * Set the hash value for this packet. 
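The value is exported to the host as an NDIS HASHVAL per-packet-info, appended right below.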
	 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);

		if (M_HASHTYPE_ISHASH(m_head))
			/*
			 * The flowid carries the hash value that the host
			 * set on the RX queue for a forwarded packet.  Reuse
			 * the same hash value, so the host can keep this
			 * flow on the CPU on which it was received.
			 */
			*pi_data = m_head->m_pkthdr.flowid;
		else
			/*
			 * Otherwise just use the TX queue index.
			 */
			*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
		} else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
		}
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Fixup RNDIS packet message total length */
	pkt->rm_len += pkt_hlen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
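	 * The RNDIS message and the packet data are copied into the
	 * txdesc's slot of the chimney (NVS send) buffer, so the packet
	 * is posted without building a GPA scatter/gather list.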
3166 */ 3167 if (chim != NULL) { 3168 struct hn_txdesc *tgt_txd = txd; 3169 3170 if (txr->hn_agg_txd != NULL) { 3171 tgt_txd = txr->hn_agg_txd; 3172 #ifdef INVARIANTS 3173 *m_head0 = NULL; 3174 #endif 3175 } 3176 3177 KASSERT(pkt == chim, 3178 ("RNDIS pkt not in chimney sending buffer")); 3179 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3180 ("chimney sending buffer is not used")); 3181 tgt_txd->chim_size += pkt->rm_len; 3182 3183 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3184 ((uint8_t *)chim) + pkt_hlen); 3185 3186 txr->hn_gpa_cnt = 0; 3187 txr->hn_sendpkt = hn_txpkt_chim; 3188 goto done; 3189 } 3190 3191 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3192 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3193 ("chimney buffer is used")); 3194 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3195 3196 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3197 if (__predict_false(error)) { 3198 int freed __diagused; 3199 3200 /* 3201 * This mbuf is not linked w/ the txd yet, so free it now. 3202 */ 3203 m_freem(m_head); 3204 *m_head0 = NULL; 3205 3206 freed = hn_txdesc_put(txr, txd); 3207 KASSERT(freed != 0, 3208 ("fail to free txd upon txdma error")); 3209 3210 txr->hn_txdma_failed++; 3211 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3212 return error; 3213 } 3214 *m_head0 = m_head; 3215 3216 /* +1 RNDIS packet message */ 3217 txr->hn_gpa_cnt = nsegs + 1; 3218 3219 /* send packet with page buffer */ 3220 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3221 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3222 txr->hn_gpa[0].gpa_len = pkt_hlen; 3223 3224 /* 3225 * Fill the page buffers with mbuf info after the page 3226 * buffer for RNDIS packet message. 3227 */ 3228 for (i = 0; i < nsegs; ++i) { 3229 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3230 3231 gpa->gpa_page = atop(segs[i].ds_addr); 3232 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3233 gpa->gpa_len = segs[i].ds_len; 3234 } 3235 3236 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3237 txd->chim_size = 0; 3238 txr->hn_sendpkt = hn_txpkt_sglist; 3239 done: 3240 txd->m = m_head; 3241 3242 /* Set the completion routine */ 3243 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3244 3245 /* Update temporary stats for later use. */ 3246 txr->hn_stat_pkts++; 3247 txr->hn_stat_size += m_head->m_pkthdr.len; 3248 if (m_head->m_flags & M_MCAST) 3249 txr->hn_stat_mcasts++; 3250 3251 return 0; 3252 } 3253 3254 /* 3255 * NOTE: 3256 * If this function fails, then txd will be freed, but the mbuf 3257 * associated w/ the txd will _not_ be freed. 3258 */ 3259 static int 3260 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3261 { 3262 int error, send_failed = 0, has_bpf; 3263 3264 again: 3265 has_bpf = bpf_peers_present(if_getbpf(ifp)); 3266 if (has_bpf) { 3267 /* 3268 * Make sure that this txd and any aggregated txds are not 3269 * freed before ETHER_BPF_MTAP. 
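The send completion (hn_txpkt_done()) may drop the last reference as soon as txr->hn_sendpkt() returns, so take an extra reference across the taps.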
3270 */ 3271 hn_txdesc_hold(txd); 3272 } 3273 error = txr->hn_sendpkt(txr, txd); 3274 if (!error) { 3275 if (has_bpf) { 3276 const struct hn_txdesc *tmp_txd; 3277 3278 ETHER_BPF_MTAP(ifp, txd->m); 3279 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3280 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3281 } 3282 3283 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3284 #ifdef HN_IFSTART_SUPPORT 3285 if (!hn_use_if_start) 3286 #endif 3287 { 3288 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3289 txr->hn_stat_size); 3290 if (txr->hn_stat_mcasts != 0) { 3291 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3292 txr->hn_stat_mcasts); 3293 } 3294 } 3295 txr->hn_pkts += txr->hn_stat_pkts; 3296 txr->hn_sends++; 3297 } 3298 if (has_bpf) 3299 hn_txdesc_put(txr, txd); 3300 3301 if (__predict_false(error)) { 3302 int freed __diagused; 3303 3304 /* 3305 * This should "really rarely" happen. 3306 * 3307 * XXX Too many RX to be acked or too many sideband 3308 * commands to run? Ask netvsc_channel_rollup() 3309 * to kick start later. 3310 */ 3311 txr->hn_has_txeof = 1; 3312 if (!send_failed) { 3313 txr->hn_send_failed++; 3314 send_failed = 1; 3315 /* 3316 * Try sending again after set hn_has_txeof; 3317 * in case that we missed the last 3318 * netvsc_channel_rollup(). 3319 */ 3320 goto again; 3321 } 3322 if_printf(ifp, "send failed\n"); 3323 3324 /* 3325 * Caller will perform further processing on the 3326 * associated mbuf, so don't free it in hn_txdesc_put(); 3327 * only unload it from the DMA map in hn_txdesc_put(), 3328 * if it was loaded. 3329 */ 3330 txd->m = NULL; 3331 freed = hn_txdesc_put(txr, txd); 3332 KASSERT(freed != 0, 3333 ("fail to free txd upon send error")); 3334 3335 txr->hn_send_failed++; 3336 } 3337 3338 /* Reset temporary stats, after this sending is done. */ 3339 txr->hn_stat_size = 0; 3340 txr->hn_stat_pkts = 0; 3341 txr->hn_stat_mcasts = 0; 3342 3343 return (error); 3344 } 3345 3346 /* 3347 * Append the specified data to the indicated mbuf chain, 3348 * Extend the mbuf chain if the new data does not fit in 3349 * existing space. 3350 * 3351 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3352 * There should be an equivalent in the kernel mbuf code, 3353 * but there does not appear to be one yet. 3354 * 3355 * Differs from m_append() in that additional mbufs are 3356 * allocated with cluster size MJUMPAGESIZE, and filled 3357 * accordingly. 3358 * 3359 * Return the last mbuf in the chain or NULL if failed to 3360 * allocate new mbuf. 3361 */ 3362 static struct mbuf * 3363 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3364 { 3365 struct mbuf *m, *n; 3366 int remainder, space; 3367 3368 for (m = m0; m->m_next != NULL; m = m->m_next) 3369 ; 3370 remainder = len; 3371 space = M_TRAILINGSPACE(m); 3372 if (space > 0) { 3373 /* 3374 * Copy into available space. 3375 */ 3376 if (space > remainder) 3377 space = remainder; 3378 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3379 m->m_len += space; 3380 cp += space; 3381 remainder -= space; 3382 } 3383 while (remainder > 0) { 3384 /* 3385 * Allocate a new mbuf; could check space 3386 * and allocate a cluster instead. 
3387 */ 3388 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3389 if (n == NULL) 3390 return NULL; 3391 n->m_len = min(MJUMPAGESIZE, remainder); 3392 bcopy(cp, mtod(n, caddr_t), n->m_len); 3393 cp += n->m_len; 3394 remainder -= n->m_len; 3395 m->m_next = n; 3396 m = n; 3397 } 3398 3399 return m; 3400 } 3401 3402 #if defined(INET) || defined(INET6) 3403 static __inline int 3404 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3405 { 3406 if (hn_lro_mbufq_depth) { 3407 tcp_lro_queue_mbuf(lc, m); 3408 return 0; 3409 } 3410 return tcp_lro_rx(lc, m, 0); 3411 } 3412 #endif 3413 3414 static int 3415 hn_rxpkt(struct hn_rx_ring *rxr) 3416 { 3417 if_t ifp, hn_ifp = rxr->hn_ifp; 3418 struct mbuf *m_new, *n; 3419 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3420 int hash_type = M_HASHTYPE_NONE; 3421 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3422 int i; 3423 3424 ifp = hn_ifp; 3425 if (rxr->hn_rxvf_ifp != NULL) { 3426 /* 3427 * Non-transparent mode VF; pretend this packet is from 3428 * the VF. 3429 */ 3430 ifp = rxr->hn_rxvf_ifp; 3431 is_vf = 1; 3432 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3433 /* Transparent mode VF. */ 3434 is_vf = 1; 3435 } 3436 3437 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { 3438 /* 3439 * NOTE: 3440 * See the NOTE of hn_rndis_init_fixat(). This 3441 * function can be reached, immediately after the 3442 * RNDIS is initialized but before the ifnet is 3443 * setup on the hn_attach() path; drop the unexpected 3444 * packets. 3445 */ 3446 return (0); 3447 } 3448 3449 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3450 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3451 return (0); 3452 } 3453 3454 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3455 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3456 if (m_new == NULL) { 3457 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3458 return (0); 3459 } 3460 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3461 rxr->rsc.frag_len[0]); 3462 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3463 } else { 3464 /* 3465 * Get an mbuf with a cluster. For packets 2K or less, 3466 * get a standard 2K cluster. For anything larger, get a 3467 * 4K cluster. Any buffers larger than 4K can cause problems 3468 * if looped around to the Hyper-V TX channel, so avoid them. 
3469 */ 3470 size = MCLBYTES; 3471 if (rxr->rsc.pktlen > MCLBYTES) { 3472 /* 4096 */ 3473 size = MJUMPAGESIZE; 3474 } 3475 3476 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3477 if (m_new == NULL) { 3478 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3479 return (0); 3480 } 3481 3482 n = m_new; 3483 for (i = 0; i < rxr->rsc.cnt; i++) { 3484 n = hv_m_append(n, rxr->rsc.frag_len[i], 3485 rxr->rsc.frag_data[i]); 3486 if (n == NULL) { 3487 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3488 return (0); 3489 } else { 3490 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3491 } 3492 } 3493 } 3494 if (rxr->rsc.pktlen <= MHLEN) 3495 rxr->hn_small_pkts++; 3496 3497 m_new->m_pkthdr.rcvif = ifp; 3498 3499 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) 3500 do_csum = 0; 3501 3502 /* receive side checksum offload */ 3503 if (rxr->rsc.csum_info != NULL) { 3504 /* IP csum offload */ 3505 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3506 m_new->m_pkthdr.csum_flags |= 3507 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3508 rxr->hn_csum_ip++; 3509 } 3510 3511 /* TCP/UDP csum offload */ 3512 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3513 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3514 m_new->m_pkthdr.csum_flags |= 3515 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3516 m_new->m_pkthdr.csum_data = 0xffff; 3517 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3518 rxr->hn_csum_tcp++; 3519 else 3520 rxr->hn_csum_udp++; 3521 } 3522 3523 /* 3524 * XXX 3525 * As of this write (Oct 28th, 2016), host side will turn 3526 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3527 * the do_lro setting here is actually _not_ accurate. We 3528 * depend on the RSS hash type check to reset do_lro. 3529 */ 3530 if ((*(rxr->rsc.csum_info) & 3531 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3532 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3533 do_lro = 1; 3534 } else { 3535 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3536 if (l3proto == ETHERTYPE_IP) { 3537 if (l4proto == IPPROTO_TCP) { 3538 if (do_csum && 3539 (rxr->hn_trust_hcsum & 3540 HN_TRUST_HCSUM_TCP)) { 3541 rxr->hn_csum_trusted++; 3542 m_new->m_pkthdr.csum_flags |= 3543 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3544 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3545 m_new->m_pkthdr.csum_data = 0xffff; 3546 } 3547 do_lro = 1; 3548 } else if (l4proto == IPPROTO_UDP) { 3549 if (do_csum && 3550 (rxr->hn_trust_hcsum & 3551 HN_TRUST_HCSUM_UDP)) { 3552 rxr->hn_csum_trusted++; 3553 m_new->m_pkthdr.csum_flags |= 3554 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3555 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3556 m_new->m_pkthdr.csum_data = 0xffff; 3557 } 3558 } else if (l4proto != IPPROTO_DONE && do_csum && 3559 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3560 rxr->hn_csum_trusted++; 3561 m_new->m_pkthdr.csum_flags |= 3562 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3563 } 3564 } 3565 } 3566 3567 if (rxr->rsc.vlan_info != NULL) { 3568 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3569 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3570 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3571 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3572 m_new->m_flags |= M_VLANTAG; 3573 } 3574 3575 /* 3576 * If VF is activated (tranparent/non-transparent mode does not 3577 * matter here). 3578 * 3579 * - Disable LRO 3580 * 3581 * hn(4) will only receive broadcast packets, multicast packets, 3582 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3583 * packet types. 
3584 * 3585 * For non-transparent, we definitely _cannot_ enable LRO at 3586 * all, since the LRO flush will use hn(4) as the receiving 3587 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3588 */ 3589 if (is_vf) 3590 do_lro = 0; 3591 3592 /* 3593 * If VF is activated (tranparent/non-transparent mode does not 3594 * matter here), do _not_ mess with unsupported hash types or 3595 * functions. 3596 */ 3597 if (rxr->rsc.hash_info != NULL) { 3598 rxr->hn_rss_pkts++; 3599 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3600 if (!is_vf) 3601 hash_type = M_HASHTYPE_OPAQUE_HASH; 3602 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3603 NDIS_HASH_FUNCTION_TOEPLITZ) { 3604 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3605 rxr->hn_mbuf_hash); 3606 3607 /* 3608 * NOTE: 3609 * do_lro is resetted, if the hash types are not TCP 3610 * related. See the comment in the above csum_flags 3611 * setup section. 3612 */ 3613 switch (type) { 3614 case NDIS_HASH_IPV4: 3615 hash_type = M_HASHTYPE_RSS_IPV4; 3616 do_lro = 0; 3617 break; 3618 3619 case NDIS_HASH_TCP_IPV4: 3620 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3621 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3622 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3623 3624 if (is_vf) 3625 def_htype = M_HASHTYPE_NONE; 3626 3627 /* 3628 * UDP 4-tuple hash is delivered as 3629 * TCP 4-tuple hash. 3630 */ 3631 if (l3proto == ETHERTYPE_MAX) { 3632 hn_rxpkt_proto(m_new, 3633 &l3proto, &l4proto); 3634 } 3635 if (l3proto == ETHERTYPE_IP) { 3636 if (l4proto == IPPROTO_UDP && 3637 (rxr->hn_mbuf_hash & 3638 NDIS_HASH_UDP_IPV4_X)) { 3639 hash_type = 3640 M_HASHTYPE_RSS_UDP_IPV4; 3641 do_lro = 0; 3642 } else if (l4proto != 3643 IPPROTO_TCP) { 3644 hash_type = def_htype; 3645 do_lro = 0; 3646 } 3647 } else { 3648 hash_type = def_htype; 3649 do_lro = 0; 3650 } 3651 } 3652 break; 3653 3654 case NDIS_HASH_IPV6: 3655 hash_type = M_HASHTYPE_RSS_IPV6; 3656 do_lro = 0; 3657 break; 3658 3659 case NDIS_HASH_IPV6_EX: 3660 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3661 do_lro = 0; 3662 break; 3663 3664 case NDIS_HASH_TCP_IPV6: 3665 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3666 break; 3667 3668 case NDIS_HASH_TCP_IPV6_EX: 3669 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3670 break; 3671 } 3672 } 3673 } else if (!is_vf) { 3674 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3675 hash_type = M_HASHTYPE_OPAQUE; 3676 } 3677 M_HASHTYPE_SET(m_new, hash_type); 3678 3679 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3680 if (hn_ifp != ifp) { 3681 const struct ether_header *eh; 3682 3683 /* 3684 * Non-transparent mode VF is activated. 3685 */ 3686 3687 /* 3688 * Allow tapping on hn(4). 3689 */ 3690 ETHER_BPF_MTAP(hn_ifp, m_new); 3691 3692 /* 3693 * Update hn(4)'s stats. 3694 */ 3695 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3696 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3697 /* Checked at the beginning of this function. */ 3698 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3699 eh = mtod(m_new, struct ether_header *); 3700 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3701 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3702 } 3703 rxr->hn_pkts++; 3704 3705 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { 3706 #if defined(INET) || defined(INET6) 3707 struct lro_ctrl *lro = &rxr->hn_lro; 3708 3709 if (lro->lro_cnt) { 3710 rxr->hn_lro_tried++; 3711 if (hn_lro_rx(lro, m_new) == 0) { 3712 /* DONE! 
*/ 3713 return 0; 3714 } 3715 } 3716 #endif 3717 } 3718 if_input(ifp, m_new); 3719 3720 return (0); 3721 } 3722 3723 static int 3724 hn_ioctl(if_t ifp, u_long cmd, caddr_t data) 3725 { 3726 struct hn_softc *sc = if_getsoftc(ifp); 3727 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3728 if_t vf_ifp; 3729 int mask, error = 0; 3730 struct ifrsskey *ifrk; 3731 struct ifrsshash *ifrh; 3732 uint32_t mtu; 3733 3734 switch (cmd) { 3735 case SIOCSIFMTU: 3736 if (ifr->ifr_mtu > HN_MTU_MAX) { 3737 error = EINVAL; 3738 break; 3739 } 3740 3741 HN_LOCK(sc); 3742 3743 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3744 HN_UNLOCK(sc); 3745 break; 3746 } 3747 3748 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3749 /* Can't change MTU */ 3750 HN_UNLOCK(sc); 3751 error = EOPNOTSUPP; 3752 break; 3753 } 3754 3755 if (if_getmtu(ifp) == ifr->ifr_mtu) { 3756 HN_UNLOCK(sc); 3757 break; 3758 } 3759 3760 if (hn_xpnt_vf_isready(sc)) { 3761 vf_ifp = sc->hn_vf_ifp; 3762 ifr_vf = *ifr; 3763 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), 3764 sizeof(ifr_vf.ifr_name)); 3765 error = ifhwioctl(SIOCSIFMTU,vf_ifp, 3766 (caddr_t)&ifr_vf, curthread); 3767 if (error) { 3768 HN_UNLOCK(sc); 3769 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3770 if_name(vf_ifp), ifr->ifr_mtu, error); 3771 break; 3772 } 3773 } 3774 3775 /* 3776 * Suspend this interface before the synthetic parts 3777 * are ripped. 3778 */ 3779 hn_suspend(sc); 3780 3781 /* 3782 * Detach the synthetics parts, i.e. NVS and RNDIS. 3783 */ 3784 hn_synth_detach(sc); 3785 3786 /* 3787 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3788 * with the new MTU setting. 3789 */ 3790 error = hn_synth_attach(sc, ifr->ifr_mtu); 3791 if (error) { 3792 HN_UNLOCK(sc); 3793 break; 3794 } 3795 3796 error = hn_rndis_get_mtu(sc, &mtu); 3797 if (error) 3798 mtu = ifr->ifr_mtu; 3799 else if (bootverbose) 3800 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3801 3802 /* 3803 * Commit the requested MTU, after the synthetic parts 3804 * have been successfully attached. 3805 */ 3806 if (mtu >= ifr->ifr_mtu) { 3807 mtu = ifr->ifr_mtu; 3808 } else { 3809 if_printf(ifp, "fixup mtu %d -> %u\n", 3810 ifr->ifr_mtu, mtu); 3811 } 3812 if_setmtu(ifp, mtu); 3813 3814 /* 3815 * Synthetic parts' reattach may change the chimney 3816 * sending size; update it. 3817 */ 3818 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3819 hn_set_chim_size(sc, sc->hn_chim_szmax); 3820 3821 /* 3822 * Make sure that various parameters based on MTU are 3823 * still valid, after the MTU change. 3824 */ 3825 hn_mtu_change_fixup(sc); 3826 3827 /* 3828 * All done! Resume the interface now. 3829 */ 3830 hn_resume(sc); 3831 3832 if ((sc->hn_flags & HN_FLAG_RXVF) || 3833 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3834 /* 3835 * Since we have reattached the NVS part, 3836 * change the datapath to VF again; in case 3837 * that it is lost, after the NVS was detached. 3838 */ 3839 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3840 } 3841 3842 HN_UNLOCK(sc); 3843 break; 3844 3845 case SIOCSIFFLAGS: 3846 HN_LOCK(sc); 3847 3848 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3849 HN_UNLOCK(sc); 3850 break; 3851 } 3852 3853 if (hn_xpnt_vf_isready(sc)) 3854 hn_xpnt_vf_saveifflags(sc); 3855 3856 if (if_getflags(ifp) & IFF_UP) { 3857 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3858 /* 3859 * Caller meight hold mutex, e.g. 3860 * bpf; use busy-wait for the RNDIS 3861 * reply. 
3862 */ 3863 HN_NO_SLEEPING(sc); 3864 hn_rxfilter_config(sc); 3865 HN_SLEEPING_OK(sc); 3866 3867 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3868 error = hn_xpnt_vf_iocsetflags(sc); 3869 } else { 3870 hn_init_locked(sc); 3871 } 3872 } else { 3873 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 3874 hn_stop(sc, false); 3875 } 3876 sc->hn_if_flags = if_getflags(ifp); 3877 3878 HN_UNLOCK(sc); 3879 break; 3880 3881 case SIOCSIFCAP: 3882 HN_LOCK(sc); 3883 3884 if (hn_xpnt_vf_isready(sc)) { 3885 ifr_vf = *ifr; 3886 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), 3887 sizeof(ifr_vf.ifr_name)); 3888 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3889 HN_UNLOCK(sc); 3890 break; 3891 } 3892 3893 /* 3894 * Fix up requested capabilities w/ supported capabilities, 3895 * since the supported capabilities could have been changed. 3896 */ 3897 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ 3898 if_getcapenable(ifp); 3899 3900 if (mask & IFCAP_TXCSUM) { 3901 if_togglecapenable(ifp, IFCAP_TXCSUM); 3902 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 3903 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); 3904 else 3905 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); 3906 } 3907 if (mask & IFCAP_TXCSUM_IPV6) { 3908 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); 3909 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 3910 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); 3911 else 3912 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); 3913 } 3914 3915 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3916 if (mask & IFCAP_RXCSUM) 3917 if_togglecapenable(ifp, IFCAP_RXCSUM); 3918 #ifdef foo 3919 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3920 if (mask & IFCAP_RXCSUM_IPV6) 3921 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); 3922 #endif 3923 3924 if (mask & IFCAP_LRO) 3925 if_togglecapenable(ifp, IFCAP_LRO); 3926 3927 if (mask & IFCAP_TSO4) { 3928 if_togglecapenable(ifp, IFCAP_TSO4); 3929 if (if_getcapenable(ifp) & IFCAP_TSO4) 3930 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 3931 else 3932 if_sethwassistbits(ifp, 0, CSUM_IP_TSO); 3933 } 3934 if (mask & IFCAP_TSO6) { 3935 if_togglecapenable(ifp, IFCAP_TSO6); 3936 if (if_getcapenable(ifp) & IFCAP_TSO6) 3937 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 3938 else 3939 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); 3940 } 3941 3942 HN_UNLOCK(sc); 3943 break; 3944 3945 case SIOCADDMULTI: 3946 case SIOCDELMULTI: 3947 HN_LOCK(sc); 3948 3949 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3950 HN_UNLOCK(sc); 3951 break; 3952 } 3953 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3954 /* 3955 * Multicast uses mutex; use busy-wait for 3956 * the RNDIS reply. 3957 */ 3958 HN_NO_SLEEPING(sc); 3959 hn_rxfilter_config(sc); 3960 HN_SLEEPING_OK(sc); 3961 } 3962 3963 /* XXX vlan(4) style mcast addr maintenance */ 3964 if (hn_xpnt_vf_isready(sc)) { 3965 int old_if_flags; 3966 3967 old_if_flags = if_getflags(sc->hn_vf_ifp); 3968 hn_xpnt_vf_saveifflags(sc); 3969 3970 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3971 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & 3972 IFF_ALLMULTI)) 3973 error = hn_xpnt_vf_iocsetflags(sc); 3974 } 3975 3976 HN_UNLOCK(sc); 3977 break; 3978 3979 case SIOCSIFMEDIA: 3980 case SIOCGIFMEDIA: 3981 HN_LOCK(sc); 3982 if (hn_xpnt_vf_isready(sc)) { 3983 /* 3984 * SIOCGIFMEDIA expects ifmediareq, so don't 3985 * create and pass ifr_vf to the VF here; just 3986 * replace the ifr_name. 
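 * Both struct ifreq and struct ifmediareq begin with the
 * interface name, so overwriting the leading name field is
 * safe for either request.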
3987 */ 3988 vf_ifp = sc->hn_vf_ifp; 3989 strlcpy(ifr->ifr_name, if_name(vf_ifp), 3990 sizeof(ifr->ifr_name)); 3991 error = ifhwioctl(cmd, vf_ifp, data, curthread); 3992 /* Restore the ifr_name. */ 3993 strlcpy(ifr->ifr_name, if_name(ifp), 3994 sizeof(ifr->ifr_name)); 3995 HN_UNLOCK(sc); 3996 break; 3997 } 3998 HN_UNLOCK(sc); 3999 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4000 break; 4001 4002 case SIOCGIFRSSHASH: 4003 ifrh = (struct ifrsshash *)data; 4004 HN_LOCK(sc); 4005 if (sc->hn_rx_ring_inuse == 1) { 4006 HN_UNLOCK(sc); 4007 ifrh->ifrh_func = RSS_FUNC_NONE; 4008 ifrh->ifrh_types = 0; 4009 break; 4010 } 4011 4012 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4013 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4014 else 4015 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4016 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4017 HN_UNLOCK(sc); 4018 break; 4019 4020 case SIOCGIFRSSKEY: 4021 ifrk = (struct ifrsskey *)data; 4022 HN_LOCK(sc); 4023 if (sc->hn_rx_ring_inuse == 1) { 4024 HN_UNLOCK(sc); 4025 ifrk->ifrk_func = RSS_FUNC_NONE; 4026 ifrk->ifrk_keylen = 0; 4027 break; 4028 } 4029 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4030 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4031 else 4032 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4033 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4034 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4035 NDIS_HASH_KEYSIZE_TOEPLITZ); 4036 HN_UNLOCK(sc); 4037 break; 4038 4039 default: 4040 error = ether_ioctl(ifp, cmd, data); 4041 break; 4042 } 4043 return (error); 4044 } 4045 4046 static void 4047 hn_stop(struct hn_softc *sc, bool detaching) 4048 { 4049 if_t ifp = sc->hn_ifp; 4050 int i; 4051 4052 HN_LOCK_ASSERT(sc); 4053 4054 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4055 ("synthetic parts were not attached")); 4056 4057 /* Clear RUNNING bit ASAP. */ 4058 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 4059 4060 /* Disable polling. */ 4061 hn_polling(sc, 0); 4062 4063 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4064 KASSERT(sc->hn_vf_ifp != NULL, 4065 ("%s: VF is not attached", if_name(ifp))); 4066 4067 /* Mark transparent mode VF as disabled. */ 4068 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4069 4070 /* 4071 * NOTE: 4072 * Datapath setting must happen _before_ bringing 4073 * the VF down. 4074 */ 4075 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4076 4077 /* 4078 * Bring the VF down. 4079 */ 4080 hn_xpnt_vf_saveifflags(sc); 4081 if_setflagbits(ifp, 0, IFF_UP); 4082 hn_xpnt_vf_iocsetflags(sc); 4083 } 4084 4085 /* Suspend data transfers. */ 4086 hn_suspend_data(sc); 4087 4088 /* Clear OACTIVE bit. */ 4089 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4090 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4091 sc->hn_tx_ring[i].hn_oactive = 0; 4092 4093 /* 4094 * If the non-transparent mode VF is active, make sure 4095 * that the RX filter still allows packet reception. 4096 */ 4097 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4098 hn_rxfilter_config(sc); 4099 } 4100 4101 static void 4102 hn_init_locked(struct hn_softc *sc) 4103 { 4104 if_t ifp = sc->hn_ifp; 4105 int i; 4106 4107 HN_LOCK_ASSERT(sc); 4108 4109 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4110 return; 4111 4112 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 4113 return; 4114 4115 /* Configure RX filter */ 4116 hn_rxfilter_config(sc); 4117 4118 /* Clear OACTIVE bit. */ 4119 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4120 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4121 sc->hn_tx_ring[i].hn_oactive = 0; 4122 4123 /* Clear TX 'suspended' bit. 
*/ 4124 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4125 4126 if (hn_xpnt_vf_isready(sc)) { 4127 /* Initialize transparent VF. */ 4128 hn_xpnt_vf_init(sc); 4129 } 4130 4131 /* Everything is ready; unleash! */ 4132 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); 4133 4134 /* Re-enable polling if requested. */ 4135 if (sc->hn_pollhz > 0) 4136 hn_polling(sc, sc->hn_pollhz); 4137 } 4138 4139 static void 4140 hn_init(void *xsc) 4141 { 4142 struct hn_softc *sc = xsc; 4143 4144 HN_LOCK(sc); 4145 hn_init_locked(sc); 4146 HN_UNLOCK(sc); 4147 } 4148 4149 static int 4150 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4151 { 4152 struct hn_softc *sc = arg1; 4153 unsigned int lenlim; 4154 int error; 4155 4156 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4157 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4158 if (error || req->newptr == NULL) 4159 return error; 4160 4161 HN_LOCK(sc); 4162 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4163 lenlim > TCP_LRO_LENGTH_MAX) { 4164 HN_UNLOCK(sc); 4165 return EINVAL; 4166 } 4167 hn_set_lro_lenlim(sc, lenlim); 4168 HN_UNLOCK(sc); 4169 4170 return 0; 4171 } 4172 4173 static int 4174 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4175 { 4176 struct hn_softc *sc = arg1; 4177 int ackcnt, error, i; 4178 4179 /* 4180 * lro_ackcnt_lim is append count limit, 4181 * +1 to turn it into aggregation limit. 4182 */ 4183 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4184 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4185 if (error || req->newptr == NULL) 4186 return error; 4187 4188 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4189 return EINVAL; 4190 4191 /* 4192 * Convert aggregation limit back to append 4193 * count limit. 4194 */ 4195 --ackcnt; 4196 HN_LOCK(sc); 4197 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4198 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4199 HN_UNLOCK(sc); 4200 return 0; 4201 } 4202 4203 static int 4204 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4205 { 4206 struct hn_softc *sc = arg1; 4207 int hcsum = arg2; 4208 int on, error, i; 4209 4210 on = 0; 4211 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4212 on = 1; 4213 4214 error = sysctl_handle_int(oidp, &on, 0, req); 4215 if (error || req->newptr == NULL) 4216 return error; 4217 4218 HN_LOCK(sc); 4219 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4220 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4221 4222 if (on) 4223 rxr->hn_trust_hcsum |= hcsum; 4224 else 4225 rxr->hn_trust_hcsum &= ~hcsum; 4226 } 4227 HN_UNLOCK(sc); 4228 return 0; 4229 } 4230 4231 static int 4232 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4233 { 4234 struct hn_softc *sc = arg1; 4235 int chim_size, error; 4236 4237 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4238 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4239 if (error || req->newptr == NULL) 4240 return error; 4241 4242 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4243 return EINVAL; 4244 4245 HN_LOCK(sc); 4246 hn_set_chim_size(sc, chim_size); 4247 HN_UNLOCK(sc); 4248 return 0; 4249 } 4250 4251 static int 4252 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4253 { 4254 struct hn_softc *sc = arg1; 4255 int ofs = arg2, i, error; 4256 struct hn_rx_ring *rxr; 4257 uint64_t stat; 4258 4259 stat = 0; 4260 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4261 rxr = &sc->hn_rx_ring[i]; 4262 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4263 } 4264 4265 error = sysctl_handle_64(oidp, &stat, 0, req); 4266 if (error || req->newptr == NULL) 4267 return error; 4268 4269 /* Zero out this stat. 
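 * Any write through this sysctl clears the per-ring counters
 * that were just summed; e.g. "sysctl dev.hn.0.lro_queued=0"
 * (unit 0 is only an example) resets the aggregated
 * LRO-queued count.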
*/ 4270 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4271 rxr = &sc->hn_rx_ring[i]; 4272 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4273 } 4274 return 0; 4275 } 4276 4277 static int 4278 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4279 { 4280 struct hn_softc *sc = arg1; 4281 int ofs = arg2, i, error; 4282 struct hn_rx_ring *rxr; 4283 u_long stat; 4284 4285 stat = 0; 4286 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4287 rxr = &sc->hn_rx_ring[i]; 4288 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4289 } 4290 4291 error = sysctl_handle_long(oidp, &stat, 0, req); 4292 if (error || req->newptr == NULL) 4293 return error; 4294 4295 /* Zero out this stat. */ 4296 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4297 rxr = &sc->hn_rx_ring[i]; 4298 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4299 } 4300 return 0; 4301 } 4302 4303 static int 4304 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4305 { 4306 struct hn_softc *sc = arg1; 4307 int ofs = arg2, i, error; 4308 struct hn_tx_ring *txr; 4309 u_long stat; 4310 4311 stat = 0; 4312 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4313 txr = &sc->hn_tx_ring[i]; 4314 stat += *((u_long *)((uint8_t *)txr + ofs)); 4315 } 4316 4317 error = sysctl_handle_long(oidp, &stat, 0, req); 4318 if (error || req->newptr == NULL) 4319 return error; 4320 4321 /* Zero out this stat. */ 4322 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4323 txr = &sc->hn_tx_ring[i]; 4324 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4325 } 4326 return 0; 4327 } 4328 4329 static int 4330 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4331 { 4332 struct hn_softc *sc = arg1; 4333 int ofs = arg2, i, error, conf; 4334 struct hn_tx_ring *txr; 4335 4336 txr = &sc->hn_tx_ring[0]; 4337 conf = *((int *)((uint8_t *)txr + ofs)); 4338 4339 error = sysctl_handle_int(oidp, &conf, 0, req); 4340 if (error || req->newptr == NULL) 4341 return error; 4342 4343 HN_LOCK(sc); 4344 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4345 txr = &sc->hn_tx_ring[i]; 4346 *((int *)((uint8_t *)txr + ofs)) = conf; 4347 } 4348 HN_UNLOCK(sc); 4349 4350 return 0; 4351 } 4352 4353 static int 4354 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4355 { 4356 struct hn_softc *sc = arg1; 4357 int error, size; 4358 4359 size = sc->hn_agg_size; 4360 error = sysctl_handle_int(oidp, &size, 0, req); 4361 if (error || req->newptr == NULL) 4362 return (error); 4363 4364 HN_LOCK(sc); 4365 sc->hn_agg_size = size; 4366 hn_set_txagg(sc); 4367 HN_UNLOCK(sc); 4368 4369 return (0); 4370 } 4371 4372 static int 4373 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4374 { 4375 struct hn_softc *sc = arg1; 4376 int error, pkts; 4377 4378 pkts = sc->hn_agg_pkts; 4379 error = sysctl_handle_int(oidp, &pkts, 0, req); 4380 if (error || req->newptr == NULL) 4381 return (error); 4382 4383 HN_LOCK(sc); 4384 sc->hn_agg_pkts = pkts; 4385 hn_set_txagg(sc); 4386 HN_UNLOCK(sc); 4387 4388 return (0); 4389 } 4390 4391 static int 4392 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4393 { 4394 struct hn_softc *sc = arg1; 4395 int pkts; 4396 4397 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4398 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4399 } 4400 4401 static int 4402 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4403 { 4404 struct hn_softc *sc = arg1; 4405 int align; 4406 4407 align = sc->hn_tx_ring[0].hn_agg_align; 4408 return (sysctl_handle_int(oidp, &align, 0, req)); 4409 } 4410 4411 static void 4412 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4413 { 4414 if (pollhz == 0) 4415 vmbus_chan_poll_disable(chan); 4416 else 4417 vmbus_chan_poll_enable(chan, pollhz); 4418 } 4419 4420 static void 4421 
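/*
 * Apply the requested polling rate to the primary channel and all
 * sub-channels; pollhz == 0 disables polling on every channel (see
 * hn_chan_polling() above).
 */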
hn_polling(struct hn_softc *sc, u_int pollhz) 4422 { 4423 int nsubch = sc->hn_rx_ring_inuse - 1; 4424 4425 HN_LOCK_ASSERT(sc); 4426 4427 if (nsubch > 0) { 4428 struct vmbus_channel **subch; 4429 int i; 4430 4431 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4432 for (i = 0; i < nsubch; ++i) 4433 hn_chan_polling(subch[i], pollhz); 4434 vmbus_subchan_rel(subch, nsubch); 4435 } 4436 hn_chan_polling(sc->hn_prichan, pollhz); 4437 } 4438 4439 static int 4440 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4441 { 4442 struct hn_softc *sc = arg1; 4443 int pollhz, error; 4444 4445 pollhz = sc->hn_pollhz; 4446 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4447 if (error || req->newptr == NULL) 4448 return (error); 4449 4450 if (pollhz != 0 && 4451 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4452 return (EINVAL); 4453 4454 HN_LOCK(sc); 4455 if (sc->hn_pollhz != pollhz) { 4456 sc->hn_pollhz = pollhz; 4457 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && 4458 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4459 hn_polling(sc, sc->hn_pollhz); 4460 } 4461 HN_UNLOCK(sc); 4462 4463 return (0); 4464 } 4465 4466 static int 4467 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4468 { 4469 struct hn_softc *sc = arg1; 4470 char verstr[16]; 4471 4472 snprintf(verstr, sizeof(verstr), "%u.%u", 4473 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4474 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4475 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4476 } 4477 4478 static int 4479 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4480 { 4481 struct hn_softc *sc = arg1; 4482 char caps_str[128]; 4483 uint32_t caps; 4484 4485 HN_LOCK(sc); 4486 caps = sc->hn_caps; 4487 HN_UNLOCK(sc); 4488 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4489 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4490 } 4491 4492 static int 4493 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4494 { 4495 struct hn_softc *sc = arg1; 4496 char assist_str[128]; 4497 uint32_t hwassist; 4498 4499 HN_LOCK(sc); 4500 hwassist = if_gethwassist(sc->hn_ifp); 4501 HN_UNLOCK(sc); 4502 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4503 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4504 } 4505 4506 static int 4507 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4508 { 4509 struct hn_softc *sc = arg1; 4510 char filter_str[128]; 4511 uint32_t filter; 4512 4513 HN_LOCK(sc); 4514 filter = sc->hn_rx_filter; 4515 HN_UNLOCK(sc); 4516 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4517 NDIS_PACKET_TYPES); 4518 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4519 } 4520 4521 static int 4522 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4523 { 4524 struct hn_softc *sc = arg1; 4525 uint32_t mtu; 4526 int error; 4527 HN_LOCK(sc); 4528 error = hn_rndis_get_mtu(sc, &mtu); 4529 if (error) { 4530 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4531 goto back; 4532 } 4533 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4534 if (error || req->newptr == NULL) 4535 goto back; 4536 4537 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4538 if (error) 4539 goto back; 4540 error = hn_rndis_reconf_offload(sc, mtu); 4541 back: 4542 HN_UNLOCK(sc); 4543 return (error); 4544 } 4545 #ifndef RSS 4546 4547 static int 4548 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4549 { 4550 struct hn_softc *sc = arg1; 4551 int error; 4552 4553 HN_LOCK(sc); 4554 4555 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4556 if (error || req->newptr == NULL) 
4557 goto back; 4558 4559 if ((sc->hn_flags & HN_FLAG_RXVF) || 4560 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4561 /* 4562 * RSS key is synchronized w/ VF's, don't allow users 4563 * to change it. 4564 */ 4565 error = EBUSY; 4566 goto back; 4567 } 4568 4569 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4570 if (error) 4571 goto back; 4572 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4573 4574 if (sc->hn_rx_ring_inuse > 1) { 4575 error = hn_rss_reconfig(sc); 4576 } else { 4577 /* Not RSS capable, at least for now; just save the RSS key. */ 4578 error = 0; 4579 } 4580 back: 4581 HN_UNLOCK(sc); 4582 return (error); 4583 } 4584 4585 static int 4586 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4587 { 4588 struct hn_softc *sc = arg1; 4589 int error; 4590 4591 HN_LOCK(sc); 4592 4593 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4594 if (error || req->newptr == NULL) 4595 goto back; 4596 4597 /* 4598 * Don't allow RSS indirect table change, if this interface is not 4599 * RSS capable currently. 4600 */ 4601 if (sc->hn_rx_ring_inuse == 1) { 4602 error = EOPNOTSUPP; 4603 goto back; 4604 } 4605 4606 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4607 if (error) 4608 goto back; 4609 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4610 4611 hn_rss_ind_fixup(sc); 4612 error = hn_rss_reconfig(sc); 4613 back: 4614 HN_UNLOCK(sc); 4615 return (error); 4616 } 4617 4618 #endif /* !RSS */ 4619 4620 static int 4621 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4622 { 4623 struct hn_softc *sc = arg1; 4624 char hash_str[128]; 4625 uint32_t hash; 4626 4627 HN_LOCK(sc); 4628 hash = sc->hn_rss_hash; 4629 HN_UNLOCK(sc); 4630 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4631 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4632 } 4633 4634 static int 4635 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4636 { 4637 struct hn_softc *sc = arg1; 4638 char hash_str[128]; 4639 uint32_t hash; 4640 4641 HN_LOCK(sc); 4642 hash = sc->hn_rss_hcap; 4643 HN_UNLOCK(sc); 4644 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4645 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4646 } 4647 4648 static int 4649 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4650 { 4651 struct hn_softc *sc = arg1; 4652 char hash_str[128]; 4653 uint32_t hash; 4654 4655 HN_LOCK(sc); 4656 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4657 HN_UNLOCK(sc); 4658 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4659 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4660 } 4661 4662 static int 4663 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4664 { 4665 struct hn_softc *sc = arg1; 4666 char vf_name[IFNAMSIZ + 1]; 4667 if_t vf_ifp; 4668 4669 HN_LOCK(sc); 4670 vf_name[0] = '\0'; 4671 vf_ifp = sc->hn_vf_ifp; 4672 if (vf_ifp != NULL) 4673 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4674 HN_UNLOCK(sc); 4675 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4676 } 4677 4678 static int 4679 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4680 { 4681 struct hn_softc *sc = arg1; 4682 char vf_name[IFNAMSIZ + 1]; 4683 if_t vf_ifp; 4684 4685 HN_LOCK(sc); 4686 vf_name[0] = '\0'; 4687 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4688 if (vf_ifp != NULL) 4689 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4690 HN_UNLOCK(sc); 4691 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4692 } 4693 4694 static int 4695 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4696 { 4697 struct rm_priotracker pt; 4698 struct sbuf *sb; 4699 
int error, i; 4700 bool first; 4701 4702 error = sysctl_wire_old_buffer(req, 0); 4703 if (error != 0) 4704 return (error); 4705 4706 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4707 if (sb == NULL) 4708 return (ENOMEM); 4709 4710 rm_rlock(&hn_vfmap_lock, &pt); 4711 4712 first = true; 4713 for (i = 0; i < hn_vfmap_size; ++i) { 4714 struct epoch_tracker et; 4715 if_t ifp; 4716 4717 if (hn_vfmap[i] == NULL) 4718 continue; 4719 4720 NET_EPOCH_ENTER(et); 4721 ifp = ifnet_byindex(i); 4722 if (ifp != NULL) { 4723 if (first) 4724 sbuf_printf(sb, "%s", if_name(ifp)); 4725 else 4726 sbuf_printf(sb, " %s", if_name(ifp)); 4727 first = false; 4728 } 4729 NET_EPOCH_EXIT(et); 4730 } 4731 4732 rm_runlock(&hn_vfmap_lock, &pt); 4733 4734 error = sbuf_finish(sb); 4735 sbuf_delete(sb); 4736 return (error); 4737 } 4738 4739 static int 4740 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4741 { 4742 struct rm_priotracker pt; 4743 struct sbuf *sb; 4744 int error, i; 4745 bool first; 4746 4747 error = sysctl_wire_old_buffer(req, 0); 4748 if (error != 0) 4749 return (error); 4750 4751 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4752 if (sb == NULL) 4753 return (ENOMEM); 4754 4755 rm_rlock(&hn_vfmap_lock, &pt); 4756 4757 first = true; 4758 for (i = 0; i < hn_vfmap_size; ++i) { 4759 struct epoch_tracker et; 4760 if_t ifp, hn_ifp; 4761 4762 hn_ifp = hn_vfmap[i]; 4763 if (hn_ifp == NULL) 4764 continue; 4765 4766 NET_EPOCH_ENTER(et); 4767 ifp = ifnet_byindex(i); 4768 if (ifp != NULL) { 4769 if (first) { 4770 sbuf_printf(sb, "%s:%s", if_name(ifp), 4771 if_name(hn_ifp)); 4772 } else { 4773 sbuf_printf(sb, " %s:%s", if_name(ifp), 4774 if_name(hn_ifp)); 4775 } 4776 first = false; 4777 } 4778 NET_EPOCH_EXIT(et); 4779 } 4780 4781 rm_runlock(&hn_vfmap_lock, &pt); 4782 4783 error = sbuf_finish(sb); 4784 sbuf_delete(sb); 4785 return (error); 4786 } 4787 4788 static int 4789 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4790 { 4791 struct hn_softc *sc = arg1; 4792 int error, onoff = 0; 4793 4794 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4795 onoff = 1; 4796 error = sysctl_handle_int(oidp, &onoff, 0, req); 4797 if (error || req->newptr == NULL) 4798 return (error); 4799 4800 HN_LOCK(sc); 4801 /* NOTE: hn_vf_lock for hn_transmit() */ 4802 rm_wlock(&sc->hn_vf_lock); 4803 if (onoff) 4804 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4805 else 4806 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4807 rm_wunlock(&sc->hn_vf_lock); 4808 HN_UNLOCK(sc); 4809 4810 return (0); 4811 } 4812 4813 static int 4814 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4815 { 4816 struct hn_softc *sc = arg1; 4817 int enabled = 0; 4818 4819 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4820 enabled = 1; 4821 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4822 } 4823 4824 static int 4825 hn_check_iplen(const struct mbuf *m, int hoff) 4826 { 4827 const struct ip *ip; 4828 int len, iphlen, iplen; 4829 const struct tcphdr *th; 4830 int thoff; /* TCP data offset */ 4831 4832 len = hoff + sizeof(struct ip); 4833 4834 /* The packet must be at least the size of an IP header. */ 4835 if (m->m_pkthdr.len < len) 4836 return IPPROTO_DONE; 4837 4838 /* The fixed IP header must reside completely in the first mbuf. */ 4839 if (m->m_len < len) 4840 return IPPROTO_DONE; 4841 4842 ip = mtodo(m, hoff); 4843 4844 /* Bound check the packet's stated IP header length. */ 4845 iphlen = ip->ip_hl << 2; 4846 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4847 return IPPROTO_DONE; 4848 4849 /* The full IP header must reside completely in the one mbuf. 
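 * (IP options included), since the TCP/UDP header checks
 * below index past it.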
*/ 4850 if (m->m_len < hoff + iphlen) 4851 return IPPROTO_DONE; 4852 4853 iplen = ntohs(ip->ip_len); 4854 4855 /* 4856 * Check that the amount of data in the buffers is as 4857 * at least much as the IP header would have us expect. 4858 */ 4859 if (m->m_pkthdr.len < hoff + iplen) 4860 return IPPROTO_DONE; 4861 4862 /* 4863 * Ignore IP fragments. 4864 */ 4865 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4866 return IPPROTO_DONE; 4867 4868 /* 4869 * The TCP/IP or UDP/IP header must be entirely contained within 4870 * the first fragment of a packet. 4871 */ 4872 switch (ip->ip_p) { 4873 case IPPROTO_TCP: 4874 if (iplen < iphlen + sizeof(struct tcphdr)) 4875 return IPPROTO_DONE; 4876 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4877 return IPPROTO_DONE; 4878 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4879 thoff = th->th_off << 2; 4880 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4881 return IPPROTO_DONE; 4882 if (m->m_len < hoff + iphlen + thoff) 4883 return IPPROTO_DONE; 4884 break; 4885 case IPPROTO_UDP: 4886 if (iplen < iphlen + sizeof(struct udphdr)) 4887 return IPPROTO_DONE; 4888 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4889 return IPPROTO_DONE; 4890 break; 4891 default: 4892 if (iplen < iphlen) 4893 return IPPROTO_DONE; 4894 break; 4895 } 4896 return ip->ip_p; 4897 } 4898 4899 static void 4900 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4901 { 4902 const struct ether_header *eh; 4903 uint16_t etype; 4904 int hoff; 4905 4906 hoff = sizeof(*eh); 4907 /* Checked at the beginning of this function. */ 4908 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4909 4910 eh = mtod(m_new, const struct ether_header *); 4911 etype = ntohs(eh->ether_type); 4912 if (etype == ETHERTYPE_VLAN) { 4913 const struct ether_vlan_header *evl; 4914 4915 hoff = sizeof(*evl); 4916 if (m_new->m_len < hoff) 4917 return; 4918 evl = mtod(m_new, const struct ether_vlan_header *); 4919 etype = ntohs(evl->evl_proto); 4920 } 4921 *l3proto = etype; 4922 4923 if (etype == ETHERTYPE_IP) 4924 *l4proto = hn_check_iplen(m_new, hoff); 4925 else 4926 *l4proto = IPPROTO_DONE; 4927 } 4928 4929 static int 4930 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4931 { 4932 struct sysctl_oid_list *child; 4933 struct sysctl_ctx_list *ctx; 4934 device_t dev = sc->hn_dev; 4935 #if defined(INET) || defined(INET6) 4936 int lroent_cnt; 4937 #endif 4938 int i; 4939 4940 /* 4941 * Create RXBUF for reception. 4942 * 4943 * NOTE: 4944 * - It is shared by all channels. 4945 * - A large enough buffer is allocated, certain version of NVSes 4946 * may further limit the usable space. 
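 *   (how much of it is actually used depends on the NVS
 *   version negotiated when the synthetic parts are attached)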
4947 */ 4948 sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 4949 0ul, ~0ul, PAGE_SIZE, 0); 4950 if (sc->hn_rxbuf == NULL) { 4951 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4952 return (ENOMEM); 4953 } 4954 4955 sc->hn_rx_ring_cnt = ring_cnt; 4956 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4957 4958 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4959 M_DEVBUF, M_WAITOK | M_ZERO); 4960 4961 #if defined(INET) || defined(INET6) 4962 lroent_cnt = hn_lro_entry_count; 4963 if (lroent_cnt < TCP_LRO_ENTRIES) 4964 lroent_cnt = TCP_LRO_ENTRIES; 4965 if (bootverbose) 4966 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4967 #endif /* INET || INET6 */ 4968 4969 ctx = device_get_sysctl_ctx(dev); 4970 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4971 4972 /* Create dev.hn.UNIT.rx sysctl tree */ 4973 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4974 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4975 4976 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4977 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4978 4979 rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, 4980 M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); 4981 if (rxr->hn_br == NULL) { 4982 device_printf(dev, "allocate bufring failed\n"); 4983 return (ENOMEM); 4984 } 4985 4986 if (hn_trust_hosttcp) 4987 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4988 if (hn_trust_hostudp) 4989 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4990 if (hn_trust_hostip) 4991 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4992 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4993 rxr->hn_ifp = sc->hn_ifp; 4994 if (i < sc->hn_tx_ring_cnt) 4995 rxr->hn_txr = &sc->hn_tx_ring[i]; 4996 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4997 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4998 rxr->hn_rx_idx = i; 4999 rxr->hn_rxbuf = sc->hn_rxbuf; 5000 5001 /* 5002 * Initialize LRO. 
5003 */ 5004 #if defined(INET) || defined(INET6) 5005 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5006 hn_lro_mbufq_depth); 5007 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5008 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5009 #endif /* INET || INET6 */ 5010 5011 if (sc->hn_rx_sysctl_tree != NULL) { 5012 char name[16]; 5013 5014 /* 5015 * Create per RX ring sysctl tree: 5016 * dev.hn.UNIT.rx.RINGID 5017 */ 5018 snprintf(name, sizeof(name), "%d", i); 5019 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5020 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5021 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5022 5023 if (rxr->hn_rx_sysctl_tree != NULL) { 5024 SYSCTL_ADD_ULONG(ctx, 5025 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5026 OID_AUTO, "packets", 5027 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5028 "# of packets received"); 5029 SYSCTL_ADD_ULONG(ctx, 5030 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5031 OID_AUTO, "rss_pkts", 5032 CTLFLAG_RW | CTLFLAG_STATS, 5033 &rxr->hn_rss_pkts, 5034 "# of packets w/ RSS info received"); 5035 SYSCTL_ADD_ULONG(ctx, 5036 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5037 OID_AUTO, "rsc_pkts", 5038 CTLFLAG_RW | CTLFLAG_STATS, 5039 &rxr->hn_rsc_pkts, 5040 "# of RSC packets received"); 5041 SYSCTL_ADD_ULONG(ctx, 5042 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5043 OID_AUTO, "rsc_drop", 5044 CTLFLAG_RW | CTLFLAG_STATS, 5045 &rxr->hn_rsc_drop, 5046 "# of RSC fragments dropped"); 5047 SYSCTL_ADD_INT(ctx, 5048 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5049 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5050 &rxr->hn_pktbuf_len, 0, 5051 "Temporary channel packet buffer length"); 5052 } 5053 } 5054 } 5055 5056 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5057 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5058 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5059 hn_rx_stat_u64_sysctl, 5060 "LU", "LRO queued"); 5061 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5062 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5063 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5064 hn_rx_stat_u64_sysctl, 5065 "LU", "LRO flushed"); 5066 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5067 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5068 __offsetof(struct hn_rx_ring, hn_lro_tried), 5069 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5070 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5071 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5072 hn_lro_lenlim_sysctl, "IU", 5073 "Max # of data bytes to be aggregated by LRO"); 5074 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5075 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5076 hn_lro_ackcnt_sysctl, "I", 5077 "Max # of ACKs to be aggregated by LRO"); 5078 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5079 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5080 hn_trust_hcsum_sysctl, "I", 5081 "Trust tcp segment verification on host side, " 5082 "when csum info is missing"); 5083 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5084 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5085 hn_trust_hcsum_sysctl, "I", 5086 "Trust udp datagram verification on host side, " 5087 "when csum info is missing"); 5088 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5089 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5090 hn_trust_hcsum_sysctl, "I", 5091 "Trust ip packet verification on host side, " 5092 "when csum info is missing"); 5093 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5094 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5095 __offsetof(struct hn_rx_ring, hn_csum_ip), 5096 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5097 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5098 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5099 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5100 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5101 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5102 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5103 __offsetof(struct hn_rx_ring, hn_csum_udp), 5104 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5105 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5106 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5107 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5108 hn_rx_stat_ulong_sysctl, "LU", 5109 "# of packets that we trust host's csum verification"); 5110 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5111 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5112 __offsetof(struct hn_rx_ring, hn_small_pkts), 5113 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5114 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5115 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5116 __offsetof(struct hn_rx_ring, hn_ack_failed), 5117 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5118 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5119 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5120 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5121 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5122 5123 return (0); 5124 } 5125 5126 static void 5127 hn_destroy_rx_data(struct hn_softc *sc) 5128 { 5129 int i; 5130 5131 if (sc->hn_rxbuf != NULL) { 5132 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5133 contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); 5134 else 5135 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5136 sc->hn_rxbuf = NULL; 5137 } 5138 5139 if (sc->hn_rx_ring_cnt == 0) 5140 return; 5141 5142 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5143 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5144 5145 if (rxr->hn_br == NULL) 5146 continue; 5147 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5148 contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, 5149 M_DEVBUF); 5150 } else { 5151 device_printf(sc->hn_dev, 5152 "%dth channel bufring is referenced", i); 5153 } 5154 rxr->hn_br = NULL; 5155 5156 #if defined(INET) || defined(INET6) 5157 tcp_lro_free(&rxr->hn_lro); 5158 #endif 5159 free(rxr->hn_pktbuf, M_DEVBUF); 5160 } 5161 free(sc->hn_rx_ring, M_DEVBUF); 5162 sc->hn_rx_ring = NULL; 5163 5164 sc->hn_rx_ring_cnt = 0; 5165 sc->hn_rx_ring_inuse = 0; 5166 } 5167 5168 static int 5169 hn_tx_ring_create(struct hn_softc *sc, int id) 5170 { 5171 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5172 device_t dev = sc->hn_dev; 5173 bus_dma_tag_t parent_dtag; 5174 int error, i; 5175 5176 txr->hn_sc = sc; 5177 txr->hn_tx_idx = id; 5178 5179 #ifndef HN_USE_TXDESC_BUFRING 5180 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5181 #endif 5182 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5183 5184 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5185 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5186 M_DEVBUF, M_WAITOK | M_ZERO); 5187 #ifndef HN_USE_TXDESC_BUFRING 5188 SLIST_INIT(&txr->hn_txlist); 5189 #else 5190 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5191 M_WAITOK, &txr->hn_tx_lock); 5192 #endif 5193 5194 if (hn_tx_taskq_mode 
== HN_TX_TASKQ_M_EVTTQ) { 5195 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5196 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5197 } else { 5198 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5199 } 5200 5201 #ifdef HN_IFSTART_SUPPORT 5202 if (hn_use_if_start) { 5203 txr->hn_txeof = hn_start_txeof; 5204 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5205 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5206 } else 5207 #endif 5208 { 5209 int br_depth; 5210 5211 txr->hn_txeof = hn_xmit_txeof; 5212 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5213 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5214 5215 br_depth = hn_get_txswq_depth(txr); 5216 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5217 M_WAITOK, &txr->hn_tx_lock); 5218 } 5219 5220 txr->hn_direct_tx_size = hn_direct_tx_size; 5221 5222 /* 5223 * Always schedule transmission instead of trying to do direct 5224 * transmission. This one gives the best performance so far. 5225 */ 5226 txr->hn_sched_tx = 1; 5227 5228 parent_dtag = bus_get_dma_tag(dev); 5229 5230 /* DMA tag for RNDIS packet messages. */ 5231 error = bus_dma_tag_create(parent_dtag, /* parent */ 5232 HN_RNDIS_PKT_ALIGN, /* alignment */ 5233 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5234 BUS_SPACE_MAXADDR, /* lowaddr */ 5235 BUS_SPACE_MAXADDR, /* highaddr */ 5236 NULL, NULL, /* filter, filterarg */ 5237 HN_RNDIS_PKT_LEN, /* maxsize */ 5238 1, /* nsegments */ 5239 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5240 0, /* flags */ 5241 NULL, /* lockfunc */ 5242 NULL, /* lockfuncarg */ 5243 &txr->hn_tx_rndis_dtag); 5244 if (error) { 5245 device_printf(dev, "failed to create rndis dmatag\n"); 5246 return error; 5247 } 5248 5249 /* DMA tag for data. */ 5250 error = bus_dma_tag_create(parent_dtag, /* parent */ 5251 1, /* alignment */ 5252 HN_TX_DATA_BOUNDARY, /* boundary */ 5253 BUS_SPACE_MAXADDR, /* lowaddr */ 5254 BUS_SPACE_MAXADDR, /* highaddr */ 5255 NULL, NULL, /* filter, filterarg */ 5256 HN_TX_DATA_MAXSIZE, /* maxsize */ 5257 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5258 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5259 0, /* flags */ 5260 NULL, /* lockfunc */ 5261 NULL, /* lockfuncarg */ 5262 &txr->hn_tx_data_dtag); 5263 if (error) { 5264 device_printf(dev, "failed to create data dmatag\n"); 5265 return error; 5266 } 5267 5268 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5269 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5270 5271 txd->txr = txr; 5272 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5273 STAILQ_INIT(&txd->agg_list); 5274 5275 /* 5276 * Allocate and load RNDIS packet message. 5277 */ 5278 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5279 (void **)&txd->rndis_pkt, 5280 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5281 &txd->rndis_pkt_dmap); 5282 if (error) { 5283 device_printf(dev, 5284 "failed to allocate rndis_packet_msg, %d\n", i); 5285 return error; 5286 } 5287 5288 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5289 txd->rndis_pkt_dmap, 5290 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5291 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5292 BUS_DMA_NOWAIT); 5293 if (error) { 5294 device_printf(dev, 5295 "failed to load rndis_packet_msg, %d\n", i); 5296 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5297 txd->rndis_pkt, txd->rndis_pkt_dmap); 5298 return error; 5299 } 5300 5301 /* DMA map for TX data. 
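 * Only the map is created here; it is loaded with the mbuf
 * chain at transmit time.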
*/ 5302 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5303 &txd->data_dmap); 5304 if (error) { 5305 device_printf(dev, 5306 "failed to allocate tx data dmamap\n"); 5307 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5308 txd->rndis_pkt_dmap); 5309 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5310 txd->rndis_pkt, txd->rndis_pkt_dmap); 5311 return error; 5312 } 5313 5314 /* All set, put it to list */ 5315 txd->flags |= HN_TXD_FLAG_ONLIST; 5316 #ifndef HN_USE_TXDESC_BUFRING 5317 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5318 #else 5319 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5320 #endif 5321 } 5322 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5323 5324 if (sc->hn_tx_sysctl_tree != NULL) { 5325 struct sysctl_oid_list *child; 5326 struct sysctl_ctx_list *ctx; 5327 char name[16]; 5328 5329 /* 5330 * Create per TX ring sysctl tree: 5331 * dev.hn.UNIT.tx.RINGID 5332 */ 5333 ctx = device_get_sysctl_ctx(dev); 5334 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5335 5336 snprintf(name, sizeof(name), "%d", id); 5337 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5338 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5339 5340 if (txr->hn_tx_sysctl_tree != NULL) { 5341 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5342 5343 #ifdef HN_DEBUG 5344 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5345 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5346 "# of available TX descs"); 5347 #endif 5348 #ifdef HN_IFSTART_SUPPORT 5349 if (!hn_use_if_start) 5350 #endif 5351 { 5352 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5353 CTLFLAG_RD, &txr->hn_oactive, 0, 5354 "over active"); 5355 } 5356 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5357 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5358 "# of packets transmitted"); 5359 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5360 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5361 "# of sends"); 5362 } 5363 } 5364 5365 return 0; 5366 } 5367 5368 static void 5369 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5370 { 5371 struct hn_tx_ring *txr = txd->txr; 5372 5373 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5374 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5375 5376 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5377 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5378 txd->rndis_pkt_dmap); 5379 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5380 } 5381 5382 static void 5383 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5384 { 5385 5386 KASSERT(txd->refs == 0 || txd->refs == 1, 5387 ("invalid txd refs %d", txd->refs)); 5388 5389 /* Aggregated txds will be freed by their aggregating txd. */ 5390 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5391 int freed __diagused; 5392 5393 freed = hn_txdesc_put(txr, txd); 5394 KASSERT(freed, ("can't free txdesc")); 5395 } 5396 } 5397 5398 static void 5399 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5400 { 5401 int i; 5402 5403 if (txr->hn_txdesc == NULL) 5404 return; 5405 5406 /* 5407 * NOTE: 5408 * Because the freeing of aggregated txds will be deferred 5409 * to the aggregating txd, two passes are used here: 5410 * - The first pass GCes any pending txds. This GC is necessary, 5411 * since if the channels are revoked, hypervisor will not 5412 * deliver send-done for all pending txds. 5413 * - The second pass frees the busdma stuffs, i.e. after all txds 5414 * were freed. 
5415 */ 5416 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5417 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5418 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5419 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5420 5421 if (txr->hn_tx_data_dtag != NULL) 5422 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5423 if (txr->hn_tx_rndis_dtag != NULL) 5424 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5425 5426 #ifdef HN_USE_TXDESC_BUFRING 5427 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5428 #endif 5429 5430 free(txr->hn_txdesc, M_DEVBUF); 5431 txr->hn_txdesc = NULL; 5432 5433 if (txr->hn_mbuf_br != NULL) 5434 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5435 5436 #ifndef HN_USE_TXDESC_BUFRING 5437 mtx_destroy(&txr->hn_txlist_spin); 5438 #endif 5439 mtx_destroy(&txr->hn_tx_lock); 5440 } 5441 5442 static int 5443 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5444 { 5445 struct sysctl_oid_list *child; 5446 struct sysctl_ctx_list *ctx; 5447 int i; 5448 5449 /* 5450 * Create TXBUF for chimney sending. 5451 * 5452 * NOTE: It is shared by all channels. 5453 */ 5454 sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 5455 0ul, ~0ul, PAGE_SIZE, 0); 5456 if (sc->hn_chim == NULL) { 5457 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5458 return (ENOMEM); 5459 } 5460 5461 sc->hn_tx_ring_cnt = ring_cnt; 5462 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5463 5464 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5465 M_DEVBUF, M_WAITOK | M_ZERO); 5466 5467 ctx = device_get_sysctl_ctx(sc->hn_dev); 5468 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5469 5470 /* Create dev.hn.UNIT.tx sysctl tree */ 5471 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5472 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5473 5474 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5475 int error; 5476 5477 error = hn_tx_ring_create(sc, i); 5478 if (error) 5479 return error; 5480 } 5481 5482 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5483 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5484 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5485 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5486 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5487 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5488 __offsetof(struct hn_tx_ring, hn_send_failed), 5489 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5490 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5491 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5492 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5493 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5494 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5495 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5496 __offsetof(struct hn_tx_ring, hn_flush_failed), 5497 hn_tx_stat_ulong_sysctl, "LU", 5498 "# of packet transmission aggregation flush failure"); 5499 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5500 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5501 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5502 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5503 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5504 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5505 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5506 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5507 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5508 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 
5509 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5510 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5511 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5512 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5513 "# of total TX descs"); 5514 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5515 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5516 "Chimney send packet size upper boundary"); 5517 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5518 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5519 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5520 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5521 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5522 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5523 hn_tx_conf_int_sysctl, "I", 5524 "Size of the packet for direct transmission"); 5525 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5526 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5527 __offsetof(struct hn_tx_ring, hn_sched_tx), 5528 hn_tx_conf_int_sysctl, "I", 5529 "Always schedule transmission " 5530 "instead of doing direct transmission"); 5531 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5532 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5533 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5534 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5535 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5536 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5537 "Applied packet transmission aggregation size"); 5538 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5539 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5540 hn_txagg_pktmax_sysctl, "I", 5541 "Applied packet transmission aggregation packets"); 5542 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5543 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5544 hn_txagg_align_sysctl, "I", 5545 "Applied packet transmission aggregation alignment"); 5546 5547 return 0; 5548 } 5549 5550 static void 5551 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5552 { 5553 int i; 5554 5555 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5556 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5557 } 5558 5559 static void 5560 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5561 { 5562 if_t ifp = sc->hn_ifp; 5563 u_int hw_tsomax; 5564 int tso_minlen; 5565 5566 HN_LOCK_ASSERT(sc); 5567 5568 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5569 return; 5570 5571 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5572 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5573 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5574 5575 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5576 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5577 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5578 5579 if (tso_maxlen < tso_minlen) 5580 tso_maxlen = tso_minlen; 5581 else if (tso_maxlen > IP_MAXPACKET) 5582 tso_maxlen = IP_MAXPACKET; 5583 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5584 tso_maxlen = sc->hn_ndis_tso_szmax; 5585 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5586 5587 if (hn_xpnt_vf_isready(sc)) { 5588 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) 5589 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); 5590 } 5591 if_sethwtsomax(ifp, hw_tsomax); 5592 if (bootverbose) 5593 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); 5594 } 5595 5596 static void 5597 hn_fixup_tx_data(struct hn_softc *sc) 5598 { 5599 uint64_t csum_assist; 5600 int i; 5601 5602 hn_set_chim_size(sc, sc->hn_chim_szmax); 5603 if (hn_tx_chimney_size > 0 && 5604 
hn_tx_chimney_size < sc->hn_chim_szmax) 5605 hn_set_chim_size(sc, hn_tx_chimney_size); 5606 5607 csum_assist = 0; 5608 if (sc->hn_caps & HN_CAP_IPCS) 5609 csum_assist |= CSUM_IP; 5610 if (sc->hn_caps & HN_CAP_TCP4CS) 5611 csum_assist |= CSUM_IP_TCP; 5612 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5613 csum_assist |= CSUM_IP_UDP; 5614 if (sc->hn_caps & HN_CAP_TCP6CS) 5615 csum_assist |= CSUM_IP6_TCP; 5616 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5617 csum_assist |= CSUM_IP6_UDP; 5618 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5619 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5620 5621 if (sc->hn_caps & HN_CAP_HASHVAL) { 5622 /* 5623 * Support HASHVAL pktinfo on TX path. 5624 */ 5625 if (bootverbose) 5626 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5627 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5628 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5629 } 5630 } 5631 5632 static void 5633 hn_fixup_rx_data(struct hn_softc *sc) 5634 { 5635 5636 if (sc->hn_caps & HN_CAP_UDPHASH) { 5637 int i; 5638 5639 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5640 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5641 } 5642 } 5643 5644 static void 5645 hn_destroy_tx_data(struct hn_softc *sc) 5646 { 5647 int i; 5648 5649 if (sc->hn_chim != NULL) { 5650 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5651 contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); 5652 } else { 5653 device_printf(sc->hn_dev, 5654 "chimney sending buffer is referenced"); 5655 } 5656 sc->hn_chim = NULL; 5657 } 5658 5659 if (sc->hn_tx_ring_cnt == 0) 5660 return; 5661 5662 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5663 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5664 5665 free(sc->hn_tx_ring, M_DEVBUF); 5666 sc->hn_tx_ring = NULL; 5667 5668 sc->hn_tx_ring_cnt = 0; 5669 sc->hn_tx_ring_inuse = 0; 5670 } 5671 5672 #ifdef HN_IFSTART_SUPPORT 5673 5674 static void 5675 hn_start_taskfunc(void *xtxr, int pending __unused) 5676 { 5677 struct hn_tx_ring *txr = xtxr; 5678 5679 mtx_lock(&txr->hn_tx_lock); 5680 hn_start_locked(txr, 0); 5681 mtx_unlock(&txr->hn_tx_lock); 5682 } 5683 5684 static int 5685 hn_start_locked(struct hn_tx_ring *txr, int len) 5686 { 5687 struct hn_softc *sc = txr->hn_sc; 5688 if_t ifp = sc->hn_ifp; 5689 int sched = 0; 5690 5691 KASSERT(hn_use_if_start, 5692 ("hn_start_locked is called, when if_start is disabled")); 5693 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5694 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5695 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5696 5697 if (__predict_false(txr->hn_suspended)) 5698 return (0); 5699 5700 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5701 IFF_DRV_RUNNING) 5702 return (0); 5703 5704 while (!if_sendq_empty(ifp)) { 5705 struct hn_txdesc *txd; 5706 struct mbuf *m_head; 5707 int error; 5708 5709 m_head = if_dequeue(ifp); 5710 if (m_head == NULL) 5711 break; 5712 5713 if (len > 0 && m_head->m_pkthdr.len > len) { 5714 /* 5715 * This sending could be time consuming; let callers 5716 * dispatch this packet sending (and sending of any 5717 * following up packets) to tx taskqueue. 
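 * The non-zero 'sched' return value below is what makes
 * hn_start() fall back to the TX taskqueue.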
5718 */ 5719 if_sendq_prepend(ifp, m_head); 5720 sched = 1; 5721 break; 5722 } 5723 5724 #if defined(INET6) || defined(INET) 5725 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5726 m_head = hn_tso_fixup(m_head); 5727 if (__predict_false(m_head == NULL)) { 5728 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5729 continue; 5730 } 5731 } else if (m_head->m_pkthdr.csum_flags & 5732 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5733 m_head = hn_set_hlen(m_head); 5734 if (__predict_false(m_head == NULL)) { 5735 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5736 continue; 5737 } 5738 } 5739 #endif 5740 5741 txd = hn_txdesc_get(txr); 5742 if (txd == NULL) { 5743 txr->hn_no_txdescs++; 5744 if_sendq_prepend(ifp, m_head); 5745 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); 5746 break; 5747 } 5748 5749 error = hn_encap(ifp, txr, txd, &m_head); 5750 if (error) { 5751 /* Both txd and m_head are freed */ 5752 KASSERT(txr->hn_agg_txd == NULL, 5753 ("encap failed w/ pending aggregating txdesc")); 5754 continue; 5755 } 5756 5757 if (txr->hn_agg_pktleft == 0) { 5758 if (txr->hn_agg_txd != NULL) { 5759 KASSERT(m_head == NULL, 5760 ("pending mbuf for aggregating txdesc")); 5761 error = hn_flush_txagg(ifp, txr); 5762 if (__predict_false(error)) { 5763 if_setdrvflagbits(ifp, 5764 IFF_DRV_OACTIVE, 0); 5765 break; 5766 } 5767 } else { 5768 KASSERT(m_head != NULL, ("mbuf was freed")); 5769 error = hn_txpkt(ifp, txr, txd); 5770 if (__predict_false(error)) { 5771 /* txd is freed, but m_head is not */ 5772 if_sendq_prepend(ifp, m_head); 5773 if_setdrvflagbits(ifp, 5774 IFF_DRV_OACTIVE, 0); 5775 break; 5776 } 5777 } 5778 } 5779 #ifdef INVARIANTS 5780 else { 5781 KASSERT(txr->hn_agg_txd != NULL, 5782 ("no aggregating txdesc")); 5783 KASSERT(m_head == NULL, 5784 ("pending mbuf for aggregating txdesc")); 5785 } 5786 #endif 5787 } 5788 5789 /* Flush pending aggerated transmission. */ 5790 if (txr->hn_agg_txd != NULL) 5791 hn_flush_txagg(ifp, txr); 5792 return (sched); 5793 } 5794 5795 static void 5796 hn_start(if_t ifp) 5797 { 5798 struct hn_softc *sc = if_getsoftc(ifp); 5799 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5800 5801 if (txr->hn_sched_tx) 5802 goto do_sched; 5803 5804 if (mtx_trylock(&txr->hn_tx_lock)) { 5805 int sched; 5806 5807 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5808 mtx_unlock(&txr->hn_tx_lock); 5809 if (!sched) 5810 return; 5811 } 5812 do_sched: 5813 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5814 } 5815 5816 static void 5817 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5818 { 5819 struct hn_tx_ring *txr = xtxr; 5820 5821 mtx_lock(&txr->hn_tx_lock); 5822 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); 5823 hn_start_locked(txr, 0); 5824 mtx_unlock(&txr->hn_tx_lock); 5825 } 5826 5827 static void 5828 hn_start_txeof(struct hn_tx_ring *txr) 5829 { 5830 struct hn_softc *sc = txr->hn_sc; 5831 if_t ifp = sc->hn_ifp; 5832 5833 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5834 5835 if (txr->hn_sched_tx) 5836 goto do_sched; 5837 5838 if (mtx_trylock(&txr->hn_tx_lock)) { 5839 int sched; 5840 5841 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5842 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5843 mtx_unlock(&txr->hn_tx_lock); 5844 if (sched) { 5845 taskqueue_enqueue(txr->hn_tx_taskq, 5846 &txr->hn_tx_task); 5847 } 5848 } else { 5849 do_sched: 5850 /* 5851 * Release the OACTIVE earlier, with the hope, that 5852 * others could catch up. The task will clear the 5853 * flag again with the hn_tx_lock to avoid possible 5854 * races. 
5855 */ 5856 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5857 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5858 } 5859 } 5860 5861 #endif /* HN_IFSTART_SUPPORT */ 5862 5863 static int 5864 hn_xmit(struct hn_tx_ring *txr, int len) 5865 { 5866 struct hn_softc *sc = txr->hn_sc; 5867 if_t ifp = sc->hn_ifp; 5868 struct mbuf *m_head; 5869 int sched = 0; 5870 5871 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5872 #ifdef HN_IFSTART_SUPPORT 5873 KASSERT(hn_use_if_start == 0, 5874 ("hn_xmit is called, when if_start is enabled")); 5875 #endif 5876 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5877 5878 if (__predict_false(txr->hn_suspended)) 5879 return (0); 5880 5881 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5882 return (0); 5883 5884 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5885 struct hn_txdesc *txd; 5886 int error; 5887 5888 if (len > 0 && m_head->m_pkthdr.len > len) { 5889 /* 5890 * This sending could be time consuming; let callers 5891 * dispatch this packet sending (and sending of any 5892 * following up packets) to tx taskqueue. 5893 */ 5894 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5895 sched = 1; 5896 break; 5897 } 5898 5899 txd = hn_txdesc_get(txr); 5900 if (txd == NULL) { 5901 txr->hn_no_txdescs++; 5902 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5903 txr->hn_oactive = 1; 5904 break; 5905 } 5906 5907 error = hn_encap(ifp, txr, txd, &m_head); 5908 if (error) { 5909 /* Both txd and m_head are freed; discard */ 5910 KASSERT(txr->hn_agg_txd == NULL, 5911 ("encap failed w/ pending aggregating txdesc")); 5912 drbr_advance(ifp, txr->hn_mbuf_br); 5913 continue; 5914 } 5915 5916 if (txr->hn_agg_pktleft == 0) { 5917 if (txr->hn_agg_txd != NULL) { 5918 KASSERT(m_head == NULL, 5919 ("pending mbuf for aggregating txdesc")); 5920 error = hn_flush_txagg(ifp, txr); 5921 if (__predict_false(error)) { 5922 txr->hn_oactive = 1; 5923 break; 5924 } 5925 } else { 5926 KASSERT(m_head != NULL, ("mbuf was freed")); 5927 error = hn_txpkt(ifp, txr, txd); 5928 if (__predict_false(error)) { 5929 /* txd is freed, but m_head is not */ 5930 drbr_putback(ifp, txr->hn_mbuf_br, 5931 m_head); 5932 txr->hn_oactive = 1; 5933 break; 5934 } 5935 } 5936 } 5937 #ifdef INVARIANTS 5938 else { 5939 KASSERT(txr->hn_agg_txd != NULL, 5940 ("no aggregating txdesc")); 5941 KASSERT(m_head == NULL, 5942 ("pending mbuf for aggregating txdesc")); 5943 } 5944 #endif 5945 5946 /* Sent */ 5947 drbr_advance(ifp, txr->hn_mbuf_br); 5948 } 5949 5950 /* Flush pending aggerated transmission. */ 5951 if (txr->hn_agg_txd != NULL) 5952 hn_flush_txagg(ifp, txr); 5953 return (sched); 5954 } 5955 5956 static int 5957 hn_transmit(if_t ifp, struct mbuf *m) 5958 { 5959 struct hn_softc *sc = if_getsoftc(ifp); 5960 struct hn_tx_ring *txr; 5961 int error, idx = 0; 5962 5963 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5964 struct rm_priotracker pt; 5965 5966 rm_rlock(&sc->hn_vf_lock, &pt); 5967 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5968 struct mbuf *m_bpf = NULL; 5969 int obytes, omcast; 5970 5971 obytes = m->m_pkthdr.len; 5972 omcast = (m->m_flags & M_MCAST) != 0; 5973 5974 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5975 if (bpf_peers_present(if_getbpf(ifp))) { 5976 m_bpf = m_copypacket(m, M_NOWAIT); 5977 if (m_bpf == NULL) { 5978 /* 5979 * Failed to grab a shallow 5980 * copy; tap now. 
5981 */ 5982 ETHER_BPF_MTAP(ifp, m); 5983 } 5984 } 5985 } else { 5986 ETHER_BPF_MTAP(ifp, m); 5987 } 5988 5989 error = if_transmit(sc->hn_vf_ifp, m); 5990 rm_runlock(&sc->hn_vf_lock, &pt); 5991 5992 if (m_bpf != NULL) { 5993 if (!error) 5994 ETHER_BPF_MTAP(ifp, m_bpf); 5995 m_freem(m_bpf); 5996 } 5997 5998 if (error == ENOBUFS) { 5999 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6000 } else if (error) { 6001 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6002 } else { 6003 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6004 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6005 if (omcast) { 6006 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6007 omcast); 6008 } 6009 } 6010 return (error); 6011 } 6012 rm_runlock(&sc->hn_vf_lock, &pt); 6013 } 6014 6015 #if defined(INET6) || defined(INET) 6016 /* 6017 * Perform TSO packet header fixup or get l2/l3 header length now, 6018 * since packet headers should be cache-hot. 6019 */ 6020 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6021 m = hn_tso_fixup(m); 6022 if (__predict_false(m == NULL)) { 6023 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6024 return EIO; 6025 } 6026 } else if (m->m_pkthdr.csum_flags & 6027 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6028 m = hn_set_hlen(m); 6029 if (__predict_false(m == NULL)) { 6030 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6031 return EIO; 6032 } 6033 } 6034 #endif 6035 6036 /* 6037 * Select the TX ring based on flowid 6038 */ 6039 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6040 #ifdef RSS 6041 uint32_t bid; 6042 6043 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6044 &bid) == 0) 6045 idx = bid % sc->hn_tx_ring_inuse; 6046 else 6047 #endif 6048 { 6049 #if defined(INET6) || defined(INET) 6050 int tcpsyn = 0; 6051 6052 if (m->m_pkthdr.len < 128 && 6053 (m->m_pkthdr.csum_flags & 6054 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6055 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6056 m = hn_check_tcpsyn(m, &tcpsyn); 6057 if (__predict_false(m == NULL)) { 6058 if_inc_counter(ifp, 6059 IFCOUNTER_OERRORS, 1); 6060 return (EIO); 6061 } 6062 } 6063 #else 6064 const int tcpsyn = 0; 6065 #endif 6066 if (tcpsyn) 6067 idx = 0; 6068 else 6069 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6070 } 6071 } 6072 txr = &sc->hn_tx_ring[idx]; 6073 6074 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6075 if (error) { 6076 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6077 return error; 6078 } 6079 6080 if (txr->hn_oactive) 6081 return 0; 6082 6083 if (txr->hn_sched_tx) 6084 goto do_sched; 6085 6086 if (mtx_trylock(&txr->hn_tx_lock)) { 6087 int sched; 6088 6089 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6090 mtx_unlock(&txr->hn_tx_lock); 6091 if (!sched) 6092 return 0; 6093 } 6094 do_sched: 6095 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6096 return 0; 6097 } 6098 6099 static void 6100 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6101 { 6102 struct mbuf *m; 6103 6104 mtx_lock(&txr->hn_tx_lock); 6105 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6106 m_freem(m); 6107 mtx_unlock(&txr->hn_tx_lock); 6108 } 6109 6110 static void 6111 hn_xmit_qflush(if_t ifp) 6112 { 6113 struct hn_softc *sc = if_getsoftc(ifp); 6114 struct rm_priotracker pt; 6115 int i; 6116 6117 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6118 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6119 if_qflush(ifp); 6120 6121 rm_rlock(&sc->hn_vf_lock, &pt); 6122 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6123 if_qflush(sc->hn_vf_ifp); 6124 rm_runlock(&sc->hn_vf_lock, &pt); 6125 } 6126 6127 static void 6128 hn_xmit_txeof(struct hn_tx_ring *txr) 6129 { 6130 6131 if 
(txr->hn_sched_tx) 6132 goto do_sched; 6133 6134 if (mtx_trylock(&txr->hn_tx_lock)) { 6135 int sched; 6136 6137 txr->hn_oactive = 0; 6138 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6139 mtx_unlock(&txr->hn_tx_lock); 6140 if (sched) { 6141 taskqueue_enqueue(txr->hn_tx_taskq, 6142 &txr->hn_tx_task); 6143 } 6144 } else { 6145 do_sched: 6146 /* 6147 * Release the oactive earlier, with the hope, that 6148 * others could catch up. The task will clear the 6149 * oactive again with the hn_tx_lock to avoid possible 6150 * races. 6151 */ 6152 txr->hn_oactive = 0; 6153 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6154 } 6155 } 6156 6157 static void 6158 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6159 { 6160 struct hn_tx_ring *txr = xtxr; 6161 6162 mtx_lock(&txr->hn_tx_lock); 6163 hn_xmit(txr, 0); 6164 mtx_unlock(&txr->hn_tx_lock); 6165 } 6166 6167 static void 6168 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6169 { 6170 struct hn_tx_ring *txr = xtxr; 6171 6172 mtx_lock(&txr->hn_tx_lock); 6173 txr->hn_oactive = 0; 6174 hn_xmit(txr, 0); 6175 mtx_unlock(&txr->hn_tx_lock); 6176 } 6177 6178 static int 6179 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6180 { 6181 struct vmbus_chan_br cbr; 6182 struct hn_rx_ring *rxr; 6183 struct hn_tx_ring *txr = NULL; 6184 int idx, error; 6185 6186 idx = vmbus_chan_subidx(chan); 6187 6188 /* 6189 * Link this channel to RX/TX ring. 6190 */ 6191 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6192 ("invalid channel index %d, should > 0 && < %d", 6193 idx, sc->hn_rx_ring_inuse)); 6194 rxr = &sc->hn_rx_ring[idx]; 6195 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6196 ("RX ring %d already attached", idx)); 6197 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6198 rxr->hn_chan = chan; 6199 6200 if (bootverbose) { 6201 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6202 idx, vmbus_chan_id(chan)); 6203 } 6204 6205 if (idx < sc->hn_tx_ring_inuse) { 6206 txr = &sc->hn_tx_ring[idx]; 6207 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6208 ("TX ring %d already attached", idx)); 6209 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6210 6211 txr->hn_chan = chan; 6212 if (bootverbose) { 6213 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6214 idx, vmbus_chan_id(chan)); 6215 } 6216 } 6217 6218 /* Bind this channel to a proper CPU. */ 6219 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6220 6221 /* 6222 * Open this channel 6223 */ 6224 cbr.cbr = rxr->hn_br; 6225 cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); 6226 cbr.cbr_txsz = HN_TXBR_SIZE; 6227 cbr.cbr_rxsz = HN_RXBR_SIZE; 6228 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6229 if (error) { 6230 if (error == EISCONN) { 6231 if_printf(sc->hn_ifp, "bufring is connected after " 6232 "chan%u open failure\n", vmbus_chan_id(chan)); 6233 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6234 } else { 6235 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6236 vmbus_chan_id(chan), error); 6237 } 6238 } 6239 return (error); 6240 } 6241 6242 static void 6243 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6244 { 6245 struct hn_rx_ring *rxr; 6246 int idx, error; 6247 6248 idx = vmbus_chan_subidx(chan); 6249 6250 /* 6251 * Link this channel to RX/TX ring. 
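 * (More precisely, unlink it: this is the counterpart of
 * hn_chan_attach(); the HN_RX_FLAG_ATTACHED/HN_TX_FLAG_ATTACHED
 * flags set there are cleared below before the channel itself is
 * closed.)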
6252 */ 6253 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6254 ("invalid channel index %d, should > 0 && < %d", 6255 idx, sc->hn_rx_ring_inuse)); 6256 rxr = &sc->hn_rx_ring[idx]; 6257 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6258 ("RX ring %d is not attached", idx)); 6259 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6260 6261 if (idx < sc->hn_tx_ring_inuse) { 6262 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6263 6264 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6265 ("TX ring %d is not attached attached", idx)); 6266 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6267 } 6268 6269 /* 6270 * Close this channel. 6271 * 6272 * NOTE: 6273 * Channel closing does _not_ destroy the target channel. 6274 */ 6275 error = vmbus_chan_close_direct(chan); 6276 if (error == EISCONN) { 6277 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6278 "after being closed\n", vmbus_chan_id(chan)); 6279 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6280 } else if (error) { 6281 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6282 vmbus_chan_id(chan), error); 6283 } 6284 } 6285 6286 static int 6287 hn_attach_subchans(struct hn_softc *sc) 6288 { 6289 struct vmbus_channel **subchans; 6290 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6291 int i, error = 0; 6292 6293 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6294 6295 /* Attach the sub-channels. */ 6296 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6297 for (i = 0; i < subchan_cnt; ++i) { 6298 int error1; 6299 6300 error1 = hn_chan_attach(sc, subchans[i]); 6301 if (error1) { 6302 error = error1; 6303 /* Move on; all channels will be detached later. */ 6304 } 6305 } 6306 vmbus_subchan_rel(subchans, subchan_cnt); 6307 6308 if (error) { 6309 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6310 } else { 6311 if (bootverbose) { 6312 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6313 subchan_cnt); 6314 } 6315 } 6316 return (error); 6317 } 6318 6319 static void 6320 hn_detach_allchans(struct hn_softc *sc) 6321 { 6322 struct vmbus_channel **subchans; 6323 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6324 int i; 6325 6326 if (subchan_cnt == 0) 6327 goto back; 6328 6329 /* Detach the sub-channels. */ 6330 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6331 for (i = 0; i < subchan_cnt; ++i) 6332 hn_chan_detach(sc, subchans[i]); 6333 vmbus_subchan_rel(subchans, subchan_cnt); 6334 6335 back: 6336 /* 6337 * Detach the primary channel, _after_ all sub-channels 6338 * are detached. 6339 */ 6340 hn_chan_detach(sc, sc->hn_prichan); 6341 6342 /* Wait for sub-channels to be destroyed, if any. */ 6343 vmbus_subchan_drain(sc->hn_prichan); 6344 6345 #ifdef INVARIANTS 6346 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6347 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6348 HN_RX_FLAG_ATTACHED) == 0, 6349 ("%dth RX ring is still attached", i)); 6350 } 6351 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6352 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6353 HN_TX_FLAG_ATTACHED) == 0, 6354 ("%dth TX ring is still attached", i)); 6355 } 6356 #endif 6357 } 6358 6359 static int 6360 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6361 { 6362 struct vmbus_channel **subchans; 6363 int nchan, rxr_cnt, error; 6364 6365 nchan = *nsubch + 1; 6366 if (nchan == 1) { 6367 /* 6368 * Multiple RX/TX rings are not requested. 6369 */ 6370 *nsubch = 0; 6371 return (0); 6372 } 6373 6374 /* 6375 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6376 * table entries. 
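 *
 * The channel count is then clamped to what the host offers:
 * nchan = min(*nsubch + 1, rxr_cnt), and *nsubch = nchan - 1 is
 * requested from NVS, which updates *nsubch in place and may hand
 * back fewer sub-channels than asked for.  Illustrative example
 * (numbers hypothetical): with 8 RX rings configured the driver
 * asks for nchan = 8; if the host only offers rxr_cnt = 4, nchan
 * is clamped to 4 and 3 sub-channels are allocated.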
6377 */ 6378 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6379 if (error) { 6380 /* No RSS; this is benign. */ 6381 *nsubch = 0; 6382 return (0); 6383 } 6384 if (bootverbose) { 6385 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6386 rxr_cnt, nchan); 6387 } 6388 6389 if (nchan > rxr_cnt) 6390 nchan = rxr_cnt; 6391 if (nchan == 1) { 6392 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6393 *nsubch = 0; 6394 return (0); 6395 } 6396 6397 /* 6398 * Allocate sub-channels from NVS. 6399 */ 6400 *nsubch = nchan - 1; 6401 error = hn_nvs_alloc_subchans(sc, nsubch); 6402 if (error || *nsubch == 0) { 6403 /* Failed to allocate sub-channels. */ 6404 *nsubch = 0; 6405 return (0); 6406 } 6407 6408 /* 6409 * Wait for all sub-channels to become ready before moving on. 6410 */ 6411 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6412 vmbus_subchan_rel(subchans, *nsubch); 6413 return (0); 6414 } 6415 6416 static bool 6417 hn_synth_attachable(const struct hn_softc *sc) 6418 { 6419 int i; 6420 6421 if (sc->hn_flags & HN_FLAG_ERRORS) 6422 return (false); 6423 6424 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6425 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6426 6427 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6428 return (false); 6429 } 6430 return (true); 6431 } 6432 6433 /* 6434 * Make sure that the RX filter is zero after the successful 6435 * RNDIS initialization. 6436 * 6437 * NOTE: 6438 * Under certain conditions on certain versions of Hyper-V, 6439 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6440 * after the successful RNDIS initialization, which breaks 6441 * the assumption of any following code (well, it breaks the 6442 * RNDIS API contract actually). Clear the RNDIS rxfilter 6443 * explicitly, drain packets sneaking through, and drain the 6444 * interrupt taskqueues scheduled due to the stealth packets. 6445 */ 6446 static void 6447 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6448 { 6449 6450 hn_disable_rx(sc); 6451 hn_drain_rxtx(sc, nchan); 6452 } 6453 6454 static int 6455 hn_synth_attach(struct hn_softc *sc, int mtu) 6456 { 6457 #define ATTACHED_NVS 0x0002 6458 #define ATTACHED_RNDIS 0x0004 6459 6460 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6461 int error, nsubch, nchan = 1, i, rndis_inited; 6462 uint32_t old_caps, attached = 0; 6463 6464 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6465 ("synthetic parts were attached")); 6466 6467 if (!hn_synth_attachable(sc)) 6468 return (ENXIO); 6469 6470 /* Save capabilities for later verification. */ 6471 old_caps = sc->hn_caps; 6472 sc->hn_caps = 0; 6473 6474 /* Clear RSS stuffs. */ 6475 sc->hn_rss_ind_size = 0; 6476 sc->hn_rss_hash = 0; 6477 sc->hn_rss_hcap = 0; 6478 6479 /* 6480 * Attach the primary channel _before_ attaching NVS and RNDIS. 6481 */ 6482 error = hn_chan_attach(sc, sc->hn_prichan); 6483 if (error) 6484 goto failed; 6485 6486 /* 6487 * Attach NVS. 6488 */ 6489 error = hn_nvs_attach(sc, mtu); 6490 if (error) 6491 goto failed; 6492 attached |= ATTACHED_NVS; 6493 6494 /* 6495 * Attach RNDIS _after_ NVS is attached. 6496 */ 6497 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6498 if (rndis_inited) 6499 attached |= ATTACHED_RNDIS; 6500 if (error) 6501 goto failed; 6502 6503 /* 6504 * Make sure capabilities are not changed. 
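 *
 * old_caps was saved before NVS/RNDIS attach cleared sc->hn_caps.
 * The comparison only applies once the device is fully attached
 * (device_is_attached()), i.e. when the synthetic parts are being
 * re-attached; a mismatch fails the attach with ENXIO.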
6505 */ 6506 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6507 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6508 old_caps, sc->hn_caps); 6509 error = ENXIO; 6510 goto failed; 6511 } 6512 6513 /* 6514 * Allocate sub-channels for multi-TX/RX rings. 6515 * 6516 * NOTE: 6517 * The # of RX rings that can be used is equivalent to the # of 6518 * channels to be requested. 6519 */ 6520 nsubch = sc->hn_rx_ring_cnt - 1; 6521 error = hn_synth_alloc_subchans(sc, &nsubch); 6522 if (error) 6523 goto failed; 6524 /* NOTE: _Full_ synthetic parts detach is required now. */ 6525 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6526 6527 /* 6528 * Set the # of TX/RX rings that could be used according to 6529 * the # of channels that NVS offered. 6530 */ 6531 nchan = nsubch + 1; 6532 hn_set_ring_inuse(sc, nchan); 6533 if (nchan == 1) { 6534 /* Only the primary channel can be used; done */ 6535 goto back; 6536 } 6537 6538 /* 6539 * Attach the sub-channels. 6540 * 6541 * NOTE: hn_set_ring_inuse() _must_ have been called. 6542 */ 6543 error = hn_attach_subchans(sc); 6544 if (error) 6545 goto failed; 6546 6547 /* 6548 * Configure RSS key and indirect table _after_ all sub-channels 6549 * are attached. 6550 */ 6551 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6552 /* 6553 * RSS key is not set yet; set it to the default RSS key. 6554 */ 6555 if (bootverbose) 6556 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6557 #ifdef RSS 6558 rss_getkey(rss->rss_key); 6559 #else 6560 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6561 #endif 6562 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6563 } 6564 6565 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6566 /* 6567 * RSS indirect table is not set yet; set it up in round- 6568 * robin fashion. 6569 */ 6570 if (bootverbose) { 6571 if_printf(sc->hn_ifp, "setup default RSS indirect " 6572 "table\n"); 6573 } 6574 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6575 uint32_t subidx; 6576 6577 #ifdef RSS 6578 subidx = rss_get_indirection_to_bucket(i); 6579 #else 6580 subidx = i; 6581 #endif 6582 rss->rss_ind[i] = subidx % nchan; 6583 } 6584 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6585 } else { 6586 /* 6587 * # of usable channels may be changed, so we have to 6588 * make sure that all entries in RSS indirect table 6589 * are valid. 6590 * 6591 * NOTE: hn_set_ring_inuse() _must_ have been called. 6592 */ 6593 hn_rss_ind_fixup(sc); 6594 } 6595 6596 sc->hn_rss_hash = sc->hn_rss_hcap; 6597 if ((sc->hn_flags & HN_FLAG_RXVF) || 6598 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6599 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6600 hn_vf_rss_fixup(sc, false); 6601 } 6602 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6603 if (error) 6604 goto failed; 6605 back: 6606 /* 6607 * Fixup transmission aggregation setup. 6608 */ 6609 hn_set_txagg(sc); 6610 hn_rndis_init_fixat(sc, nchan); 6611 return (0); 6612 6613 failed: 6614 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6615 hn_rndis_init_fixat(sc, nchan); 6616 hn_synth_detach(sc); 6617 } else { 6618 if (attached & ATTACHED_RNDIS) { 6619 hn_rndis_init_fixat(sc, nchan); 6620 hn_rndis_detach(sc); 6621 } 6622 if (attached & ATTACHED_NVS) 6623 hn_nvs_detach(sc); 6624 hn_chan_detach(sc, sc->hn_prichan); 6625 /* Restore old capabilities. */ 6626 sc->hn_caps = old_caps; 6627 } 6628 return (error); 6629 6630 #undef ATTACHED_RNDIS 6631 #undef ATTACHED_NVS 6632 } 6633 6634 /* 6635 * NOTE: 6636 * The interface must have been suspended though hn_suspend(), before 6637 * this function get called. 
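 *
 * Teardown order below: RNDIS first, then NVS, then all VMBus
 * channels; when the VMBus version is WIN10 or newer, the RXBUF
 * and chimney-buffer GPADLs are also disconnected from the
 * primary channel here.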
6638 */ 6639 static void 6640 hn_synth_detach(struct hn_softc *sc) 6641 { 6642 6643 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6644 ("synthetic parts were not attached")); 6645 6646 /* Detach the RNDIS first. */ 6647 hn_rndis_detach(sc); 6648 6649 /* Detach NVS. */ 6650 hn_nvs_detach(sc); 6651 6652 /* Detach all of the channels. */ 6653 hn_detach_allchans(sc); 6654 6655 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6656 /* 6657 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6658 */ 6659 int error; 6660 6661 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6662 sc->hn_rxbuf_gpadl); 6663 if (error) { 6664 if_printf(sc->hn_ifp, 6665 "rxbuf gpadl disconn failed: %d\n", error); 6666 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6667 } 6668 sc->hn_rxbuf_gpadl = 0; 6669 } 6670 6671 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6672 /* 6673 * Host is post-Win2016, disconnect chimney sending buffer from 6674 * primary channel here. 6675 */ 6676 int error; 6677 6678 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6679 sc->hn_chim_gpadl); 6680 if (error) { 6681 if_printf(sc->hn_ifp, 6682 "chim gpadl disconn failed: %d\n", error); 6683 sc->hn_flags |= HN_FLAG_CHIM_REF; 6684 } 6685 sc->hn_chim_gpadl = 0; 6686 } 6687 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6688 } 6689 6690 static void 6691 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6692 { 6693 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6694 ("invalid ring count %d", ring_cnt)); 6695 6696 if (sc->hn_tx_ring_cnt > ring_cnt) 6697 sc->hn_tx_ring_inuse = ring_cnt; 6698 else 6699 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6700 sc->hn_rx_ring_inuse = ring_cnt; 6701 6702 #ifdef RSS 6703 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6704 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6705 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6706 rss_getnumbuckets()); 6707 } 6708 #endif 6709 6710 if (bootverbose) { 6711 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6712 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6713 } 6714 } 6715 6716 static void 6717 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6718 { 6719 6720 /* 6721 * NOTE: 6722 * The TX bufring will not be drained by the hypervisor, 6723 * if the primary channel is revoked. 6724 */ 6725 while (!vmbus_chan_rx_empty(chan) || 6726 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6727 !vmbus_chan_tx_empty(chan))) 6728 pause("waitch", 1); 6729 vmbus_chan_intr_drain(chan); 6730 } 6731 6732 static void 6733 hn_disable_rx(struct hn_softc *sc) 6734 { 6735 6736 /* 6737 * Disable RX by clearing RX filter forcefully. 6738 */ 6739 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6740 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6741 6742 /* 6743 * Give RNDIS enough time to flush all pending data packets. 6744 */ 6745 pause("waitrx", (200 * hz) / 1000); 6746 } 6747 6748 /* 6749 * NOTE: 6750 * RX/TX _must_ have been suspended/disabled, before this function 6751 * is called. 6752 */ 6753 static void 6754 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6755 { 6756 struct vmbus_channel **subch = NULL; 6757 int nsubch; 6758 6759 /* 6760 * Drain RX/TX bufrings and interrupts. 
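 *
 * Sub-channels are drained before the primary channel.  For each
 * channel, hn_chan_drain() waits until the RX bufring is empty
 * (and the TX bufring too, unless the primary channel has been
 * revoked) and then drains the channel's interrupt task.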
6761 */ 6762 nsubch = nchan - 1; 6763 if (nsubch > 0) 6764 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6765 6766 if (subch != NULL) { 6767 int i; 6768 6769 for (i = 0; i < nsubch; ++i) 6770 hn_chan_drain(sc, subch[i]); 6771 } 6772 hn_chan_drain(sc, sc->hn_prichan); 6773 6774 if (subch != NULL) 6775 vmbus_subchan_rel(subch, nsubch); 6776 } 6777 6778 static void 6779 hn_suspend_data(struct hn_softc *sc) 6780 { 6781 struct hn_tx_ring *txr; 6782 int i; 6783 6784 HN_LOCK_ASSERT(sc); 6785 6786 /* 6787 * Suspend TX. 6788 */ 6789 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6790 txr = &sc->hn_tx_ring[i]; 6791 6792 mtx_lock(&txr->hn_tx_lock); 6793 txr->hn_suspended = 1; 6794 mtx_unlock(&txr->hn_tx_lock); 6795 /* No one is able send more packets now. */ 6796 6797 /* 6798 * Wait for all pending sends to finish. 6799 * 6800 * NOTE: 6801 * We will _not_ receive all pending send-done, if the 6802 * primary channel is revoked. 6803 */ 6804 while (hn_tx_ring_pending(txr) && 6805 !vmbus_chan_is_revoked(sc->hn_prichan)) 6806 pause("hnwtx", 1 /* 1 tick */); 6807 } 6808 6809 /* 6810 * Disable RX. 6811 */ 6812 hn_disable_rx(sc); 6813 6814 /* 6815 * Drain RX/TX. 6816 */ 6817 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6818 6819 /* 6820 * Drain any pending TX tasks. 6821 * 6822 * NOTE: 6823 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6824 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6825 */ 6826 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6827 txr = &sc->hn_tx_ring[i]; 6828 6829 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6830 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6831 } 6832 } 6833 6834 static void 6835 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6836 { 6837 6838 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6839 } 6840 6841 static void 6842 hn_suspend_mgmt(struct hn_softc *sc) 6843 { 6844 struct task task; 6845 6846 HN_LOCK_ASSERT(sc); 6847 6848 /* 6849 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6850 * through hn_mgmt_taskq. 6851 */ 6852 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6853 vmbus_chan_run_task(sc->hn_prichan, &task); 6854 6855 /* 6856 * Make sure that all pending management tasks are completed. 6857 */ 6858 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6859 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6860 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6861 } 6862 6863 static void 6864 hn_suspend(struct hn_softc *sc) 6865 { 6866 6867 /* Disable polling. */ 6868 hn_polling(sc, 0); 6869 6870 /* 6871 * If the non-transparent mode VF is activated, the synthetic 6872 * device is receiving packets, so the data path of the 6873 * synthetic device must be suspended. 6874 */ 6875 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 6876 (sc->hn_flags & HN_FLAG_RXVF)) 6877 hn_suspend_data(sc); 6878 hn_suspend_mgmt(sc); 6879 } 6880 6881 static void 6882 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6883 { 6884 int i; 6885 6886 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6887 ("invalid TX ring count %d", tx_ring_cnt)); 6888 6889 for (i = 0; i < tx_ring_cnt; ++i) { 6890 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6891 6892 mtx_lock(&txr->hn_tx_lock); 6893 txr->hn_suspended = 0; 6894 mtx_unlock(&txr->hn_tx_lock); 6895 } 6896 } 6897 6898 static void 6899 hn_resume_data(struct hn_softc *sc) 6900 { 6901 int i; 6902 6903 HN_LOCK_ASSERT(sc); 6904 6905 /* 6906 * Re-enable RX. 
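 *
 * Resume order (roughly the reverse of hn_suspend_data()):
 * restore the RX filter, clear the suspend flag on all TX rings,
 * flush the drbrs of rings that are no longer in use, and finally
 * kick the txeof tasks so any stale oactive state is cleared.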
6907 */ 6908 hn_rxfilter_config(sc); 6909 6910 /* 6911 * Make sure to clear suspend status on "all" TX rings, 6912 * since hn_tx_ring_inuse can be changed after 6913 * hn_suspend_data(). 6914 */ 6915 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6916 6917 #ifdef HN_IFSTART_SUPPORT 6918 if (!hn_use_if_start) 6919 #endif 6920 { 6921 /* 6922 * Flush unused drbrs, since hn_tx_ring_inuse may be 6923 * reduced. 6924 */ 6925 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6926 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6927 } 6928 6929 /* 6930 * Kick start TX. 6931 */ 6932 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6933 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6934 6935 /* 6936 * Use txeof task, so that any pending oactive can be 6937 * cleared properly. 6938 */ 6939 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6940 } 6941 } 6942 6943 static void 6944 hn_resume_mgmt(struct hn_softc *sc) 6945 { 6946 6947 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6948 6949 /* 6950 * Kick off network change detection, if it was pending. 6951 * If no network change was pending, start link status 6952 * checks, which is more lightweight than network change 6953 * detection. 6954 */ 6955 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6956 hn_change_network(sc); 6957 else 6958 hn_update_link_status(sc); 6959 } 6960 6961 static void 6962 hn_resume(struct hn_softc *sc) 6963 { 6964 6965 /* 6966 * If the non-transparent mode VF is activated, the synthetic 6967 * device have to receive packets, so the data path of the 6968 * synthetic device must be resumed. 6969 */ 6970 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 6971 (sc->hn_flags & HN_FLAG_RXVF)) 6972 hn_resume_data(sc); 6973 6974 /* 6975 * Don't resume link status change if VF is attached/activated. 6976 * - In the non-transparent VF mode, the synthetic device marks 6977 * link down until the VF is deactivated; i.e. VF is down. 6978 * - In transparent VF mode, VF's media status is used until 6979 * the VF is detached. 6980 */ 6981 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 6982 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 6983 hn_resume_mgmt(sc); 6984 6985 /* 6986 * Re-enable polling if this interface is running and 6987 * the polling is requested. 6988 */ 6989 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 6990 hn_polling(sc, sc->hn_pollhz); 6991 } 6992 6993 static void 6994 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 6995 { 6996 const struct rndis_status_msg *msg; 6997 int ofs; 6998 6999 if (dlen < sizeof(*msg)) { 7000 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7001 return; 7002 } 7003 msg = data; 7004 7005 switch (msg->rm_status) { 7006 case RNDIS_STATUS_MEDIA_CONNECT: 7007 case RNDIS_STATUS_MEDIA_DISCONNECT: 7008 hn_update_link_status(sc); 7009 break; 7010 7011 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7012 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7013 /* Not really useful; ignore. 
*/ 7014 break; 7015 7016 case RNDIS_STATUS_NETWORK_CHANGE: 7017 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7018 if (dlen < ofs + msg->rm_stbuflen || 7019 msg->rm_stbuflen < sizeof(uint32_t)) { 7020 if_printf(sc->hn_ifp, "network changed\n"); 7021 } else { 7022 uint32_t change; 7023 7024 memcpy(&change, ((const uint8_t *)msg) + ofs, 7025 sizeof(change)); 7026 if_printf(sc->hn_ifp, "network changed, change %u\n", 7027 change); 7028 } 7029 hn_change_network(sc); 7030 break; 7031 7032 default: 7033 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7034 msg->rm_status); 7035 break; 7036 } 7037 } 7038 7039 static int 7040 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7041 { 7042 const struct rndis_pktinfo *pi = info_data; 7043 uint32_t mask = 0; 7044 7045 while (info_dlen != 0) { 7046 const void *data; 7047 uint32_t dlen; 7048 7049 if (__predict_false(info_dlen < sizeof(*pi))) 7050 return (EINVAL); 7051 if (__predict_false(info_dlen < pi->rm_size)) 7052 return (EINVAL); 7053 info_dlen -= pi->rm_size; 7054 7055 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7056 return (EINVAL); 7057 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7058 return (EINVAL); 7059 dlen = pi->rm_size - pi->rm_pktinfooffset; 7060 data = pi->rm_data; 7061 7062 if (pi->rm_internal == 1) { 7063 switch (pi->rm_type) { 7064 case NDIS_PKTINFO_IT_PKTINFO_ID: 7065 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7066 return (EINVAL); 7067 info->pktinfo_id = 7068 (const struct packet_info_id *)data; 7069 mask |= HN_RXINFO_PKTINFO_ID; 7070 break; 7071 7072 default: 7073 goto next; 7074 } 7075 } else { 7076 switch (pi->rm_type) { 7077 case NDIS_PKTINFO_TYPE_VLAN: 7078 if (__predict_false(dlen 7079 < NDIS_VLAN_INFO_SIZE)) 7080 return (EINVAL); 7081 info->vlan_info = (const uint32_t *)data; 7082 mask |= HN_RXINFO_VLAN; 7083 break; 7084 7085 case NDIS_PKTINFO_TYPE_CSUM: 7086 if (__predict_false(dlen 7087 < NDIS_RXCSUM_INFO_SIZE)) 7088 return (EINVAL); 7089 info->csum_info = (const uint32_t *)data; 7090 mask |= HN_RXINFO_CSUM; 7091 break; 7092 7093 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7094 if (__predict_false(dlen 7095 < HN_NDIS_HASH_VALUE_SIZE)) 7096 return (EINVAL); 7097 info->hash_value = (const uint32_t *)data; 7098 mask |= HN_RXINFO_HASHVAL; 7099 break; 7100 7101 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7102 if (__predict_false(dlen 7103 < HN_NDIS_HASH_INFO_SIZE)) 7104 return (EINVAL); 7105 info->hash_info = (const uint32_t *)data; 7106 mask |= HN_RXINFO_HASHINF; 7107 break; 7108 7109 default: 7110 goto next; 7111 } 7112 } 7113 7114 if (mask == HN_RXINFO_ALL) { 7115 /* All found; done */ 7116 break; 7117 } 7118 next: 7119 pi = (const struct rndis_pktinfo *) 7120 ((const uint8_t *)pi + pi->rm_size); 7121 } 7122 7123 /* 7124 * Final fixup. 7125 * - If there is no hash value, invalidate the hash info. 
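 *   (i.e. hash_info is only kept when a hash value was also
 *   present; pointers for per-packet-info elements that were not
 *   found are left as the caller initialized them.)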
7126 */ 7127 if ((mask & HN_RXINFO_HASHVAL) == 0) 7128 info->hash_info = NULL; 7129 return (0); 7130 } 7131 7132 static __inline bool 7133 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7134 { 7135 7136 if (off < check_off) { 7137 if (__predict_true(off + len <= check_off)) 7138 return (false); 7139 } else if (off > check_off) { 7140 if (__predict_true(check_off + check_len <= off)) 7141 return (false); 7142 } 7143 return (true); 7144 } 7145 7146 static __inline void 7147 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7148 uint32_t len, struct hn_rxinfo *info) 7149 { 7150 uint32_t cnt = rxr->rsc.cnt; 7151 7152 if (cnt) { 7153 rxr->rsc.pktlen += len; 7154 } else { 7155 rxr->rsc.vlan_info = info->vlan_info; 7156 rxr->rsc.csum_info = info->csum_info; 7157 rxr->rsc.hash_info = info->hash_info; 7158 rxr->rsc.hash_value = info->hash_value; 7159 rxr->rsc.pktlen = len; 7160 } 7161 7162 rxr->rsc.frag_data[cnt] = data; 7163 rxr->rsc.frag_len[cnt] = len; 7164 rxr->rsc.cnt++; 7165 } 7166 7167 static void 7168 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7169 { 7170 const struct rndis_packet_msg *pkt; 7171 struct hn_rxinfo info; 7172 int data_off, pktinfo_off, data_len, pktinfo_len; 7173 bool rsc_more = false; 7174 7175 /* 7176 * Check length. 7177 */ 7178 if (__predict_false(dlen < sizeof(*pkt))) { 7179 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7180 return; 7181 } 7182 pkt = data; 7183 7184 if (__predict_false(dlen < pkt->rm_len)) { 7185 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7186 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7187 return; 7188 } 7189 if (__predict_false(pkt->rm_len < 7190 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7191 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7192 "msglen %u, data %u, oob %u, pktinfo %u\n", 7193 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7194 pkt->rm_pktinfolen); 7195 return; 7196 } 7197 if (__predict_false(pkt->rm_datalen == 0)) { 7198 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7199 return; 7200 } 7201 7202 /* 7203 * Check offsets. 7204 */ 7205 #define IS_OFFSET_INVALID(ofs) \ 7206 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7207 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7208 7209 /* XXX Hyper-V does not meet data offset alignment requirement */ 7210 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7211 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7212 "data offset %u\n", pkt->rm_dataoffset); 7213 return; 7214 } 7215 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7216 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7217 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7218 "oob offset %u\n", pkt->rm_oobdataoffset); 7219 return; 7220 } 7221 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7222 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7223 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7224 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7225 return; 7226 } 7227 7228 #undef IS_OFFSET_INVALID 7229 7230 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7231 data_len = pkt->rm_datalen; 7232 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7233 pktinfo_len = pkt->rm_pktinfolen; 7234 7235 /* 7236 * Check OOB coverage.
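 *
 * OOB data is not expected on the RX data path (hence the
 * "got oobdata" message below).  When it is present, its region
 * must fit within rm_len and must not overlap the data or
 * per-packet-info regions; otherwise the RNDIS message is
 * dropped.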
7237 */ 7238 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7239 int oob_off, oob_len; 7240 7241 if_printf(rxr->hn_ifp, "got oobdata\n"); 7242 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7243 oob_len = pkt->rm_oobdatalen; 7244 7245 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7246 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7247 "oob overflow, msglen %u, oob abs %d len %d\n", 7248 pkt->rm_len, oob_off, oob_len); 7249 return; 7250 } 7251 7252 /* 7253 * Check against data. 7254 */ 7255 if (hn_rndis_check_overlap(oob_off, oob_len, 7256 data_off, data_len)) { 7257 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7258 "oob overlaps data, oob abs %d len %d, " 7259 "data abs %d len %d\n", 7260 oob_off, oob_len, data_off, data_len); 7261 return; 7262 } 7263 7264 /* 7265 * Check against pktinfo. 7266 */ 7267 if (pktinfo_len != 0 && 7268 hn_rndis_check_overlap(oob_off, oob_len, 7269 pktinfo_off, pktinfo_len)) { 7270 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7271 "oob overlaps pktinfo, oob abs %d len %d, " 7272 "pktinfo abs %d len %d\n", 7273 oob_off, oob_len, pktinfo_off, pktinfo_len); 7274 return; 7275 } 7276 } 7277 7278 /* 7279 * Check per-packet-info coverage and find useful per-packet-info. 7280 */ 7281 info.vlan_info = NULL; 7282 info.csum_info = NULL; 7283 info.hash_info = NULL; 7284 info.pktinfo_id = NULL; 7285 7286 if (__predict_true(pktinfo_len != 0)) { 7287 bool overlap; 7288 int error; 7289 7290 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7291 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7292 "pktinfo overflow, msglen %u, " 7293 "pktinfo abs %d len %d\n", 7294 pkt->rm_len, pktinfo_off, pktinfo_len); 7295 return; 7296 } 7297 7298 /* 7299 * Check packet info coverage. 7300 */ 7301 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7302 data_off, data_len); 7303 if (__predict_false(overlap)) { 7304 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7305 "pktinfo overlap data, pktinfo abs %d len %d, " 7306 "data abs %d len %d\n", 7307 pktinfo_off, pktinfo_len, data_off, data_len); 7308 return; 7309 } 7310 7311 /* 7312 * Find useful per-packet-info. 
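 *
 * hn_rndis_rxinfo() walks the per-packet-info area; if any
 * element is malformed, the whole RNDIS message is dropped here.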
7313 */ 7314 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7315 pktinfo_len, &info); 7316 if (__predict_false(error)) { 7317 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7318 "pktinfo\n"); 7319 return; 7320 } 7321 } 7322 7323 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7324 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7325 "data overflow, msglen %u, data abs %d len %d\n", 7326 pkt->rm_len, data_off, data_len); 7327 return; 7328 } 7329 7330 /* Identify RSC fragments, drop invalid packets */ 7331 if ((info.pktinfo_id != NULL) && 7332 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7333 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7334 rxr->rsc.cnt = 0; 7335 rxr->hn_rsc_pkts++; 7336 } else if (rxr->rsc.cnt == 0) 7337 goto drop; 7338 7339 rsc_more = true; 7340 7341 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7342 rsc_more = false; 7343 7344 if (rsc_more && rxr->rsc.is_last) 7345 goto drop; 7346 } else { 7347 rxr->rsc.cnt = 0; 7348 } 7349 7350 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7351 goto drop; 7352 7353 /* Store data in per rx ring structure */ 7354 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7355 data_len, &info); 7356 7357 if (rsc_more) 7358 return; 7359 7360 hn_rxpkt(rxr); 7361 rxr->rsc.cnt = 0; 7362 return; 7363 drop: 7364 rxr->hn_rsc_drop++; 7365 return; 7366 } 7367 7368 static __inline void 7369 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7370 { 7371 const struct rndis_msghdr *hdr; 7372 7373 if (__predict_false(dlen < sizeof(*hdr))) { 7374 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7375 return; 7376 } 7377 hdr = data; 7378 7379 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7380 /* Hot data path. */ 7381 hn_rndis_rx_data(rxr, data, dlen); 7382 /* Done! */ 7383 return; 7384 } 7385 7386 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7387 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); 7388 else 7389 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); 7390 } 7391 7392 static void 7393 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7394 { 7395 const struct hn_nvs_hdr *hdr; 7396 7397 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7398 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7399 return; 7400 } 7401 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7402 7403 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7404 /* Useless; ignore */ 7405 return; 7406 } 7407 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7408 } 7409 7410 static void 7411 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7412 const struct vmbus_chanpkt_hdr *pkt) 7413 { 7414 struct hn_nvs_sendctx *sndc; 7415 7416 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7417 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7418 VMBUS_CHANPKT_DATALEN(pkt)); 7419 /* 7420 * NOTE: 7421 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7422 * its callback. 
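 *
 * The completion's transaction id carries the hn_nvs_sendctx
 * pointer that was supplied when the request was sent, which is
 * why it can simply be cast back and its hn_cb() invoked above.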
7423 */ 7424 } 7425 7426 static void 7427 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7428 const struct vmbus_chanpkt_hdr *pkthdr) 7429 { 7430 struct epoch_tracker et; 7431 const struct vmbus_chanpkt_rxbuf *pkt; 7432 const struct hn_nvs_hdr *nvs_hdr; 7433 int count, i, hlen; 7434 7435 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7436 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7437 return; 7438 } 7439 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7440 7441 /* Make sure that this is a RNDIS message. */ 7442 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7443 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7444 nvs_hdr->nvs_type); 7445 return; 7446 } 7447 7448 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7449 if (__predict_false(hlen < sizeof(*pkt))) { 7450 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7451 return; 7452 } 7453 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7454 7455 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7456 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7457 pkt->cp_rxbuf_id); 7458 return; 7459 } 7460 7461 count = pkt->cp_rxbuf_cnt; 7462 if (__predict_false(hlen < 7463 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7464 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7465 return; 7466 } 7467 7468 NET_EPOCH_ENTER(et); 7469 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7470 for (i = 0; i < count; ++i) { 7471 int ofs, len; 7472 7473 ofs = pkt->cp_rxbuf[i].rb_ofs; 7474 len = pkt->cp_rxbuf[i].rb_len; 7475 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7476 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7477 "ofs %d, len %d\n", i, ofs, len); 7478 continue; 7479 } 7480 7481 rxr->rsc.is_last = (i == (count - 1)); 7482 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7483 } 7484 NET_EPOCH_EXIT(et); 7485 7486 /* 7487 * Ack the consumed RXBUF associated w/ this channel packet, 7488 * so that this RXBUF can be recycled by the hypervisor. 7489 */ 7490 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7491 } 7492 7493 static void 7494 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7495 uint64_t tid) 7496 { 7497 struct hn_nvs_rndis_ack ack; 7498 int retries, error; 7499 7500 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7501 ack.nvs_status = HN_NVS_STATUS_OK; 7502 7503 retries = 0; 7504 again: 7505 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7506 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7507 if (__predict_false(error == EAGAIN)) { 7508 /* 7509 * NOTE: 7510 * This should _not_ happen in real world, since the 7511 * consumption of the TX bufring from the TX path is 7512 * controlled. 7513 */ 7514 if (rxr->hn_ack_failed == 0) 7515 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7516 rxr->hn_ack_failed++; 7517 retries++; 7518 if (retries < 10) { 7519 DELAY(100); 7520 goto again; 7521 } 7522 /* RXBUF leaks! */ 7523 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7524 } 7525 } 7526 7527 static void 7528 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7529 { 7530 struct hn_rx_ring *rxr = xrxr; 7531 struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); 7532 7533 for (;;) { 7534 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7535 int error, pktlen; 7536 7537 pktlen = rxr->hn_pktbuf_len; 7538 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7539 if (__predict_false(error == ENOBUFS)) { 7540 void *nbuf; 7541 int nlen; 7542 7543 /* 7544 * Expand channel packet buffer. 
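 * The buffer is doubled until it can hold the reported packet
 * length, the old buffer is freed, and the receive is retried.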
7545 * 7546 * XXX 7547 * Use M_WAITOK here, since allocation failure 7548 * is fatal. 7549 */ 7550 nlen = rxr->hn_pktbuf_len * 2; 7551 while (nlen < pktlen) 7552 nlen *= 2; 7553 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7554 7555 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7556 rxr->hn_pktbuf_len, nlen); 7557 7558 free(rxr->hn_pktbuf, M_DEVBUF); 7559 rxr->hn_pktbuf = nbuf; 7560 rxr->hn_pktbuf_len = nlen; 7561 /* Retry! */ 7562 continue; 7563 } else if (__predict_false(error == EAGAIN)) { 7564 /* No more channel packets; done! */ 7565 break; 7566 } 7567 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7568 7569 switch (pkt->cph_type) { 7570 case VMBUS_CHANPKT_TYPE_COMP: 7571 hn_nvs_handle_comp(sc, chan, pkt); 7572 break; 7573 7574 case VMBUS_CHANPKT_TYPE_RXBUF: 7575 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7576 break; 7577 7578 case VMBUS_CHANPKT_TYPE_INBAND: 7579 hn_nvs_handle_notify(sc, pkt); 7580 break; 7581 7582 default: 7583 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7584 pkt->cph_type); 7585 break; 7586 } 7587 } 7588 hn_chan_rollup(rxr, rxr->hn_txr); 7589 } 7590 7591 static void 7592 hn_sysinit(void *arg __unused) 7593 { 7594 int i; 7595 7596 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7597 7598 #ifdef HN_IFSTART_SUPPORT 7599 /* 7600 * Don't use ifnet.if_start if transparent VF mode is requested; 7601 * mainly due to the IFF_DRV_OACTIVE flag. 7602 */ 7603 if (hn_xpnt_vf && hn_use_if_start) { 7604 hn_use_if_start = 0; 7605 printf("hn: transparent VF mode, if_transmit will be used, " 7606 "instead of if_start\n"); 7607 } 7608 #endif 7609 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7610 printf("hn: invalid transparent VF attach routing " 7611 "wait timeout %d, reset to %d\n", 7612 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7613 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7614 } 7615 7616 /* 7617 * Initialize VF map. 7618 */ 7619 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7620 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7621 hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF, 7622 M_WAITOK | M_ZERO); 7623 7624 /* 7625 * Fix the # of TX taskqueues. 7626 */ 7627 if (hn_tx_taskq_cnt <= 0) 7628 hn_tx_taskq_cnt = 1; 7629 else if (hn_tx_taskq_cnt > mp_ncpus) 7630 hn_tx_taskq_cnt = mp_ncpus; 7631 7632 /* 7633 * Fix the TX taskqueue mode.
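 *
 * Unknown modes fall back to HN_TX_TASKQ_M_INDEP.  The shared
 * hn_tx_taskque[] array below is only created for
 * HN_TX_TASKQ_M_GLOBAL, and only when actually running on
 * Hyper-V (vm_guest == VM_GUEST_HV).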
7634 */ 7635 switch (hn_tx_taskq_mode) { 7636 case HN_TX_TASKQ_M_INDEP: 7637 case HN_TX_TASKQ_M_GLOBAL: 7638 case HN_TX_TASKQ_M_EVTTQ: 7639 break; 7640 default: 7641 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7642 break; 7643 } 7644 7645 if (vm_guest != VM_GUEST_HV) 7646 return; 7647 7648 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7649 return; 7650 7651 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7652 M_DEVBUF, M_WAITOK); 7653 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7654 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7655 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7656 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7657 "hn tx%d", i); 7658 } 7659 } 7660 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7661 7662 static void 7663 hn_sysuninit(void *arg __unused) 7664 { 7665 7666 if (hn_tx_taskque != NULL) { 7667 int i; 7668 7669 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7670 taskqueue_free(hn_tx_taskque[i]); 7671 free(hn_tx_taskque, M_DEVBUF); 7672 } 7673 7674 if (hn_vfmap != NULL) 7675 free(hn_vfmap, M_DEVBUF); 7676 rm_destroy(&hn_vfmap_lock); 7677 7678 counter_u64_free(hn_udpcs_fixup); 7679 } 7680 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7681