1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 #include "opt_hn.h" 57 #include "opt_inet6.h" 58 #include "opt_inet.h" 59 #include "opt_rss.h" 60 61 #include <sys/param.h> 62 #include <sys/systm.h> 63 #include <sys/bus.h> 64 #include <sys/counter.h> 65 #include <sys/kernel.h> 66 #include <sys/limits.h> 67 #include <sys/malloc.h> 68 #include <sys/mbuf.h> 69 #include <sys/module.h> 70 #include <sys/queue.h> 71 #include <sys/lock.h> 72 #include <sys/proc.h> 73 #include <sys/rmlock.h> 74 #include <sys/sbuf.h> 75 #include <sys/sched.h> 76 #include <sys/smp.h> 77 #include <sys/socket.h> 78 #include <sys/sockio.h> 79 #include <sys/sx.h> 80 #include <sys/sysctl.h> 81 #include <sys/taskqueue.h> 82 #include <sys/buf_ring.h> 83 #include <sys/eventhandler.h> 84 #include <sys/epoch.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_extern.h> 88 #include <vm/pmap.h> 89 90 #include <machine/atomic.h> 91 #include <machine/in_cksum.h> 92 93 #include <net/bpf.h> 94 #include <net/ethernet.h> 95 #include <net/if.h> 96 #include <net/if_dl.h> 97 #include <net/if_media.h> 98 #include <net/if_types.h> 99 #include <net/if_var.h> 100 #include <net/rndis.h> 101 #ifdef RSS 102 #include <net/rss_config.h> 103 #endif 104 105 #include <netinet/in_systm.h> 106 #include <netinet/in.h> 107 #include <netinet/ip.h> 108 #include <netinet/ip6.h> 109 #include <netinet/tcp.h> 110 #include <netinet/tcp_lro.h> 111 #include <netinet/udp.h> 112 113 #include <dev/hyperv/include/hyperv.h> 114 #include <dev/hyperv/include/hyperv_busdma.h> 115 #include <dev/hyperv/include/vmbus.h> 116 #include <dev/hyperv/include/vmbus_xact.h> 117 118 #include <dev/hyperv/netvsc/ndis.h> 119 #include <dev/hyperv/netvsc/if_hnreg.h> 120 #include <dev/hyperv/netvsc/if_hnvar.h> 121 #include <dev/hyperv/netvsc/hn_nvs.h> 122 #include <dev/hyperv/netvsc/hn_rndis.h> 123 124 #include "vmbus_if.h" 125 126 #define HN_IFSTART_SUPPORT 127 128 #define HN_RING_CNT_DEF_MAX 8 129 130 #define HN_VFMAP_SIZE_DEF 8 131 132 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 133 134 /* YYY should get it from the underlying channel */ 135 #define HN_TX_DESC_CNT 512 136 137 #define HN_RNDIS_PKT_LEN \ 138 (sizeof(struct rndis_packet_msg) + \ 139 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 140 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 141 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 142 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 143 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 144 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 145 146 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 147 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 148 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 149 /* -1 for RNDIS packet message */ 150 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 151 152 #define HN_DIRECT_TX_SIZE_DEF 128 153 154 #define HN_EARLY_TXEOF_THRESH 8 155 156 #define HN_PKTBUF_LEN_DEF (16 * 1024) 157 158 #define HN_LROENT_CNT_DEF 128 159 160 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 161 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 162 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 163 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) 164 165 #define HN_LRO_ACKCNT_DEF 1 166 167 #define HN_LOCK_INIT(sc) \ 168 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 169 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 170 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 171 #define HN_LOCK(sc) \ 172 do { \ 173 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 174 /* Relinquish cpu to avoid deadlock */ \ 175 sched_relinquish(curthread); \ 176 DELAY(1000); \ 177 } \ 178 } while (0) 179 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 180 181 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 182 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 183 #define HN_CSUM_IP_HWASSIST(sc) \ 184 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 185 #define HN_CSUM_IP6_HWASSIST(sc) \ 186 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 187 188 #define HN_PKTSIZE_MIN(align) \ 189 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 190 HN_RNDIS_PKT_LEN, (align)) 191 #define HN_PKTSIZE(m, align) \ 192 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 193 194 #ifdef RSS 195 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 196 #else 197 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 198 #endif 199 200 struct hn_txdesc { 201 #ifndef HN_USE_TXDESC_BUFRING 202 SLIST_ENTRY(hn_txdesc) link; 203 #endif 204 STAILQ_ENTRY(hn_txdesc) agg_link; 205 206 /* Aggregated txdescs, in sending order. */ 207 STAILQ_HEAD(, hn_txdesc) agg_list; 208 209 /* The oldest packet, if transmission aggregation happens. */ 210 struct mbuf *m; 211 struct hn_tx_ring *txr; 212 int refs; 213 uint32_t flags; /* HN_TXD_FLAG_ */ 214 struct hn_nvs_sendctx send_ctx; 215 uint32_t chim_index; 216 int chim_size; 217 218 bus_dmamap_t data_dmap; 219 220 bus_addr_t rndis_pkt_paddr; 221 struct rndis_packet_msg *rndis_pkt; 222 bus_dmamap_t rndis_pkt_dmap; 223 }; 224 225 #define HN_TXD_FLAG_ONLIST 0x0001 226 #define HN_TXD_FLAG_DMAMAP 0x0002 227 #define HN_TXD_FLAG_ONAGG 0x0004 228 229 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 230 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 231 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 232 233 struct packet_info_id { 234 uint8_t ver; 235 uint8_t flag; 236 uint16_t pkt_id; 237 }; 238 239 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 240 241 242 struct hn_rxinfo { 243 const uint32_t *vlan_info; 244 const uint32_t *csum_info; 245 const uint32_t *hash_info; 246 const uint32_t *hash_value; 247 const struct packet_info_id *pktinfo_id; 248 }; 249 250 struct hn_rxvf_setarg { 251 struct hn_rx_ring *rxr; 252 if_t vf_ifp; 253 }; 254 255 #define HN_RXINFO_VLAN 0x0001 256 #define HN_RXINFO_CSUM 0x0002 257 #define HN_RXINFO_HASHINF 0x0004 258 #define HN_RXINFO_HASHVAL 0x0008 259 #define HN_RXINFO_PKTINFO_ID 0x0010 260 #define HN_RXINFO_ALL \ 261 (HN_RXINFO_VLAN | \ 262 HN_RXINFO_CSUM | \ 263 HN_RXINFO_HASHINF | \ 264 HN_RXINFO_HASHVAL | \ 265 HN_RXINFO_PKTINFO_ID) 266 267 static int hn_probe(device_t); 268 static int hn_attach(device_t); 269 static int hn_detach(device_t); 270 static int hn_shutdown(device_t); 271 static void hn_chan_callback(struct vmbus_channel *, 272 void *); 273 274 static void hn_init(void *); 275 static int hn_ioctl(if_t, u_long, caddr_t); 276 #ifdef HN_IFSTART_SUPPORT 277 static void hn_start(if_t); 278 #endif 279 static int hn_transmit(if_t, struct mbuf *); 280 static void hn_xmit_qflush(if_t); 281 static int hn_ifmedia_upd(if_t); 282 static void hn_ifmedia_sts(if_t, 283 struct 
ifmediareq *); 284 285 static void hn_ifnet_event(void *, if_t, int); 286 static void hn_ifaddr_event(void *, if_t); 287 static void hn_ifnet_attevent(void *, if_t); 288 static void hn_ifnet_detevent(void *, if_t); 289 static void hn_ifnet_lnkevent(void *, if_t, int); 290 291 static bool hn_ismyvf(const struct hn_softc *, 292 const if_t); 293 static void hn_rxvf_change(struct hn_softc *, 294 if_t, bool); 295 static void hn_rxvf_set(struct hn_softc *, if_t); 296 static void hn_rxvf_set_task(void *, int); 297 static void hn_xpnt_vf_input(if_t, struct mbuf *); 298 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 299 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 300 struct ifreq *); 301 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 302 static bool hn_xpnt_vf_isready(struct hn_softc *); 303 static void hn_xpnt_vf_setready(struct hn_softc *); 304 static void hn_xpnt_vf_init_taskfunc(void *, int); 305 static void hn_xpnt_vf_init(struct hn_softc *); 306 static void hn_xpnt_vf_setenable(struct hn_softc *); 307 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 308 static void hn_vf_rss_fixup(struct hn_softc *, bool); 309 static void hn_vf_rss_restore(struct hn_softc *); 310 311 static int hn_rndis_rxinfo(const void *, int, 312 struct hn_rxinfo *); 313 static void hn_rndis_rx_data(struct hn_rx_ring *, 314 const void *, int); 315 static void hn_rndis_rx_status(struct hn_softc *, 316 const void *, int); 317 static void hn_rndis_init_fixat(struct hn_softc *, int); 318 319 static void hn_nvs_handle_notify(struct hn_softc *, 320 const struct vmbus_chanpkt_hdr *); 321 static void hn_nvs_handle_comp(struct hn_softc *, 322 struct vmbus_channel *, 323 const struct vmbus_chanpkt_hdr *); 324 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 325 struct vmbus_channel *, 326 const struct vmbus_chanpkt_hdr *); 327 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 328 struct vmbus_channel *, uint64_t); 329 330 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 331 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 332 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 342 #ifndef RSS 343 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 344 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 345 #endif 346 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 347 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 348 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 359 static int 
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 360 361 static void hn_stop(struct hn_softc *, bool); 362 static void hn_init_locked(struct hn_softc *); 363 static int hn_chan_attach(struct hn_softc *, 364 struct vmbus_channel *); 365 static void hn_chan_detach(struct hn_softc *, 366 struct vmbus_channel *); 367 static int hn_attach_subchans(struct hn_softc *); 368 static void hn_detach_allchans(struct hn_softc *); 369 static void hn_chan_rollup(struct hn_rx_ring *, 370 struct hn_tx_ring *); 371 static void hn_set_ring_inuse(struct hn_softc *, int); 372 static int hn_synth_attach(struct hn_softc *, int); 373 static void hn_synth_detach(struct hn_softc *); 374 static int hn_synth_alloc_subchans(struct hn_softc *, 375 int *); 376 static bool hn_synth_attachable(const struct hn_softc *); 377 static void hn_suspend(struct hn_softc *); 378 static void hn_suspend_data(struct hn_softc *); 379 static void hn_suspend_mgmt(struct hn_softc *); 380 static void hn_resume(struct hn_softc *); 381 static void hn_resume_data(struct hn_softc *); 382 static void hn_resume_mgmt(struct hn_softc *); 383 static void hn_suspend_mgmt_taskfunc(void *, int); 384 static void hn_chan_drain(struct hn_softc *, 385 struct vmbus_channel *); 386 static void hn_disable_rx(struct hn_softc *); 387 static void hn_drain_rxtx(struct hn_softc *, int); 388 static void hn_polling(struct hn_softc *, u_int); 389 static void hn_chan_polling(struct vmbus_channel *, u_int); 390 static void hn_mtu_change_fixup(struct hn_softc *); 391 392 static void hn_update_link_status(struct hn_softc *); 393 static void hn_change_network(struct hn_softc *); 394 static void hn_link_taskfunc(void *, int); 395 static void hn_netchg_init_taskfunc(void *, int); 396 static void hn_netchg_status_taskfunc(void *, int); 397 static void hn_link_status(struct hn_softc *); 398 399 static int hn_create_rx_data(struct hn_softc *, int); 400 static void hn_destroy_rx_data(struct hn_softc *); 401 static int hn_check_iplen(const struct mbuf *, int); 402 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 403 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 404 static int hn_rxfilter_config(struct hn_softc *); 405 static int hn_rss_reconfig(struct hn_softc *); 406 static void hn_rss_ind_fixup(struct hn_softc *); 407 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 408 static int hn_rxpkt(struct hn_rx_ring *); 409 static uint32_t hn_rss_type_fromndis(uint32_t); 410 static uint32_t hn_rss_type_tondis(uint32_t); 411 412 static int hn_tx_ring_create(struct hn_softc *, int); 413 static void hn_tx_ring_destroy(struct hn_tx_ring *); 414 static int hn_create_tx_data(struct hn_softc *, int); 415 static void hn_fixup_tx_data(struct hn_softc *); 416 static void hn_fixup_rx_data(struct hn_softc *); 417 static void hn_destroy_tx_data(struct hn_softc *); 418 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 419 static void hn_txdesc_gc(struct hn_tx_ring *, 420 struct hn_txdesc *); 421 static int hn_encap(if_t, struct hn_tx_ring *, 422 struct hn_txdesc *, struct mbuf **); 423 static int hn_txpkt(if_t, struct hn_tx_ring *, 424 struct hn_txdesc *); 425 static void hn_set_chim_size(struct hn_softc *, int); 426 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 427 static bool hn_tx_ring_pending(struct hn_tx_ring *); 428 static void hn_tx_ring_qflush(struct hn_tx_ring *); 429 static void hn_resume_tx(struct hn_softc *, int); 430 static void hn_set_txagg(struct hn_softc *); 431 static void *hn_try_txagg(if_t, 432 struct hn_tx_ring *, struct 
hn_txdesc *, 433 int); 434 static int hn_get_txswq_depth(const struct hn_tx_ring *); 435 static void hn_txpkt_done(struct hn_nvs_sendctx *, 436 struct hn_softc *, struct vmbus_channel *, 437 const void *, int); 438 static int hn_txpkt_sglist(struct hn_tx_ring *, 439 struct hn_txdesc *); 440 static int hn_txpkt_chim(struct hn_tx_ring *, 441 struct hn_txdesc *); 442 static int hn_xmit(struct hn_tx_ring *, int); 443 static void hn_xmit_taskfunc(void *, int); 444 static void hn_xmit_txeof(struct hn_tx_ring *); 445 static void hn_xmit_txeof_taskfunc(void *, int); 446 #ifdef HN_IFSTART_SUPPORT 447 static int hn_start_locked(struct hn_tx_ring *, int); 448 static void hn_start_taskfunc(void *, int); 449 static void hn_start_txeof(struct hn_tx_ring *); 450 static void hn_start_txeof_taskfunc(void *, int); 451 #endif 452 453 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 454 455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 456 "Hyper-V network interface"); 457 458 /* Trust tcp segment verification on host side. */ 459 static int hn_trust_hosttcp = 1; 460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 461 &hn_trust_hosttcp, 0, 462 "Trust tcp segment verification on host side, " 463 "when csum info is missing (global setting)"); 464 465 /* Trust udp datagrams verification on host side. */ 466 static int hn_trust_hostudp = 1; 467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 468 &hn_trust_hostudp, 0, 469 "Trust udp datagram verification on host side, " 470 "when csum info is missing (global setting)"); 471 472 /* Trust ip packets verification on host side. */ 473 static int hn_trust_hostip = 1; 474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 475 &hn_trust_hostip, 0, 476 "Trust ip packet verification on host side, " 477 "when csum info is missing (global setting)"); 478 479 /* 480 * Offload UDP/IPv4 checksum. 481 */ 482 static int hn_enable_udp4cs = 1; 483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 484 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 485 486 /* 487 * Offload UDP/IPv6 checksum. 488 */ 489 static int hn_enable_udp6cs = 1; 490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 491 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 492 493 /* Stats. */ 494 static counter_u64_t hn_udpcs_fixup; 495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 496 &hn_udpcs_fixup, "# of UDP checksum fixup"); 497 498 /* 499 * See hn_set_hlen(). 500 * 501 * This value is for Azure. For Hyper-V, set this above 502 * 65536 to disable UDP datagram checksum fixup. 
503 */ 504 static int hn_udpcs_fixup_mtu = 1420; 505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 506 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 507 508 /* Limit TSO burst size */ 509 static int hn_tso_maxlen = IP_MAXPACKET; 510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 511 &hn_tso_maxlen, 0, "TSO burst limit"); 512 513 /* Limit chimney send size */ 514 static int hn_tx_chimney_size = 0; 515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 516 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 517 518 /* Limit the size of packet for direct transmission */ 519 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 521 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 522 523 /* # of LRO entries per RX ring */ 524 #if defined(INET) || defined(INET6) 525 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 527 &hn_lro_entry_count, 0, "LRO entry count"); 528 #endif 529 530 static int hn_tx_taskq_cnt = 1; 531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 532 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 533 534 #define HN_TX_TASKQ_M_INDEP 0 535 #define HN_TX_TASKQ_M_GLOBAL 1 536 #define HN_TX_TASKQ_M_EVTTQ 2 537 538 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 540 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 541 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 542 543 #ifndef HN_USE_TXDESC_BUFRING 544 static int hn_use_txdesc_bufring = 0; 545 #else 546 static int hn_use_txdesc_bufring = 1; 547 #endif 548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 549 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 550 551 #ifdef HN_IFSTART_SUPPORT 552 /* Use ifnet.if_start instead of ifnet.if_transmit */ 553 static int hn_use_if_start = 0; 554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 555 &hn_use_if_start, 0, "Use if_start TX method"); 556 #endif 557 558 /* # of channels to use */ 559 static int hn_chan_cnt = 0; 560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 561 &hn_chan_cnt, 0, 562 "# of channels to use; each channel has one RX ring and one TX ring"); 563 564 /* # of transmit rings to use */ 565 static int hn_tx_ring_cnt = 0; 566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 567 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 568 569 /* Software TX ring deptch */ 570 static int hn_tx_swq_depth = 0; 571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 572 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 573 574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 575 static u_int hn_lro_mbufq_depth = 0; 576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 577 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 578 579 /* Packet transmission aggregation size limit */ 580 static int hn_tx_agg_size = -1; 581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 582 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 583 584 /* Packet transmission aggregation count limit */ 585 static int hn_tx_agg_pkts = -1; 586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 587 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 588 589 /* VF list */ 590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, 591 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 592 hn_vflist_sysctl, 
"A", 593 "VF list"); 594 595 /* VF mapping */ 596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, 597 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 598 hn_vfmap_sysctl, "A", 599 "VF mapping"); 600 601 /* Transparent VF */ 602 static int hn_xpnt_vf = 1; 603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 604 &hn_xpnt_vf, 0, "Transparent VF mod"); 605 606 /* Accurate BPF support for Transparent VF */ 607 static int hn_xpnt_vf_accbpf = 0; 608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 609 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 610 611 /* Extra wait for transparent VF attach routing; unit seconds. */ 612 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 614 &hn_xpnt_vf_attwait, 0, 615 "Extra wait for transparent VF attach routing; unit: seconds"); 616 617 static u_int hn_cpu_index; /* next CPU for channel */ 618 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 619 620 static struct rmlock hn_vfmap_lock; 621 static int hn_vfmap_size; 622 static if_t *hn_vfmap; 623 624 #ifndef RSS 625 static const uint8_t 626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 627 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 628 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 629 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 630 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 631 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 632 }; 633 #endif /* !RSS */ 634 635 static const struct hyperv_guid hn_guid = { 636 .hv_guid = { 637 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 638 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 639 }; 640 641 static device_method_t hn_methods[] = { 642 /* Device interface */ 643 DEVMETHOD(device_probe, hn_probe), 644 DEVMETHOD(device_attach, hn_attach), 645 DEVMETHOD(device_detach, hn_detach), 646 DEVMETHOD(device_shutdown, hn_shutdown), 647 DEVMETHOD_END 648 }; 649 650 static driver_t hn_driver = { 651 "hn", 652 hn_methods, 653 sizeof(struct hn_softc) 654 }; 655 656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0); 657 MODULE_VERSION(hn, 1); 658 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 659 660 static void 661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 662 { 663 int i; 664 665 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 666 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 667 } 668 669 static int 670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 671 { 672 673 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 674 txd->chim_size == 0, ("invalid rndis sglist txd")); 675 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 676 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 677 } 678 679 static int 680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 681 { 682 struct hn_nvs_rndis rndis; 683 684 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 685 txd->chim_size > 0, ("invalid rndis chim txd")); 686 687 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 688 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 689 rndis.nvs_chim_idx = txd->chim_index; 690 rndis.nvs_chim_sz = txd->chim_size; 691 692 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 693 &rndis, sizeof(rndis), &txd->send_ctx)); 694 } 695 696 static __inline uint32_t 697 hn_chim_alloc(struct hn_softc *sc) 698 { 699 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 700 u_long *bmap = sc->hn_chim_bmap; 701 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 702 703 for (i = 0; i < bmap_cnt; ++i) { 704 int idx; 705 706 idx = ffsl(~bmap[i]); 707 if (idx == 0) 708 continue; 
709 710 --idx; /* ffsl is 1-based */ 711 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 712 ("invalid i %d and idx %d", i, idx)); 713 714 if (atomic_testandset_long(&bmap[i], idx)) 715 continue; 716 717 ret = i * LONG_BIT + idx; 718 break; 719 } 720 return (ret); 721 } 722 723 static __inline void 724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 725 { 726 u_long mask; 727 uint32_t idx; 728 729 idx = chim_idx / LONG_BIT; 730 KASSERT(idx < sc->hn_chim_bmap_cnt, 731 ("invalid chimney index 0x%x", chim_idx)); 732 733 mask = 1UL << (chim_idx % LONG_BIT); 734 KASSERT(sc->hn_chim_bmap[idx] & mask, 735 ("index bitmap 0x%lx, chimney index %u, " 736 "bitmap idx %d, bitmask 0x%lx", 737 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 738 739 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 740 } 741 742 #if defined(INET6) || defined(INET) 743 744 #define PULLUP_HDR(m, len) \ 745 do { \ 746 if (__predict_false((m)->m_len < (len))) { \ 747 (m) = m_pullup((m), (len)); \ 748 if ((m) == NULL) \ 749 return (NULL); \ 750 } \ 751 } while (0) 752 753 /* 754 * NOTE: If this function failed, the m_head would be freed. 755 */ 756 static __inline struct mbuf * 757 hn_tso_fixup(struct mbuf *m_head) 758 { 759 struct ether_vlan_header *evl; 760 struct tcphdr *th; 761 int ehlen; 762 763 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 764 765 PULLUP_HDR(m_head, sizeof(*evl)); 766 evl = mtod(m_head, struct ether_vlan_header *); 767 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 768 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 769 else 770 ehlen = ETHER_HDR_LEN; 771 m_head->m_pkthdr.l2hlen = ehlen; 772 773 #ifdef INET 774 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 775 struct ip *ip; 776 int iphlen; 777 778 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 779 ip = mtodo(m_head, ehlen); 780 iphlen = ip->ip_hl << 2; 781 m_head->m_pkthdr.l3hlen = iphlen; 782 783 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 784 th = mtodo(m_head, ehlen + iphlen); 785 786 ip->ip_len = 0; 787 ip->ip_sum = 0; 788 th->th_sum = in_pseudo(ip->ip_src.s_addr, 789 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 790 } 791 #endif 792 #if defined(INET6) && defined(INET) 793 else 794 #endif 795 #ifdef INET6 796 { 797 struct ip6_hdr *ip6; 798 799 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 800 ip6 = mtodo(m_head, ehlen); 801 if (ip6->ip6_nxt != IPPROTO_TCP) { 802 m_freem(m_head); 803 return (NULL); 804 } 805 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 806 807 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 808 th = mtodo(m_head, ehlen + sizeof(*ip6)); 809 810 ip6->ip6_plen = 0; 811 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 812 } 813 #endif 814 return (m_head); 815 } 816 817 /* 818 * NOTE: If this function failed, the m_head would be freed. 
819 */ 820 static __inline struct mbuf * 821 hn_set_hlen(struct mbuf *m_head) 822 { 823 const struct ether_vlan_header *evl; 824 int ehlen; 825 826 PULLUP_HDR(m_head, sizeof(*evl)); 827 evl = mtod(m_head, const struct ether_vlan_header *); 828 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 829 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 830 else 831 ehlen = ETHER_HDR_LEN; 832 m_head->m_pkthdr.l2hlen = ehlen; 833 834 #ifdef INET 835 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 836 const struct ip *ip; 837 int iphlen; 838 839 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 840 ip = mtodo(m_head, ehlen); 841 iphlen = ip->ip_hl << 2; 842 m_head->m_pkthdr.l3hlen = iphlen; 843 844 /* 845 * UDP checksum offload does not work in Azure, if the 846 * following conditions meet: 847 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 848 * - IP_DF is not set in the IP hdr. 849 * 850 * Fallback to software checksum for these UDP datagrams. 851 */ 852 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 853 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 854 (ntohs(ip->ip_off) & IP_DF) == 0) { 855 uint16_t off = ehlen + iphlen; 856 857 counter_u64_add(hn_udpcs_fixup, 1); 858 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 859 *(uint16_t *)(m_head->m_data + off + 860 m_head->m_pkthdr.csum_data) = in_cksum_skip( 861 m_head, m_head->m_pkthdr.len, off); 862 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 863 } 864 } 865 #endif 866 #if defined(INET6) && defined(INET) 867 else 868 #endif 869 #ifdef INET6 870 { 871 const struct ip6_hdr *ip6; 872 873 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 874 ip6 = mtodo(m_head, ehlen); 875 if (ip6->ip6_nxt != IPPROTO_TCP && 876 ip6->ip6_nxt != IPPROTO_UDP) { 877 m_freem(m_head); 878 return (NULL); 879 } 880 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 881 } 882 #endif 883 return (m_head); 884 } 885 886 /* 887 * NOTE: If this function failed, the m_head would be freed. 888 */ 889 static __inline struct mbuf * 890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 891 { 892 const struct tcphdr *th; 893 int ehlen, iphlen; 894 895 *tcpsyn = 0; 896 ehlen = m_head->m_pkthdr.l2hlen; 897 iphlen = m_head->m_pkthdr.l3hlen; 898 899 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 900 th = mtodo(m_head, ehlen + iphlen); 901 if (th->th_flags & TH_SYN) 902 *tcpsyn = 1; 903 return (m_head); 904 } 905 906 #undef PULLUP_HDR 907 908 #endif /* INET6 || INET */ 909 910 static int 911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 912 { 913 int error = 0; 914 915 HN_LOCK_ASSERT(sc); 916 917 if (sc->hn_rx_filter != filter) { 918 error = hn_rndis_set_rxfilter(sc, filter); 919 if (!error) 920 sc->hn_rx_filter = filter; 921 } 922 return (error); 923 } 924 925 static int 926 hn_rxfilter_config(struct hn_softc *sc) 927 { 928 if_t ifp = sc->hn_ifp; 929 uint32_t filter; 930 931 HN_LOCK_ASSERT(sc); 932 933 /* 934 * If the non-transparent mode VF is activated, we don't know how 935 * its RX filter is configured, so stick the synthetic device in 936 * the promiscous mode. 
937 */ 938 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 939 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 940 } else { 941 filter = NDIS_PACKET_TYPE_DIRECTED; 942 if (if_getflags(ifp) & IFF_BROADCAST) 943 filter |= NDIS_PACKET_TYPE_BROADCAST; 944 /* TODO: support multicast list */ 945 if ((if_getflags(ifp) & IFF_ALLMULTI) || 946 !if_maddr_empty(ifp)) 947 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 948 } 949 return (hn_set_rxfilter(sc, filter)); 950 } 951 952 static void 953 hn_set_txagg(struct hn_softc *sc) 954 { 955 uint32_t size, pkts; 956 int i; 957 958 /* 959 * Setup aggregation size. 960 */ 961 if (sc->hn_agg_size < 0) 962 size = UINT32_MAX; 963 else 964 size = sc->hn_agg_size; 965 966 if (sc->hn_rndis_agg_size < size) 967 size = sc->hn_rndis_agg_size; 968 969 /* NOTE: We only aggregate packets using chimney sending buffers. */ 970 if (size > (uint32_t)sc->hn_chim_szmax) 971 size = sc->hn_chim_szmax; 972 973 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 974 /* Disable */ 975 size = 0; 976 pkts = 0; 977 goto done; 978 } 979 980 /* NOTE: Type of the per TX ring setting is 'int'. */ 981 if (size > INT_MAX) 982 size = INT_MAX; 983 984 /* 985 * Setup aggregation packet count. 986 */ 987 if (sc->hn_agg_pkts < 0) 988 pkts = UINT32_MAX; 989 else 990 pkts = sc->hn_agg_pkts; 991 992 if (sc->hn_rndis_agg_pkts < pkts) 993 pkts = sc->hn_rndis_agg_pkts; 994 995 if (pkts <= 1) { 996 /* Disable */ 997 size = 0; 998 pkts = 0; 999 goto done; 1000 } 1001 1002 /* NOTE: Type of the per TX ring setting is 'short'. */ 1003 if (pkts > SHRT_MAX) 1004 pkts = SHRT_MAX; 1005 1006 done: 1007 /* NOTE: Type of the per TX ring setting is 'short'. */ 1008 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1009 /* Disable */ 1010 size = 0; 1011 pkts = 0; 1012 } 1013 1014 if (bootverbose) { 1015 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1016 size, pkts, sc->hn_rndis_agg_align); 1017 } 1018 1019 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1020 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1021 1022 mtx_lock(&txr->hn_tx_lock); 1023 txr->hn_agg_szmax = size; 1024 txr->hn_agg_pktmax = pkts; 1025 txr->hn_agg_align = sc->hn_rndis_agg_align; 1026 mtx_unlock(&txr->hn_tx_lock); 1027 } 1028 } 1029 1030 static int 1031 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1032 { 1033 1034 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1035 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1036 return txr->hn_txdesc_cnt; 1037 return hn_tx_swq_depth; 1038 } 1039 1040 static int 1041 hn_rss_reconfig(struct hn_softc *sc) 1042 { 1043 int error; 1044 1045 HN_LOCK_ASSERT(sc); 1046 1047 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1048 return (ENXIO); 1049 1050 /* 1051 * Disable RSS first. 1052 * 1053 * NOTE: 1054 * Direct reconfiguration by setting the UNCHG flags does 1055 * _not_ work properly. 1056 */ 1057 if (bootverbose) 1058 if_printf(sc->hn_ifp, "disable RSS\n"); 1059 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1060 if (error) { 1061 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1062 return (error); 1063 } 1064 1065 /* 1066 * Reenable the RSS w/ the updated RSS key or indirect 1067 * table. 
1068 */ 1069 if (bootverbose) 1070 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1071 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1072 if (error) { 1073 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1074 return (error); 1075 } 1076 return (0); 1077 } 1078 1079 static void 1080 hn_rss_ind_fixup(struct hn_softc *sc) 1081 { 1082 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1083 int i, nchan; 1084 1085 nchan = sc->hn_rx_ring_inuse; 1086 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1087 1088 /* 1089 * Check indirect table to make sure that all channels in it 1090 * can be used. 1091 */ 1092 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1093 if (rss->rss_ind[i] >= nchan) { 1094 if_printf(sc->hn_ifp, 1095 "RSS indirect table %d fixup: %u -> %d\n", 1096 i, rss->rss_ind[i], nchan - 1); 1097 rss->rss_ind[i] = nchan - 1; 1098 } 1099 } 1100 } 1101 1102 static int 1103 hn_ifmedia_upd(if_t ifp __unused) 1104 { 1105 1106 /* Ignore since autoselect is the only defined and valid media */ 1107 return (0); 1108 } 1109 1110 static void 1111 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) 1112 { 1113 struct hn_softc *sc = if_getsoftc(ifp); 1114 1115 ifmr->ifm_status = IFM_AVALID; 1116 ifmr->ifm_active = IFM_ETHER; 1117 1118 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1119 ifmr->ifm_active |= IFM_NONE; 1120 return; 1121 } 1122 ifmr->ifm_status |= IFM_ACTIVE; 1123 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1124 } 1125 1126 static void 1127 hn_rxvf_set_task(void *xarg, int pending __unused) 1128 { 1129 struct hn_rxvf_setarg *arg = xarg; 1130 1131 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1132 } 1133 1134 static void 1135 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) 1136 { 1137 struct hn_rx_ring *rxr; 1138 struct hn_rxvf_setarg arg; 1139 struct task task; 1140 int i; 1141 1142 HN_LOCK_ASSERT(sc); 1143 1144 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1145 1146 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1147 rxr = &sc->hn_rx_ring[i]; 1148 1149 if (i < sc->hn_rx_ring_inuse) { 1150 arg.rxr = rxr; 1151 arg.vf_ifp = vf_ifp; 1152 vmbus_chan_run_task(rxr->hn_chan, &task); 1153 } else { 1154 rxr->hn_rxvf_ifp = vf_ifp; 1155 } 1156 } 1157 } 1158 1159 static bool 1160 hn_ismyvf(const struct hn_softc *sc, const if_t ifp) 1161 { 1162 if_t hn_ifp; 1163 1164 hn_ifp = sc->hn_ifp; 1165 1166 if (ifp == hn_ifp) 1167 return (false); 1168 1169 if (if_getalloctype(ifp) != IFT_ETHER) 1170 return (false); 1171 1172 /* Ignore lagg/vlan interfaces */ 1173 if (strcmp(if_getdname(ifp), "lagg") == 0 || 1174 strcmp(if_getdname(ifp), "vlan") == 0) 1175 return (false); 1176 1177 /* 1178 * During detach events if_getifaddr(ifp) might be NULL. 
1179 * Make sure the bcmp() below doesn't panic on that: 1180 */ 1181 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) 1182 return (false); 1183 1184 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) 1185 return (false); 1186 1187 return (true); 1188 } 1189 1190 static void 1191 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) 1192 { 1193 if_t hn_ifp; 1194 1195 HN_LOCK(sc); 1196 1197 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1198 goto out; 1199 1200 if (!hn_ismyvf(sc, ifp)) 1201 goto out; 1202 hn_ifp = sc->hn_ifp; 1203 1204 if (rxvf) { 1205 if (sc->hn_flags & HN_FLAG_RXVF) 1206 goto out; 1207 1208 sc->hn_flags |= HN_FLAG_RXVF; 1209 hn_rxfilter_config(sc); 1210 } else { 1211 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1212 goto out; 1213 1214 sc->hn_flags &= ~HN_FLAG_RXVF; 1215 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) 1216 hn_rxfilter_config(sc); 1217 else 1218 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1219 } 1220 1221 hn_nvs_set_datapath(sc, 1222 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1223 1224 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1225 1226 if (rxvf) { 1227 hn_vf_rss_fixup(sc, true); 1228 hn_suspend_mgmt(sc); 1229 sc->hn_link_flags &= 1230 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1231 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1232 } else { 1233 hn_vf_rss_restore(sc); 1234 hn_resume_mgmt(sc); 1235 } 1236 1237 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1238 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1239 1240 if (bootverbose) { 1241 if_printf(hn_ifp, "datapath is switched %s %s\n", 1242 rxvf ? "to" : "from", if_name(ifp)); 1243 } 1244 out: 1245 HN_UNLOCK(sc); 1246 } 1247 1248 static void 1249 hn_ifnet_event(void *arg, if_t ifp, int event) 1250 { 1251 1252 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1253 return; 1254 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1255 } 1256 1257 static void 1258 hn_ifaddr_event(void *arg, if_t ifp) 1259 { 1260 1261 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); 1262 } 1263 1264 static int 1265 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused) 1266 { 1267 if_t ifp, vf_ifp; 1268 1269 HN_LOCK_ASSERT(sc); 1270 ifp = sc->hn_ifp; 1271 vf_ifp = sc->hn_vf_ifp; 1272 1273 /* 1274 * Just sync up with VF's enabled capabilities. 1275 */ 1276 if_setcapenable(ifp, if_getcapenable(vf_ifp)); 1277 if_sethwassist(ifp, if_gethwassist(vf_ifp)); 1278 1279 return (0); 1280 } 1281 1282 static int 1283 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1284 { 1285 if_t vf_ifp; 1286 struct ifreq ifr; 1287 1288 HN_LOCK_ASSERT(sc); 1289 vf_ifp = sc->hn_vf_ifp; 1290 1291 memset(&ifr, 0, sizeof(ifr)); 1292 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1293 ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff; 1294 ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16; 1295 return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread)); 1296 } 1297 1298 static void 1299 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1300 { 1301 if_t ifp = sc->hn_ifp; 1302 int allmulti = 0; 1303 1304 HN_LOCK_ASSERT(sc); 1305 1306 /* XXX vlan(4) style mcast addr maintenance */ 1307 if (!if_maddr_empty(ifp)) 1308 allmulti = IFF_ALLMULTI; 1309 1310 /* Always set the VF's if_flags */ 1311 if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti); 1312 } 1313 1314 static void 1315 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m) 1316 { 1317 struct rm_priotracker pt; 1318 if_t hn_ifp = NULL; 1319 struct mbuf *mn; 1320 1321 /* 1322 * XXX racy, if hn(4) ever detached. 
1323 */ 1324 rm_rlock(&hn_vfmap_lock, &pt); 1325 if (if_getindex(vf_ifp) < hn_vfmap_size) 1326 hn_ifp = hn_vfmap[if_getindex(vf_ifp)]; 1327 rm_runlock(&hn_vfmap_lock, &pt); 1328 1329 if (hn_ifp != NULL) { 1330 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1331 /* 1332 * Allow tapping on the VF. 1333 */ 1334 ETHER_BPF_MTAP(vf_ifp, mn); 1335 1336 /* 1337 * Update VF stats. 1338 */ 1339 if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) { 1340 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1341 mn->m_pkthdr.len); 1342 } 1343 /* 1344 * XXX IFCOUNTER_IMCAST 1345 * This stat updating is kinda invasive, since it 1346 * requires two checks on the mbuf: the length check 1347 * and the ethernet header check. As of this write, 1348 * all multicast packets go directly to hn(4), which 1349 * makes imcast stat updating in the VF a try in vian. 1350 */ 1351 1352 /* 1353 * Fix up rcvif and increase hn(4)'s ipackets. 1354 */ 1355 mn->m_pkthdr.rcvif = hn_ifp; 1356 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1357 } 1358 /* 1359 * Go through hn(4)'s if_input. 1360 */ 1361 if_input(hn_ifp, m); 1362 } else { 1363 /* 1364 * In the middle of the transition; free this 1365 * mbuf chain. 1366 */ 1367 while (m != NULL) { 1368 mn = m->m_nextpkt; 1369 m->m_nextpkt = NULL; 1370 m_freem(m); 1371 m = mn; 1372 } 1373 } 1374 } 1375 1376 static void 1377 hn_mtu_change_fixup(struct hn_softc *sc) 1378 { 1379 if_t ifp; 1380 1381 HN_LOCK_ASSERT(sc); 1382 ifp = sc->hn_ifp; 1383 1384 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); 1385 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1386 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1387 } 1388 1389 static uint32_t 1390 hn_rss_type_fromndis(uint32_t rss_hash) 1391 { 1392 uint32_t types = 0; 1393 1394 if (rss_hash & NDIS_HASH_IPV4) 1395 types |= RSS_TYPE_IPV4; 1396 if (rss_hash & NDIS_HASH_TCP_IPV4) 1397 types |= RSS_TYPE_TCP_IPV4; 1398 if (rss_hash & NDIS_HASH_IPV6) 1399 types |= RSS_TYPE_IPV6; 1400 if (rss_hash & NDIS_HASH_IPV6_EX) 1401 types |= RSS_TYPE_IPV6_EX; 1402 if (rss_hash & NDIS_HASH_TCP_IPV6) 1403 types |= RSS_TYPE_TCP_IPV6; 1404 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1405 types |= RSS_TYPE_TCP_IPV6_EX; 1406 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1407 types |= RSS_TYPE_UDP_IPV4; 1408 return (types); 1409 } 1410 1411 static uint32_t 1412 hn_rss_type_tondis(uint32_t types) 1413 { 1414 uint32_t rss_hash = 0; 1415 1416 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1417 ("UDP6 and UDP6EX are not supported")); 1418 1419 if (types & RSS_TYPE_IPV4) 1420 rss_hash |= NDIS_HASH_IPV4; 1421 if (types & RSS_TYPE_TCP_IPV4) 1422 rss_hash |= NDIS_HASH_TCP_IPV4; 1423 if (types & RSS_TYPE_IPV6) 1424 rss_hash |= NDIS_HASH_IPV6; 1425 if (types & RSS_TYPE_IPV6_EX) 1426 rss_hash |= NDIS_HASH_IPV6_EX; 1427 if (types & RSS_TYPE_TCP_IPV6) 1428 rss_hash |= NDIS_HASH_TCP_IPV6; 1429 if (types & RSS_TYPE_TCP_IPV6_EX) 1430 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1431 if (types & RSS_TYPE_UDP_IPV4) 1432 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1433 return (rss_hash); 1434 } 1435 1436 static void 1437 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1438 { 1439 int i; 1440 1441 HN_LOCK_ASSERT(sc); 1442 1443 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1444 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1445 } 1446 1447 static void 1448 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1449 { 1450 if_t ifp, vf_ifp; 1451 struct ifrsshash ifrh; 1452 struct ifrsskey ifrk; 1453 int error; 1454 uint32_t my_types, diff_types, mbuf_types = 0; 1455 1456 HN_LOCK_ASSERT(sc); 
1457 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1458 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1459 1460 if (sc->hn_rx_ring_inuse == 1) { 1461 /* No RSS on synthetic parts; done. */ 1462 return; 1463 } 1464 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1465 /* Synthetic parts do not support Toeplitz; done. */ 1466 return; 1467 } 1468 1469 ifp = sc->hn_ifp; 1470 vf_ifp = sc->hn_vf_ifp; 1471 1472 /* 1473 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1474 * supported. 1475 */ 1476 memset(&ifrk, 0, sizeof(ifrk)); 1477 strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name)); 1478 error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread); 1479 if (error) { 1480 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1481 if_name(vf_ifp), error); 1482 goto done; 1483 } 1484 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1485 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1486 if_name(vf_ifp), ifrk.ifrk_func); 1487 goto done; 1488 } 1489 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1490 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1491 if_name(vf_ifp), ifrk.ifrk_keylen); 1492 goto done; 1493 } 1494 1495 /* 1496 * Extract VF's RSS hash. Only Toeplitz is supported. 1497 */ 1498 memset(&ifrh, 0, sizeof(ifrh)); 1499 strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name)); 1500 error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread); 1501 if (error) { 1502 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", 1503 if_name(vf_ifp), error); 1504 goto done; 1505 } 1506 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1507 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1508 if_name(vf_ifp), ifrh.ifrh_func); 1509 goto done; 1510 } 1511 1512 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1513 if ((ifrh.ifrh_types & my_types) == 0) { 1514 /* This disables RSS; ignore it then */ 1515 if_printf(ifp, "%s intersection of RSS types failed. " 1516 "VF %#x, mine %#x\n", if_name(vf_ifp), 1517 ifrh.ifrh_types, my_types); 1518 goto done; 1519 } 1520 1521 diff_types = my_types ^ ifrh.ifrh_types; 1522 my_types &= ifrh.ifrh_types; 1523 mbuf_types = my_types; 1524 1525 /* 1526 * Detect RSS hash value/type confliction. 1527 * 1528 * NOTE: 1529 * We don't disable the hash type, but stop delivery the hash 1530 * value/type through mbufs on RX path. 1531 * 1532 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1533 * hash is delivered with type of TCP_IPV4. This means if 1534 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1535 * least to hn_mbuf_hash. However, given that _all_ of the 1536 * NICs implement TCP_IPV4, this will _not_ impose any issues 1537 * here. 1538 */ 1539 if ((my_types & RSS_TYPE_IPV4) && 1540 (diff_types & ifrh.ifrh_types & 1541 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1542 /* Conflict; disable IPV4 hash type/value delivery. */ 1543 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1544 mbuf_types &= ~RSS_TYPE_IPV4; 1545 } 1546 if ((my_types & RSS_TYPE_IPV6) && 1547 (diff_types & ifrh.ifrh_types & 1548 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1549 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1550 RSS_TYPE_IPV6_EX))) { 1551 /* Conflict; disable IPV6 hash type/value delivery. 
*/ 1552 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1553 mbuf_types &= ~RSS_TYPE_IPV6; 1554 } 1555 if ((my_types & RSS_TYPE_IPV6_EX) && 1556 (diff_types & ifrh.ifrh_types & 1557 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1558 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1559 RSS_TYPE_IPV6))) { 1560 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1561 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1562 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1563 } 1564 if ((my_types & RSS_TYPE_TCP_IPV6) && 1565 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1566 /* Conflict; disable TCP_IPV6 hash type/value delivery. */ 1567 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1568 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1569 } 1570 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1571 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1572 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1573 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1574 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1575 } 1576 if ((my_types & RSS_TYPE_UDP_IPV6) && 1577 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1578 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1579 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1580 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1581 } 1582 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1583 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1584 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1585 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1586 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1587 } 1588 1589 /* 1590 * Indirect table does not matter. 1591 */ 1592 1593 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1594 hn_rss_type_tondis(my_types); 1595 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1596 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1597 1598 if (reconf) { 1599 error = hn_rss_reconfig(sc); 1600 if (error) { 1601 /* XXX roll-back? */ 1602 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1603 /* XXX keep going. */ 1604 } 1605 } 1606 done: 1607 /* Hash deliverability for mbufs. */ 1608 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1609 } 1610 1611 static void 1612 hn_vf_rss_restore(struct hn_softc *sc) 1613 { 1614 1615 HN_LOCK_ASSERT(sc); 1616 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1617 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1618 1619 if (sc->hn_rx_ring_inuse == 1) 1620 goto done; 1621 1622 /* 1623 * Restore hash types. Key does _not_ matter. 1624 */ 1625 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1626 int error; 1627 1628 sc->hn_rss_hash = sc->hn_rss_hcap; 1629 error = hn_rss_reconfig(sc); 1630 if (error) { 1631 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1632 error); 1633 /* XXX keep going. */ 1634 } 1635 } 1636 done: 1637 /* Hash deliverability for mbufs. */ 1638 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1639 } 1640 1641 static void 1642 hn_xpnt_vf_setready(struct hn_softc *sc) 1643 { 1644 if_t ifp, vf_ifp; 1645 struct ifreq ifr; 1646 1647 HN_LOCK_ASSERT(sc); 1648 ifp = sc->hn_ifp; 1649 vf_ifp = sc->hn_vf_ifp; 1650 1651 /* 1652 * Mark the VF ready. 1653 */ 1654 sc->hn_vf_rdytick = 0; 1655 1656 /* 1657 * Save information for restoration. 
1658 */ 1659 sc->hn_saved_caps = if_getcapabilities(ifp); 1660 sc->hn_saved_tsomax = if_gethwtsomax(ifp); 1661 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); 1662 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); 1663 sc->hn_saved_capenable = if_getcapenable(ifp); 1664 sc->hn_saved_hwassist = if_gethwassist(ifp); 1665 1666 /* 1667 * Intersect supported/enabled capabilities. 1668 * 1669 * NOTE: 1670 * if_hwassist is not changed here. 1671 */ 1672 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); 1673 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); 1674 1675 /* 1676 * Fix TSO settings. 1677 */ 1678 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) 1679 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); 1680 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) 1681 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); 1682 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) 1683 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); 1684 1685 /* 1686 * Change VF's enabled capabilities. 1687 */ 1688 memset(&ifr, 0, sizeof(ifr)); 1689 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1690 ifr.ifr_reqcap = if_getcapenable(ifp); 1691 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1692 1693 if (if_getmtu(ifp) != ETHERMTU) { 1694 int error; 1695 1696 /* 1697 * Change VF's MTU. 1698 */ 1699 memset(&ifr, 0, sizeof(ifr)); 1700 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1701 ifr.ifr_mtu = if_getmtu(ifp); 1702 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); 1703 if (error) { 1704 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1705 if_name(vf_ifp), if_getmtu(ifp)); 1706 if (if_getmtu(ifp) > ETHERMTU) { 1707 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1708 1709 /* 1710 * XXX 1711 * No need to adjust the synthetic parts' MTU; 1712 * failure of the adjustment will cause us 1713 * infinite headache. 1714 */ 1715 if_setmtu(ifp, ETHERMTU); 1716 hn_mtu_change_fixup(sc); 1717 } 1718 } 1719 } 1720 } 1721 1722 static bool 1723 hn_xpnt_vf_isready(struct hn_softc *sc) 1724 { 1725 1726 HN_LOCK_ASSERT(sc); 1727 1728 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1729 return (false); 1730 1731 if (sc->hn_vf_rdytick == 0) 1732 return (true); 1733 1734 if (sc->hn_vf_rdytick > ticks) 1735 return (false); 1736 1737 /* Mark VF as ready. 
*/ 1738 hn_xpnt_vf_setready(sc); 1739 return (true); 1740 } 1741 1742 static void 1743 hn_xpnt_vf_setenable(struct hn_softc *sc) 1744 { 1745 int i; 1746 1747 HN_LOCK_ASSERT(sc); 1748 1749 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1750 rm_wlock(&sc->hn_vf_lock); 1751 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1752 rm_wunlock(&sc->hn_vf_lock); 1753 1754 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1755 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1756 } 1757 1758 static void 1759 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1760 { 1761 int i; 1762 1763 HN_LOCK_ASSERT(sc); 1764 1765 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1766 rm_wlock(&sc->hn_vf_lock); 1767 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1768 if (clear_vf) 1769 sc->hn_vf_ifp = NULL; 1770 rm_wunlock(&sc->hn_vf_lock); 1771 1772 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1773 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1774 } 1775 1776 static void 1777 hn_xpnt_vf_init(struct hn_softc *sc) 1778 { 1779 int error; 1780 1781 HN_LOCK_ASSERT(sc); 1782 1783 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1784 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1785 1786 if (bootverbose) { 1787 if_printf(sc->hn_ifp, "try bringing up %s\n", 1788 if_name(sc->hn_vf_ifp)); 1789 } 1790 1791 /* 1792 * Bring the VF up. 1793 */ 1794 hn_xpnt_vf_saveifflags(sc); 1795 if_setflagbits(sc->hn_ifp, IFF_UP, 0); 1796 error = hn_xpnt_vf_iocsetflags(sc); 1797 if (error) { 1798 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1799 if_name(sc->hn_vf_ifp), error); 1800 return; 1801 } 1802 1803 /* 1804 * NOTE: 1805 * Datapath setting must happen _after_ bringing the VF up. 1806 */ 1807 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1808 1809 /* 1810 * NOTE: 1811 * Fixup RSS related bits _after_ the VF is brought up, since 1812 * many VFs generate RSS key during it's initialization. 1813 */ 1814 hn_vf_rss_fixup(sc, true); 1815 1816 /* Mark transparent mode VF as enabled. */ 1817 hn_xpnt_vf_setenable(sc); 1818 } 1819 1820 static void 1821 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1822 { 1823 struct hn_softc *sc = xsc; 1824 1825 HN_LOCK(sc); 1826 1827 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1828 goto done; 1829 if (sc->hn_vf_ifp == NULL) 1830 goto done; 1831 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1832 goto done; 1833 1834 if (sc->hn_vf_rdytick != 0) { 1835 /* Mark VF as ready. */ 1836 hn_xpnt_vf_setready(sc); 1837 } 1838 1839 if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) { 1840 /* 1841 * Delayed VF initialization. 1842 */ 1843 if (bootverbose) { 1844 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1845 if_name(sc->hn_vf_ifp)); 1846 } 1847 hn_xpnt_vf_init(sc); 1848 } 1849 done: 1850 HN_UNLOCK(sc); 1851 } 1852 1853 static void 1854 hn_ifnet_attevent(void *xsc, if_t ifp) 1855 { 1856 struct hn_softc *sc = xsc; 1857 1858 HN_LOCK(sc); 1859 1860 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1861 goto done; 1862 1863 if (!hn_ismyvf(sc, ifp)) 1864 goto done; 1865 1866 if (sc->hn_vf_ifp != NULL) { 1867 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1868 if_name(sc->hn_vf_ifp)); 1869 goto done; 1870 } 1871 1872 if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) { 1873 /* 1874 * ifnet.if_start is _not_ supported by transparent 1875 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 
1876 */ 1877 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1878 "in transparent VF mode.\n", if_name(sc->hn_vf_ifp)); 1879 1880 goto done; 1881 } 1882 1883 rm_wlock(&hn_vfmap_lock); 1884 1885 if (if_getindex(ifp) >= hn_vfmap_size) { 1886 if_t *newmap; 1887 int newsize; 1888 1889 newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF; 1890 newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF, 1891 M_WAITOK | M_ZERO); 1892 1893 memcpy(newmap, hn_vfmap, 1894 sizeof(if_t) * hn_vfmap_size); 1895 free(hn_vfmap, M_DEVBUF); 1896 hn_vfmap = newmap; 1897 hn_vfmap_size = newsize; 1898 } 1899 KASSERT(hn_vfmap[if_getindex(ifp)] == NULL, 1900 ("%s: ifindex %d was mapped to %s", 1901 if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); 1902 hn_vfmap[if_getindex(ifp)] = sc->hn_ifp; 1903 1904 rm_wunlock(&hn_vfmap_lock); 1905 1906 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1907 rm_wlock(&sc->hn_vf_lock); 1908 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1909 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1910 sc->hn_vf_ifp = ifp; 1911 rm_wunlock(&sc->hn_vf_lock); 1912 1913 if (hn_xpnt_vf) { 1914 int wait_ticks; 1915 1916 /* 1917 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1918 * Save vf_ifp's current if_input for later restoration. 1919 */ 1920 sc->hn_vf_input = if_getinputfn(ifp); 1921 if_setinputfn(ifp, hn_xpnt_vf_input); 1922 1923 /* 1924 * Stop link status management; use the VF's. 1925 */ 1926 hn_suspend_mgmt(sc); 1927 1928 /* 1929 * Give VF sometime to complete its attach routing. 1930 */ 1931 wait_ticks = hn_xpnt_vf_attwait * hz; 1932 sc->hn_vf_rdytick = ticks + wait_ticks; 1933 1934 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, 1935 wait_ticks); 1936 } 1937 done: 1938 HN_UNLOCK(sc); 1939 } 1940 1941 static void 1942 hn_ifnet_detevent(void *xsc, if_t ifp) 1943 { 1944 struct hn_softc *sc = xsc; 1945 1946 HN_LOCK(sc); 1947 1948 if (sc->hn_vf_ifp == NULL) 1949 goto done; 1950 1951 if (!hn_ismyvf(sc, ifp)) 1952 goto done; 1953 1954 if (hn_xpnt_vf) { 1955 /* 1956 * Make sure that the delayed initialization is not running. 1957 * 1958 * NOTE: 1959 * - This lock _must_ be released, since the hn_vf_init task 1960 * will try holding this lock. 1961 * - It is safe to release this lock here, since the 1962 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. 1963 * 1964 * XXX racy, if hn(4) ever detached. 1965 */ 1966 HN_UNLOCK(sc); 1967 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); 1968 HN_LOCK(sc); 1969 1970 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", 1971 if_name(sc->hn_ifp))); 1972 if_setinputfn(ifp, sc->hn_vf_input); 1973 sc->hn_vf_input = NULL; 1974 1975 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && 1976 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) 1977 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 1978 1979 if (sc->hn_vf_rdytick == 0) { 1980 /* 1981 * The VF was ready; restore some settings. 1982 */ 1983 if_setcapabilities(ifp, sc->hn_saved_caps); 1984 1985 if_sethwtsomax(ifp, sc->hn_saved_tsomax); 1986 if_sethwtsomaxsegcount(sc->hn_ifp, 1987 sc->hn_saved_tsosegcnt); 1988 if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz); 1989 1990 if_setcapenable(ifp, sc->hn_saved_capenable); 1991 if_sethwassist(ifp, sc->hn_saved_hwassist); 1992 } 1993 1994 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 1995 /* 1996 * Restore RSS settings. 1997 */ 1998 hn_vf_rss_restore(sc); 1999 2000 /* 2001 * Resume link status management, which was suspended 2002 * by hn_ifnet_attevent(). 
2003 */ 2004 hn_resume_mgmt(sc); 2005 } 2006 } 2007 2008 /* Mark transparent mode VF as disabled. */ 2009 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2010 2011 rm_wlock(&hn_vfmap_lock); 2012 2013 KASSERT(if_getindex(ifp) < hn_vfmap_size, 2014 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); 2015 if (hn_vfmap[if_getindex(ifp)] != NULL) { 2016 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, 2017 ("%s: ifindex %d was mapped to %s", 2018 if_name(ifp), if_getindex(ifp), 2019 if_name(hn_vfmap[if_getindex(ifp)]))); 2020 hn_vfmap[if_getindex(ifp)] = NULL; 2021 } 2022 2023 rm_wunlock(&hn_vfmap_lock); 2024 done: 2025 HN_UNLOCK(sc); 2026 } 2027 2028 static void 2029 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) 2030 { 2031 struct hn_softc *sc = xsc; 2032 2033 if (sc->hn_vf_ifp == ifp) 2034 if_link_state_change(sc->hn_ifp, link_state); 2035 } 2036 2037 static int 2038 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) 2039 { 2040 struct hn_softc *sc = arg1; 2041 unsigned int tsomax; 2042 int error; 2043 2044 tsomax = if_gethwtsomax(sc->hn_ifp); 2045 error = sysctl_handle_int(oidp, &tsomax, 0, req); 2046 return error; 2047 } 2048 2049 static int 2050 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) 2051 { 2052 struct hn_softc *sc = arg1; 2053 unsigned int tsomaxsegcnt; 2054 int error; 2055 2056 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); 2057 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); 2058 return error; 2059 } 2060 2061 static int 2062 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) 2063 { 2064 struct hn_softc *sc = arg1; 2065 unsigned int tsomaxsegsz; 2066 int error; 2067 2068 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); 2069 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); 2070 return error; 2071 } 2072 2073 static int 2074 hn_probe(device_t dev) 2075 { 2076 2077 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2078 device_set_desc(dev, "Hyper-V Network Interface"); 2079 return BUS_PROBE_DEFAULT; 2080 } 2081 return ENXIO; 2082 } 2083 2084 static int 2085 hn_attach(device_t dev) 2086 { 2087 struct hn_softc *sc = device_get_softc(dev); 2088 struct sysctl_oid_list *child; 2089 struct sysctl_ctx_list *ctx; 2090 uint8_t eaddr[ETHER_ADDR_LEN]; 2091 if_t ifp = NULL; 2092 int error, ring_cnt, tx_ring_cnt; 2093 uint32_t mtu; 2094 2095 sc->hn_dev = dev; 2096 sc->hn_prichan = vmbus_get_channel(dev); 2097 HN_LOCK_INIT(sc); 2098 rm_init(&sc->hn_vf_lock, "hnvf"); 2099 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2100 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2101 2102 /* 2103 * Initialize these tunables once. 2104 */ 2105 sc->hn_agg_size = hn_tx_agg_size; 2106 sc->hn_agg_pkts = hn_tx_agg_pkts; 2107 2108 /* 2109 * Setup taskqueue for transmission. 2110 */ 2111 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2112 int i; 2113 2114 sc->hn_tx_taskqs = 2115 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2116 M_DEVBUF, M_WAITOK); 2117 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2118 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2119 M_WAITOK, taskqueue_thread_enqueue, 2120 &sc->hn_tx_taskqs[i]); 2121 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2122 "%s tx%d", device_get_nameunit(dev), i); 2123 } 2124 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2125 sc->hn_tx_taskqs = hn_tx_taskque; 2126 } 2127 2128 /* 2129 * Setup taskqueue for mangement tasks, e.g. link status. 
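 * Network-change handling (hn_netchg_init/hn_netchg_status) also runs
 * from this taskqueue.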
2130 */ 2131 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2132 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2133 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2134 device_get_nameunit(dev)); 2135 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2136 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2137 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2138 hn_netchg_status_taskfunc, sc); 2139 2140 if (hn_xpnt_vf) { 2141 /* 2142 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2143 */ 2144 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2145 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2146 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2147 device_get_nameunit(dev)); 2148 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2149 hn_xpnt_vf_init_taskfunc, sc); 2150 } 2151 2152 /* 2153 * Allocate ifnet and setup its name earlier, so that if_printf 2154 * can be used by functions, which will be called after 2155 * ether_ifattach(). 2156 */ 2157 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2158 if_setsoftc(ifp, sc); 2159 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2160 2161 /* 2162 * Initialize ifmedia earlier so that it can be unconditionally 2163 * destroyed, if error happened later on. 2164 */ 2165 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2166 2167 /* 2168 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2169 * to use (tx_ring_cnt). 2170 * 2171 * NOTE: 2172 * The # of RX rings to use is same as the # of channels to use. 2173 */ 2174 ring_cnt = hn_chan_cnt; 2175 if (ring_cnt <= 0) { 2176 /* Default */ 2177 ring_cnt = mp_ncpus; 2178 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2179 ring_cnt = HN_RING_CNT_DEF_MAX; 2180 } else if (ring_cnt > mp_ncpus) { 2181 ring_cnt = mp_ncpus; 2182 } 2183 #ifdef RSS 2184 if (ring_cnt > rss_getnumbuckets()) 2185 ring_cnt = rss_getnumbuckets(); 2186 #endif 2187 2188 tx_ring_cnt = hn_tx_ring_cnt; 2189 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2190 tx_ring_cnt = ring_cnt; 2191 #ifdef HN_IFSTART_SUPPORT 2192 if (hn_use_if_start) { 2193 /* ifnet.if_start only needs one TX ring. */ 2194 tx_ring_cnt = 1; 2195 } 2196 #endif 2197 2198 /* 2199 * Set the leader CPU for channels. 2200 */ 2201 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2202 2203 /* 2204 * Create enough TX/RX rings, even if only limited number of 2205 * channels can be allocated. 2206 */ 2207 error = hn_create_tx_data(sc, tx_ring_cnt); 2208 if (error) 2209 goto failed; 2210 error = hn_create_rx_data(sc, ring_cnt); 2211 if (error) 2212 goto failed; 2213 2214 /* 2215 * Create transaction context for NVS and RNDIS transactions. 2216 */ 2217 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2218 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2219 if (sc->hn_xact == NULL) { 2220 error = ENXIO; 2221 goto failed; 2222 } 2223 2224 /* 2225 * Install orphan handler for the revocation of this device's 2226 * primary channel. 2227 * 2228 * NOTE: 2229 * The processing order is critical here: 2230 * Install the orphan handler, _before_ testing whether this 2231 * device's primary channel has been revoked or not. 2232 */ 2233 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2234 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2235 error = ENXIO; 2236 goto failed; 2237 } 2238 2239 /* 2240 * Attach the synthetic parts, i.e. NVS and RNDIS. 
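 * The default MTU is used here; the MTU reported by RNDIS is applied
 * later, if it is smaller.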
2241 */ 2242 error = hn_synth_attach(sc, ETHERMTU); 2243 if (error) 2244 goto failed; 2245 2246 error = hn_rndis_get_eaddr(sc, eaddr); 2247 if (error) 2248 goto failed; 2249 2250 error = hn_rndis_get_mtu(sc, &mtu); 2251 if (error) 2252 mtu = ETHERMTU; 2253 else if (bootverbose) 2254 device_printf(dev, "RNDIS mtu %u\n", mtu); 2255 2256 if (sc->hn_rx_ring_inuse > 1) { 2257 /* 2258 * Reduce TCP segment aggregation limit for multiple 2259 * RX rings to increase ACK timeliness. 2260 */ 2261 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2262 } 2263 2264 /* 2265 * Fixup TX/RX stuffs after synthetic parts are attached. 2266 */ 2267 hn_fixup_tx_data(sc); 2268 hn_fixup_rx_data(sc); 2269 2270 ctx = device_get_sysctl_ctx(dev); 2271 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2272 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2273 &sc->hn_nvs_ver, 0, "NVS version"); 2274 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2275 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2276 hn_ndis_version_sysctl, "A", "NDIS version"); 2277 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2278 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2279 hn_caps_sysctl, "A", "capabilities"); 2280 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2281 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2282 hn_hwassist_sysctl, "A", "hwassist"); 2283 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", 2284 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, 2285 "IU", "max TSO size"); 2286 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", 2287 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, 2288 "IU", "max # of TSO segments"); 2289 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", 2290 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, 2291 "IU", "max size of TSO segment"); 2292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2293 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2294 hn_rxfilter_sysctl, "A", "rxfilter"); 2295 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2296 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2297 hn_rss_hash_sysctl, "A", "RSS hash"); 2298 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2299 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2300 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2301 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2302 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2303 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2304 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2305 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2306 #ifndef RSS 2307 /* 2308 * Don't allow RSS key/indirect table changes, if RSS is defined. 
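 * I.e. the rss_key and rss_ind sysctls below are created only when the
 * kernel RSS option is not compiled in.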
2309 */ 2310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2311 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2312 hn_rss_key_sysctl, "IU", "RSS key"); 2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2314 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2315 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2316 #endif 2317 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2318 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2319 "RNDIS offered packet transmission aggregation size limit"); 2320 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2321 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2322 "RNDIS offered packet transmission aggregation count limit"); 2323 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2324 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2325 "RNDIS packet transmission aggregation alignment"); 2326 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2327 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2328 hn_txagg_size_sysctl, "I", 2329 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2330 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2331 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2332 hn_txagg_pkts_sysctl, "I", 2333 "Packet transmission aggregation packets, " 2334 "0 -- disable, -1 -- auto"); 2335 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2336 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2337 hn_polling_sysctl, "I", 2338 "Polling frequency: [100,1000000], 0 disable polling"); 2339 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2340 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2341 hn_vf_sysctl, "A", "Virtual Function's name"); 2342 if (!hn_xpnt_vf) { 2343 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2344 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2345 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2346 } else { 2347 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2348 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2349 hn_xpnt_vf_enabled_sysctl, "I", 2350 "Transparent VF enabled"); 2351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2352 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2353 hn_xpnt_vf_accbpf_sysctl, "I", 2354 "Accurate BPF for transparent VF"); 2355 } 2356 2357 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2358 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2359 "switch to rsc"); 2360 2361 /* 2362 * Setup the ifmedia, which has been initialized earlier. 2363 */ 2364 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2365 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2366 /* XXX ifmedia_set really should do this for us */ 2367 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2368 2369 /* 2370 * Setup the ifnet for this interface. 2371 */ 2372 2373 if_setbaudrate(ifp, IF_Gbps(10)); 2374 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); 2375 if_setioctlfn(ifp, hn_ioctl); 2376 if_setinitfn(ifp, hn_init); 2377 #ifdef HN_IFSTART_SUPPORT 2378 if (hn_use_if_start) { 2379 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2380 2381 if_setstartfn(ifp, hn_start); 2382 if_setsendqlen(ifp, qdepth); 2383 if_setsendqready(ifp); 2384 } else 2385 #endif 2386 { 2387 if_settransmitfn(ifp, hn_transmit); 2388 if_setqflushfn(ifp, hn_xmit_qflush); 2389 } 2390 2391 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); 2392 #ifdef foo 2393 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
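 * This block is compiled out, so IFCAP_RXCSUM_IPV6 is never advertised here.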
*/ 2394 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); 2395 #endif 2396 if (sc->hn_caps & HN_CAP_VLAN) { 2397 /* XXX not sure about VLAN_MTU. */ 2398 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); 2399 } 2400 2401 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); 2402 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) 2403 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); 2404 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) 2405 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); 2406 if (sc->hn_caps & HN_CAP_TSO4) { 2407 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); 2408 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 2409 } 2410 if (sc->hn_caps & HN_CAP_TSO6) { 2411 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); 2412 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 2413 } 2414 2415 /* Enable all available capabilities by default. */ 2416 if_setcapenable(ifp, if_getcapabilities(ifp)); 2417 2418 /* 2419 * Disable IPv6 TSO and TXCSUM by default, they still can 2420 * be enabled through SIOCSIFCAP. 2421 */ 2422 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); 2423 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); 2424 2425 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { 2426 /* 2427 * Lock hn_set_tso_maxsize() to simplify its 2428 * internal logic. 2429 */ 2430 HN_LOCK(sc); 2431 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2432 HN_UNLOCK(sc); 2433 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); 2434 if_sethwtsomaxsegsize(ifp, PAGE_SIZE); 2435 } 2436 2437 ether_ifattach(ifp, eaddr); 2438 2439 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2440 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2441 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); 2442 } 2443 if (mtu < ETHERMTU) { 2444 2445 if_setmtu(ifp, mtu); 2446 } 2447 2448 /* Inform the upper layer about the long frame support. */ 2449 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); 2450 2451 /* 2452 * Kick off link status check. 2453 */ 2454 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2455 hn_update_link_status(sc); 2456 2457 if (!hn_xpnt_vf) { 2458 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2459 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2460 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2461 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2462 } else { 2463 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2464 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2465 } 2466 2467 /* 2468 * NOTE: 2469 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2470 * since interface's LLADDR is needed; interface LLADDR is not 2471 * available when ifnet_arrival event is triggered. 2472 */ 2473 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2474 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2475 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2476 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2477 2478 return (0); 2479 failed: 2480 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2481 hn_synth_detach(sc); 2482 hn_detach(dev); 2483 return (error); 2484 } 2485 2486 static int 2487 hn_detach(device_t dev) 2488 { 2489 struct hn_softc *sc = device_get_softc(dev); 2490 if_t ifp = sc->hn_ifp, vf_ifp; 2491 2492 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2493 /* 2494 * In case that the vmbus missed the orphan handler 2495 * installation. 
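 * Orphan the transaction context directly, so that pending transactions
 * do not block detach.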
2496 */ 2497 vmbus_xact_ctx_orphan(sc->hn_xact); 2498 } 2499 2500 if (sc->hn_ifaddr_evthand != NULL) 2501 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2502 if (sc->hn_ifnet_evthand != NULL) 2503 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2504 if (sc->hn_ifnet_atthand != NULL) { 2505 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2506 sc->hn_ifnet_atthand); 2507 } 2508 if (sc->hn_ifnet_dethand != NULL) { 2509 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2510 sc->hn_ifnet_dethand); 2511 } 2512 if (sc->hn_ifnet_lnkhand != NULL) 2513 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2514 2515 vf_ifp = sc->hn_vf_ifp; 2516 __compiler_membar(); 2517 if (vf_ifp != NULL) 2518 hn_ifnet_detevent(sc, vf_ifp); 2519 2520 if (device_is_attached(dev)) { 2521 HN_LOCK(sc); 2522 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2523 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 2524 hn_stop(sc, true); 2525 /* 2526 * NOTE: 2527 * hn_stop() only suspends data, so management 2528 * stuffs have to be suspended manually here. 2529 */ 2530 hn_suspend_mgmt(sc); 2531 hn_synth_detach(sc); 2532 } 2533 HN_UNLOCK(sc); 2534 ether_ifdetach(ifp); 2535 } 2536 2537 ifmedia_removeall(&sc->hn_media); 2538 hn_destroy_rx_data(sc); 2539 hn_destroy_tx_data(sc); 2540 2541 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2542 int i; 2543 2544 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2545 taskqueue_free(sc->hn_tx_taskqs[i]); 2546 free(sc->hn_tx_taskqs, M_DEVBUF); 2547 } 2548 taskqueue_free(sc->hn_mgmt_taskq0); 2549 if (sc->hn_vf_taskq != NULL) 2550 taskqueue_free(sc->hn_vf_taskq); 2551 2552 if (sc->hn_xact != NULL) { 2553 /* 2554 * Uninstall the orphan handler _before_ the xact is 2555 * destructed. 2556 */ 2557 vmbus_chan_unset_orphan(sc->hn_prichan); 2558 vmbus_xact_ctx_destroy(sc->hn_xact); 2559 } 2560 2561 if_free(ifp); 2562 2563 HN_LOCK_DESTROY(sc); 2564 rm_destroy(&sc->hn_vf_lock); 2565 return (0); 2566 } 2567 2568 static int 2569 hn_shutdown(device_t dev) 2570 { 2571 2572 return (0); 2573 } 2574 2575 static void 2576 hn_link_status(struct hn_softc *sc) 2577 { 2578 uint32_t link_status; 2579 int error; 2580 2581 error = hn_rndis_get_linkstatus(sc, &link_status); 2582 if (error) { 2583 /* XXX what to do? */ 2584 return; 2585 } 2586 2587 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2588 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2589 else 2590 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2591 if_link_state_change(sc->hn_ifp, 2592 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2593 LINK_STATE_UP : LINK_STATE_DOWN); 2594 } 2595 2596 static void 2597 hn_link_taskfunc(void *xsc, int pending __unused) 2598 { 2599 struct hn_softc *sc = xsc; 2600 2601 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2602 return; 2603 hn_link_status(sc); 2604 } 2605 2606 static void 2607 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2608 { 2609 struct hn_softc *sc = xsc; 2610 2611 /* Prevent any link status checks from running. */ 2612 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2613 2614 /* 2615 * Fake up a [link down --> link up] state change; 5 seconds 2616 * delay is used, which closely simulates miibus reaction 2617 * upon link down event. 
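 * The flap nudges upper layers, e.g. dhclient, to re-acquire their
 * configuration on the new network.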
2618 */ 2619 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2620 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2621 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2622 &sc->hn_netchg_status, 5 * hz); 2623 } 2624 2625 static void 2626 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2627 { 2628 struct hn_softc *sc = xsc; 2629 2630 /* Re-allow link status checks. */ 2631 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2632 hn_link_status(sc); 2633 } 2634 2635 static void 2636 hn_update_link_status(struct hn_softc *sc) 2637 { 2638 2639 if (sc->hn_mgmt_taskq != NULL) 2640 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2641 } 2642 2643 static void 2644 hn_change_network(struct hn_softc *sc) 2645 { 2646 2647 if (sc->hn_mgmt_taskq != NULL) 2648 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2649 } 2650 2651 static __inline int 2652 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2653 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2654 { 2655 struct mbuf *m = *m_head; 2656 int error; 2657 2658 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2659 2660 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2661 m, segs, nsegs, BUS_DMA_NOWAIT); 2662 if (error == EFBIG) { 2663 struct mbuf *m_new; 2664 2665 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2666 if (m_new == NULL) 2667 return ENOBUFS; 2668 else 2669 *m_head = m = m_new; 2670 txr->hn_tx_collapsed++; 2671 2672 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2673 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2674 } 2675 if (!error) { 2676 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2677 BUS_DMASYNC_PREWRITE); 2678 txd->flags |= HN_TXD_FLAG_DMAMAP; 2679 } 2680 return error; 2681 } 2682 2683 static __inline int 2684 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2685 { 2686 2687 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2688 ("put an onlist txd %#x", txd->flags)); 2689 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2690 ("put an onagg txd %#x", txd->flags)); 2691 2692 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2693 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2694 return 0; 2695 2696 if (!STAILQ_EMPTY(&txd->agg_list)) { 2697 struct hn_txdesc *tmp_txd; 2698 2699 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2700 int freed __diagused; 2701 2702 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2703 ("resursive aggregation on aggregated txdesc")); 2704 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2705 ("not aggregated txdesc")); 2706 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2707 ("aggregated txdesc uses dmamap")); 2708 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2709 ("aggregated txdesc consumes " 2710 "chimney sending buffer")); 2711 KASSERT(tmp_txd->chim_size == 0, 2712 ("aggregated txdesc has non-zero " 2713 "chimney sending size")); 2714 2715 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2716 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2717 freed = hn_txdesc_put(txr, tmp_txd); 2718 KASSERT(freed, ("failed to free aggregated txdesc")); 2719 } 2720 } 2721 2722 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2723 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2724 ("chim txd uses dmamap")); 2725 hn_chim_free(txr->hn_sc, txd->chim_index); 2726 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2727 txd->chim_size = 0; 2728 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2729 bus_dmamap_sync(txr->hn_tx_data_dtag, 2730 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2731 
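/* The device is done with the mbuf data; unload its DMA map. */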
bus_dmamap_unload(txr->hn_tx_data_dtag, 2732 txd->data_dmap); 2733 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2734 } 2735 2736 if (txd->m != NULL) { 2737 m_freem(txd->m); 2738 txd->m = NULL; 2739 } 2740 2741 txd->flags |= HN_TXD_FLAG_ONLIST; 2742 #ifndef HN_USE_TXDESC_BUFRING 2743 mtx_lock_spin(&txr->hn_txlist_spin); 2744 KASSERT(txr->hn_txdesc_avail >= 0 && 2745 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2746 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2747 txr->hn_txdesc_avail++; 2748 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2749 mtx_unlock_spin(&txr->hn_txlist_spin); 2750 #else /* HN_USE_TXDESC_BUFRING */ 2751 #ifdef HN_DEBUG 2752 atomic_add_int(&txr->hn_txdesc_avail, 1); 2753 #endif 2754 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2755 #endif /* !HN_USE_TXDESC_BUFRING */ 2756 2757 return 1; 2758 } 2759 2760 static __inline struct hn_txdesc * 2761 hn_txdesc_get(struct hn_tx_ring *txr) 2762 { 2763 struct hn_txdesc *txd; 2764 2765 #ifndef HN_USE_TXDESC_BUFRING 2766 mtx_lock_spin(&txr->hn_txlist_spin); 2767 txd = SLIST_FIRST(&txr->hn_txlist); 2768 if (txd != NULL) { 2769 KASSERT(txr->hn_txdesc_avail > 0, 2770 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2771 txr->hn_txdesc_avail--; 2772 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2773 } 2774 mtx_unlock_spin(&txr->hn_txlist_spin); 2775 #else 2776 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2777 #endif 2778 2779 if (txd != NULL) { 2780 #ifdef HN_USE_TXDESC_BUFRING 2781 #ifdef HN_DEBUG 2782 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2783 #endif 2784 #endif /* HN_USE_TXDESC_BUFRING */ 2785 KASSERT(txd->m == NULL && txd->refs == 0 && 2786 STAILQ_EMPTY(&txd->agg_list) && 2787 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2788 txd->chim_size == 0 && 2789 (txd->flags & HN_TXD_FLAG_ONLIST) && 2790 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2791 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2792 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2793 txd->refs = 1; 2794 } 2795 return txd; 2796 } 2797 2798 static __inline void 2799 hn_txdesc_hold(struct hn_txdesc *txd) 2800 { 2801 2802 /* 0->1 transition will never work */ 2803 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2804 atomic_add_int(&txd->refs, 1); 2805 } 2806 2807 static __inline void 2808 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2809 { 2810 2811 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2812 ("recursive aggregation on aggregating txdesc")); 2813 2814 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2815 ("already aggregated")); 2816 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2817 ("recursive aggregation on to-be-aggregated txdesc")); 2818 2819 txd->flags |= HN_TXD_FLAG_ONAGG; 2820 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2821 } 2822 2823 static bool 2824 hn_tx_ring_pending(struct hn_tx_ring *txr) 2825 { 2826 bool pending = false; 2827 2828 #ifndef HN_USE_TXDESC_BUFRING 2829 mtx_lock_spin(&txr->hn_txlist_spin); 2830 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2831 pending = true; 2832 mtx_unlock_spin(&txr->hn_txlist_spin); 2833 #else 2834 if (!buf_ring_full(txr->hn_txdesc_br)) 2835 pending = true; 2836 #endif 2837 return (pending); 2838 } 2839 2840 static __inline void 2841 hn_txeof(struct hn_tx_ring *txr) 2842 { 2843 txr->hn_has_txeof = 0; 2844 txr->hn_txeof(txr); 2845 } 2846 2847 static void 2848 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2849 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2850 { 2851 struct hn_txdesc *txd = sndc->hn_cbarg; 2852 struct 
hn_tx_ring *txr; 2853 2854 txr = txd->txr; 2855 KASSERT(txr->hn_chan == chan, 2856 ("channel mismatch, on chan%u, should be chan%u", 2857 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2858 2859 txr->hn_has_txeof = 1; 2860 hn_txdesc_put(txr, txd); 2861 2862 ++txr->hn_txdone_cnt; 2863 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2864 txr->hn_txdone_cnt = 0; 2865 if (txr->hn_oactive) 2866 hn_txeof(txr); 2867 } 2868 } 2869 2870 static void 2871 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2872 { 2873 #if defined(INET) || defined(INET6) 2874 struct epoch_tracker et; 2875 2876 NET_EPOCH_ENTER(et); 2877 tcp_lro_flush_all(&rxr->hn_lro); 2878 NET_EPOCH_EXIT(et); 2879 #endif 2880 2881 /* 2882 * NOTE: 2883 * 'txr' could be NULL, if multiple channels and 2884 * ifnet.if_start method are enabled. 2885 */ 2886 if (txr == NULL || !txr->hn_has_txeof) 2887 return; 2888 2889 txr->hn_txdone_cnt = 0; 2890 hn_txeof(txr); 2891 } 2892 2893 static __inline uint32_t 2894 hn_rndis_pktmsg_offset(uint32_t ofs) 2895 { 2896 2897 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2898 ("invalid RNDIS packet msg offset %u", ofs)); 2899 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2900 } 2901 2902 static __inline void * 2903 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2904 size_t pi_dlen, uint32_t pi_type) 2905 { 2906 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2907 struct rndis_pktinfo *pi; 2908 2909 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2910 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2911 2912 /* 2913 * Per-packet-info does not move; it only grows. 2914 * 2915 * NOTE: 2916 * rm_pktinfooffset in this phase counts from the beginning 2917 * of rndis_packet_msg. 2918 */ 2919 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2920 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2921 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2922 pkt->rm_pktinfolen); 2923 pkt->rm_pktinfolen += pi_size; 2924 2925 pi->rm_size = pi_size; 2926 pi->rm_type = pi_type; 2927 pi->rm_internal = 0; 2928 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2929 2930 return (pi->rm_data); 2931 } 2932 2933 static __inline int 2934 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) 2935 { 2936 struct hn_txdesc *txd; 2937 struct mbuf *m; 2938 int error, pkts; 2939 2940 txd = txr->hn_agg_txd; 2941 KASSERT(txd != NULL, ("no aggregate txdesc")); 2942 2943 /* 2944 * Since hn_txpkt() will reset this temporary stat, save 2945 * it now, so that oerrors can be updated properly, if 2946 * hn_txpkt() ever fails. 2947 */ 2948 pkts = txr->hn_stat_pkts; 2949 2950 /* 2951 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2952 * failure, save it for later freeing, if hn_txpkt() ever 2953 * fails. 2954 */ 2955 m = txd->m; 2956 error = hn_txpkt(ifp, txr, txd); 2957 if (__predict_false(error)) { 2958 /* txd is freed, but m is not. */ 2959 m_freem(m); 2960 2961 txr->hn_flush_failed++; 2962 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2963 } 2964 2965 /* Reset all aggregation states. 
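 * The next packet will start a fresh aggregation batch.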
*/ 2966 txr->hn_agg_txd = NULL; 2967 txr->hn_agg_szleft = 0; 2968 txr->hn_agg_pktleft = 0; 2969 txr->hn_agg_prevpkt = NULL; 2970 2971 return (error); 2972 } 2973 2974 static void * 2975 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2976 int pktsize) 2977 { 2978 void *chim; 2979 2980 if (txr->hn_agg_txd != NULL) { 2981 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2982 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2983 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2984 int olen; 2985 2986 /* 2987 * Update the previous RNDIS packet's total length, 2988 * it can be increased due to the mandatory alignment 2989 * padding for this RNDIS packet. And update the 2990 * aggregating txdesc's chimney sending buffer size 2991 * accordingly. 2992 * 2993 * XXX 2994 * Zero-out the padding, as required by the RNDIS spec. 2995 */ 2996 olen = pkt->rm_len; 2997 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2998 agg_txd->chim_size += pkt->rm_len - olen; 2999 3000 /* Link this txdesc to the parent. */ 3001 hn_txdesc_agg(agg_txd, txd); 3002 3003 chim = (uint8_t *)pkt + pkt->rm_len; 3004 /* Save the current packet for later fixup. */ 3005 txr->hn_agg_prevpkt = chim; 3006 3007 txr->hn_agg_pktleft--; 3008 txr->hn_agg_szleft -= pktsize; 3009 if (txr->hn_agg_szleft <= 3010 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3011 /* 3012 * Probably can't aggregate more packets, 3013 * flush this aggregating txdesc proactively. 3014 */ 3015 txr->hn_agg_pktleft = 0; 3016 } 3017 /* Done! */ 3018 return (chim); 3019 } 3020 hn_flush_txagg(ifp, txr); 3021 } 3022 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3023 3024 txr->hn_tx_chimney_tried++; 3025 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3026 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3027 return (NULL); 3028 txr->hn_tx_chimney++; 3029 3030 chim = txr->hn_sc->hn_chim + 3031 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3032 3033 if (txr->hn_agg_pktmax > 1 && 3034 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3035 txr->hn_agg_txd = txd; 3036 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3037 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3038 txr->hn_agg_prevpkt = chim; 3039 } 3040 return (chim); 3041 } 3042 3043 /* 3044 * NOTE: 3045 * If this function fails, then both txd and m_head0 will be freed. 3046 */ 3047 static int 3048 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3049 struct mbuf **m_head0) 3050 { 3051 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3052 int error, nsegs, i; 3053 struct mbuf *m_head = *m_head0; 3054 struct rndis_packet_msg *pkt; 3055 uint32_t *pi_data; 3056 void *chim = NULL; 3057 int pkt_hlen, pkt_size; 3058 3059 pkt = txd->rndis_pkt; 3060 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3061 if (pkt_size < txr->hn_chim_size) { 3062 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3063 if (chim != NULL) 3064 pkt = chim; 3065 } else { 3066 if (txr->hn_agg_txd != NULL) 3067 hn_flush_txagg(ifp, txr); 3068 } 3069 3070 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3071 pkt->rm_len = m_head->m_pkthdr.len; 3072 pkt->rm_dataoffset = 0; 3073 pkt->rm_datalen = m_head->m_pkthdr.len; 3074 pkt->rm_oobdataoffset = 0; 3075 pkt->rm_oobdatalen = 0; 3076 pkt->rm_oobdataelements = 0; 3077 pkt->rm_pktinfooffset = sizeof(*pkt); 3078 pkt->rm_pktinfolen = 0; 3079 pkt->rm_vchandle = 0; 3080 pkt->rm_reserved = 0; 3081 3082 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3083 /* 3084 * Set the hash value for this packet. 
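 * It is carried in an RNDIS HASHVAL per-packet-info field.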
3085 */ 3086 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3087 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3088 3089 if (M_HASHTYPE_ISHASH(m_head)) 3090 /* 3091 * The flowid field contains the hash value host 3092 * set in the rx queue if it is a ip forwarding pkt. 3093 * Set the same hash value so host can send on the 3094 * cpu it was received. 3095 */ 3096 *pi_data = m_head->m_pkthdr.flowid; 3097 else 3098 /* 3099 * Otherwise just put the tx queue index. 3100 */ 3101 *pi_data = txr->hn_tx_idx; 3102 } 3103 3104 if (m_head->m_flags & M_VLANTAG) { 3105 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3106 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3107 *pi_data = NDIS_VLAN_INFO_MAKE( 3108 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3109 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3110 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3111 } 3112 3113 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3114 #if defined(INET6) || defined(INET) 3115 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3116 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3117 #ifdef INET 3118 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3119 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3120 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3121 m_head->m_pkthdr.tso_segsz); 3122 } 3123 #endif 3124 #if defined(INET6) && defined(INET) 3125 else 3126 #endif 3127 #ifdef INET6 3128 { 3129 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3130 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3131 m_head->m_pkthdr.tso_segsz); 3132 } 3133 #endif 3134 #endif /* INET6 || INET */ 3135 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3136 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3137 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3138 if (m_head->m_pkthdr.csum_flags & 3139 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3140 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3141 } else { 3142 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3143 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3144 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3145 } 3146 3147 if (m_head->m_pkthdr.csum_flags & 3148 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3149 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3150 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3151 } else if (m_head->m_pkthdr.csum_flags & 3152 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3153 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3154 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3155 } 3156 } 3157 3158 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3159 /* Fixup RNDIS packet message total length */ 3160 pkt->rm_len += pkt_hlen; 3161 /* Convert RNDIS packet message offsets */ 3162 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3163 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3164 3165 /* 3166 * Fast path: Chimney sending. 
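 * Both the RNDIS packet message and the payload are copied into the
 * pre-posted chimney buffer, so no GPA (page) list is needed.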
3167 */ 3168 if (chim != NULL) { 3169 struct hn_txdesc *tgt_txd = txd; 3170 3171 if (txr->hn_agg_txd != NULL) { 3172 tgt_txd = txr->hn_agg_txd; 3173 #ifdef INVARIANTS 3174 *m_head0 = NULL; 3175 #endif 3176 } 3177 3178 KASSERT(pkt == chim, 3179 ("RNDIS pkt not in chimney sending buffer")); 3180 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3181 ("chimney sending buffer is not used")); 3182 tgt_txd->chim_size += pkt->rm_len; 3183 3184 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3185 ((uint8_t *)chim) + pkt_hlen); 3186 3187 txr->hn_gpa_cnt = 0; 3188 txr->hn_sendpkt = hn_txpkt_chim; 3189 goto done; 3190 } 3191 3192 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3193 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3194 ("chimney buffer is used")); 3195 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3196 3197 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3198 if (__predict_false(error)) { 3199 int freed __diagused; 3200 3201 /* 3202 * This mbuf is not linked w/ the txd yet, so free it now. 3203 */ 3204 m_freem(m_head); 3205 *m_head0 = NULL; 3206 3207 freed = hn_txdesc_put(txr, txd); 3208 KASSERT(freed != 0, 3209 ("fail to free txd upon txdma error")); 3210 3211 txr->hn_txdma_failed++; 3212 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3213 return error; 3214 } 3215 *m_head0 = m_head; 3216 3217 /* +1 RNDIS packet message */ 3218 txr->hn_gpa_cnt = nsegs + 1; 3219 3220 /* send packet with page buffer */ 3221 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3222 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3223 txr->hn_gpa[0].gpa_len = pkt_hlen; 3224 3225 /* 3226 * Fill the page buffers with mbuf info after the page 3227 * buffer for RNDIS packet message. 3228 */ 3229 for (i = 0; i < nsegs; ++i) { 3230 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3231 3232 gpa->gpa_page = atop(segs[i].ds_addr); 3233 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3234 gpa->gpa_len = segs[i].ds_len; 3235 } 3236 3237 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3238 txd->chim_size = 0; 3239 txr->hn_sendpkt = hn_txpkt_sglist; 3240 done: 3241 txd->m = m_head; 3242 3243 /* Set the completion routine */ 3244 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3245 3246 /* Update temporary stats for later use. */ 3247 txr->hn_stat_pkts++; 3248 txr->hn_stat_size += m_head->m_pkthdr.len; 3249 if (m_head->m_flags & M_MCAST) 3250 txr->hn_stat_mcasts++; 3251 3252 return 0; 3253 } 3254 3255 /* 3256 * NOTE: 3257 * If this function fails, then txd will be freed, but the mbuf 3258 * associated w/ the txd will _not_ be freed. 3259 */ 3260 static int 3261 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3262 { 3263 int error, send_failed = 0, has_bpf; 3264 3265 again: 3266 has_bpf = bpf_peers_present_if(ifp); 3267 if (has_bpf) { 3268 /* 3269 * Make sure that this txd and any aggregated txds are not 3270 * freed before ETHER_BPF_MTAP. 
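 * A temporary reference is taken here and dropped right after the tap.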
3271 */ 3272 hn_txdesc_hold(txd); 3273 } 3274 error = txr->hn_sendpkt(txr, txd); 3275 if (!error) { 3276 if (has_bpf) { 3277 const struct hn_txdesc *tmp_txd; 3278 3279 ETHER_BPF_MTAP(ifp, txd->m); 3280 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3281 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3282 } 3283 3284 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3285 #ifdef HN_IFSTART_SUPPORT 3286 if (!hn_use_if_start) 3287 #endif 3288 { 3289 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3290 txr->hn_stat_size); 3291 if (txr->hn_stat_mcasts != 0) { 3292 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3293 txr->hn_stat_mcasts); 3294 } 3295 } 3296 txr->hn_pkts += txr->hn_stat_pkts; 3297 txr->hn_sends++; 3298 } 3299 if (has_bpf) 3300 hn_txdesc_put(txr, txd); 3301 3302 if (__predict_false(error)) { 3303 int freed __diagused; 3304 3305 /* 3306 * This should "really rarely" happen. 3307 * 3308 * XXX Too many RX to be acked or too many sideband 3309 * commands to run? Ask netvsc_channel_rollup() 3310 * to kick start later. 3311 */ 3312 txr->hn_has_txeof = 1; 3313 if (!send_failed) { 3314 txr->hn_send_failed++; 3315 send_failed = 1; 3316 /* 3317 * Try sending again after set hn_has_txeof; 3318 * in case that we missed the last 3319 * netvsc_channel_rollup(). 3320 */ 3321 goto again; 3322 } 3323 if_printf(ifp, "send failed\n"); 3324 3325 /* 3326 * Caller will perform further processing on the 3327 * associated mbuf, so don't free it in hn_txdesc_put(); 3328 * only unload it from the DMA map in hn_txdesc_put(), 3329 * if it was loaded. 3330 */ 3331 txd->m = NULL; 3332 freed = hn_txdesc_put(txr, txd); 3333 KASSERT(freed != 0, 3334 ("fail to free txd upon send error")); 3335 3336 txr->hn_send_failed++; 3337 } 3338 3339 /* Reset temporary stats, after this sending is done. */ 3340 txr->hn_stat_size = 0; 3341 txr->hn_stat_pkts = 0; 3342 txr->hn_stat_mcasts = 0; 3343 3344 return (error); 3345 } 3346 3347 /* 3348 * Append the specified data to the indicated mbuf chain, 3349 * Extend the mbuf chain if the new data does not fit in 3350 * existing space. 3351 * 3352 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3353 * There should be an equivalent in the kernel mbuf code, 3354 * but there does not appear to be one yet. 3355 * 3356 * Differs from m_append() in that additional mbufs are 3357 * allocated with cluster size MJUMPAGESIZE, and filled 3358 * accordingly. 3359 * 3360 * Return the last mbuf in the chain or NULL if failed to 3361 * allocate new mbuf. 3362 */ 3363 static struct mbuf * 3364 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3365 { 3366 struct mbuf *m, *n; 3367 int remainder, space; 3368 3369 for (m = m0; m->m_next != NULL; m = m->m_next) 3370 ; 3371 remainder = len; 3372 space = M_TRAILINGSPACE(m); 3373 if (space > 0) { 3374 /* 3375 * Copy into available space. 3376 */ 3377 if (space > remainder) 3378 space = remainder; 3379 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3380 m->m_len += space; 3381 cp += space; 3382 remainder -= space; 3383 } 3384 while (remainder > 0) { 3385 /* 3386 * Allocate a new mbuf; could check space 3387 * and allocate a cluster instead. 
3388 */ 3389 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3390 if (n == NULL) 3391 return NULL; 3392 n->m_len = min(MJUMPAGESIZE, remainder); 3393 bcopy(cp, mtod(n, caddr_t), n->m_len); 3394 cp += n->m_len; 3395 remainder -= n->m_len; 3396 m->m_next = n; 3397 m = n; 3398 } 3399 3400 return m; 3401 } 3402 3403 #if defined(INET) || defined(INET6) 3404 static __inline int 3405 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3406 { 3407 if (hn_lro_mbufq_depth) { 3408 tcp_lro_queue_mbuf(lc, m); 3409 return 0; 3410 } 3411 return tcp_lro_rx(lc, m, 0); 3412 } 3413 #endif 3414 3415 static int 3416 hn_rxpkt(struct hn_rx_ring *rxr) 3417 { 3418 if_t ifp, hn_ifp = rxr->hn_ifp; 3419 struct mbuf *m_new, *n; 3420 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3421 int hash_type = M_HASHTYPE_NONE; 3422 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3423 int i; 3424 3425 ifp = hn_ifp; 3426 if (rxr->hn_rxvf_ifp != NULL) { 3427 /* 3428 * Non-transparent mode VF; pretend this packet is from 3429 * the VF. 3430 */ 3431 ifp = rxr->hn_rxvf_ifp; 3432 is_vf = 1; 3433 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3434 /* Transparent mode VF. */ 3435 is_vf = 1; 3436 } 3437 3438 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { 3439 /* 3440 * NOTE: 3441 * See the NOTE of hn_rndis_init_fixat(). This 3442 * function can be reached, immediately after the 3443 * RNDIS is initialized but before the ifnet is 3444 * setup on the hn_attach() path; drop the unexpected 3445 * packets. 3446 */ 3447 return (0); 3448 } 3449 3450 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3451 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3452 return (0); 3453 } 3454 3455 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3456 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3457 if (m_new == NULL) { 3458 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3459 return (0); 3460 } 3461 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3462 rxr->rsc.frag_len[0]); 3463 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3464 } else { 3465 /* 3466 * Get an mbuf with a cluster. For packets 2K or less, 3467 * get a standard 2K cluster. For anything larger, get a 3468 * 4K cluster. Any buffers larger than 4K can cause problems 3469 * if looped around to the Hyper-V TX channel, so avoid them. 
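 * (MJUMPAGESIZE keeps each receive cluster within a single page.)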
3470 */ 3471 size = MCLBYTES; 3472 if (rxr->rsc.pktlen > MCLBYTES) { 3473 /* 4096 */ 3474 size = MJUMPAGESIZE; 3475 } 3476 3477 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3478 if (m_new == NULL) { 3479 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3480 return (0); 3481 } 3482 3483 n = m_new; 3484 for (i = 0; i < rxr->rsc.cnt; i++) { 3485 n = hv_m_append(n, rxr->rsc.frag_len[i], 3486 rxr->rsc.frag_data[i]); 3487 if (n == NULL) { 3488 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3489 return (0); 3490 } else { 3491 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3492 } 3493 } 3494 } 3495 if (rxr->rsc.pktlen <= MHLEN) 3496 rxr->hn_small_pkts++; 3497 3498 m_new->m_pkthdr.rcvif = ifp; 3499 3500 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) 3501 do_csum = 0; 3502 3503 /* receive side checksum offload */ 3504 if (rxr->rsc.csum_info != NULL) { 3505 /* IP csum offload */ 3506 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3507 m_new->m_pkthdr.csum_flags |= 3508 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3509 rxr->hn_csum_ip++; 3510 } 3511 3512 /* TCP/UDP csum offload */ 3513 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3514 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3515 m_new->m_pkthdr.csum_flags |= 3516 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3517 m_new->m_pkthdr.csum_data = 0xffff; 3518 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3519 rxr->hn_csum_tcp++; 3520 else 3521 rxr->hn_csum_udp++; 3522 } 3523 3524 /* 3525 * XXX 3526 * As of this write (Oct 28th, 2016), host side will turn 3527 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3528 * the do_lro setting here is actually _not_ accurate. We 3529 * depend on the RSS hash type check to reset do_lro. 3530 */ 3531 if ((*(rxr->rsc.csum_info) & 3532 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3533 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3534 do_lro = 1; 3535 } else { 3536 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3537 if (l3proto == ETHERTYPE_IP) { 3538 if (l4proto == IPPROTO_TCP) { 3539 if (do_csum && 3540 (rxr->hn_trust_hcsum & 3541 HN_TRUST_HCSUM_TCP)) { 3542 rxr->hn_csum_trusted++; 3543 m_new->m_pkthdr.csum_flags |= 3544 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3545 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3546 m_new->m_pkthdr.csum_data = 0xffff; 3547 } 3548 do_lro = 1; 3549 } else if (l4proto == IPPROTO_UDP) { 3550 if (do_csum && 3551 (rxr->hn_trust_hcsum & 3552 HN_TRUST_HCSUM_UDP)) { 3553 rxr->hn_csum_trusted++; 3554 m_new->m_pkthdr.csum_flags |= 3555 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3556 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3557 m_new->m_pkthdr.csum_data = 0xffff; 3558 } 3559 } else if (l4proto != IPPROTO_DONE && do_csum && 3560 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3561 rxr->hn_csum_trusted++; 3562 m_new->m_pkthdr.csum_flags |= 3563 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3564 } 3565 } 3566 } 3567 3568 if (rxr->rsc.vlan_info != NULL) { 3569 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3570 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3571 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3572 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3573 m_new->m_flags |= M_VLANTAG; 3574 } 3575 3576 /* 3577 * If VF is activated (tranparent/non-transparent mode does not 3578 * matter here). 3579 * 3580 * - Disable LRO 3581 * 3582 * hn(4) will only receive broadcast packets, multicast packets, 3583 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3584 * packet types. 
3585 * 3586 * For non-transparent, we definitely _cannot_ enable LRO at 3587 * all, since the LRO flush will use hn(4) as the receiving 3588 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3589 */ 3590 if (is_vf) 3591 do_lro = 0; 3592 3593 /* 3594 * If VF is activated (tranparent/non-transparent mode does not 3595 * matter here), do _not_ mess with unsupported hash types or 3596 * functions. 3597 */ 3598 if (rxr->rsc.hash_info != NULL) { 3599 rxr->hn_rss_pkts++; 3600 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3601 if (!is_vf) 3602 hash_type = M_HASHTYPE_OPAQUE_HASH; 3603 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3604 NDIS_HASH_FUNCTION_TOEPLITZ) { 3605 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3606 rxr->hn_mbuf_hash); 3607 3608 /* 3609 * NOTE: 3610 * do_lro is resetted, if the hash types are not TCP 3611 * related. See the comment in the above csum_flags 3612 * setup section. 3613 */ 3614 switch (type) { 3615 case NDIS_HASH_IPV4: 3616 hash_type = M_HASHTYPE_RSS_IPV4; 3617 do_lro = 0; 3618 break; 3619 3620 case NDIS_HASH_TCP_IPV4: 3621 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3622 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3623 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3624 3625 if (is_vf) 3626 def_htype = M_HASHTYPE_NONE; 3627 3628 /* 3629 * UDP 4-tuple hash is delivered as 3630 * TCP 4-tuple hash. 3631 */ 3632 if (l3proto == ETHERTYPE_MAX) { 3633 hn_rxpkt_proto(m_new, 3634 &l3proto, &l4proto); 3635 } 3636 if (l3proto == ETHERTYPE_IP) { 3637 if (l4proto == IPPROTO_UDP && 3638 (rxr->hn_mbuf_hash & 3639 NDIS_HASH_UDP_IPV4_X)) { 3640 hash_type = 3641 M_HASHTYPE_RSS_UDP_IPV4; 3642 do_lro = 0; 3643 } else if (l4proto != 3644 IPPROTO_TCP) { 3645 hash_type = def_htype; 3646 do_lro = 0; 3647 } 3648 } else { 3649 hash_type = def_htype; 3650 do_lro = 0; 3651 } 3652 } 3653 break; 3654 3655 case NDIS_HASH_IPV6: 3656 hash_type = M_HASHTYPE_RSS_IPV6; 3657 do_lro = 0; 3658 break; 3659 3660 case NDIS_HASH_IPV6_EX: 3661 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3662 do_lro = 0; 3663 break; 3664 3665 case NDIS_HASH_TCP_IPV6: 3666 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3667 break; 3668 3669 case NDIS_HASH_TCP_IPV6_EX: 3670 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3671 break; 3672 } 3673 } 3674 } else if (!is_vf) { 3675 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3676 hash_type = M_HASHTYPE_OPAQUE; 3677 } 3678 M_HASHTYPE_SET(m_new, hash_type); 3679 3680 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3681 if (hn_ifp != ifp) { 3682 const struct ether_header *eh; 3683 3684 /* 3685 * Non-transparent mode VF is activated. 3686 */ 3687 3688 /* 3689 * Allow tapping on hn(4). 3690 */ 3691 ETHER_BPF_MTAP(hn_ifp, m_new); 3692 3693 /* 3694 * Update hn(4)'s stats. 3695 */ 3696 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3697 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3698 /* Checked at the beginning of this function. */ 3699 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3700 eh = mtod(m_new, struct ether_header *); 3701 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3702 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3703 } 3704 rxr->hn_pkts++; 3705 3706 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { 3707 #if defined(INET) || defined(INET6) 3708 struct lro_ctrl *lro = &rxr->hn_lro; 3709 3710 if (lro->lro_cnt) { 3711 rxr->hn_lro_tried++; 3712 if (hn_lro_rx(lro, m_new) == 0) { 3713 /* DONE! 
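 * The LRO code now owns the mbuf.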
*/ 3714 return 0; 3715 } 3716 } 3717 #endif 3718 } 3719 if_input(ifp, m_new); 3720 3721 return (0); 3722 } 3723 3724 static int 3725 hn_ioctl(if_t ifp, u_long cmd, caddr_t data) 3726 { 3727 struct hn_softc *sc = if_getsoftc(ifp); 3728 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3729 if_t vf_ifp; 3730 int mask, error = 0; 3731 struct ifrsskey *ifrk; 3732 struct ifrsshash *ifrh; 3733 uint32_t mtu; 3734 3735 switch (cmd) { 3736 case SIOCSIFMTU: 3737 if (ifr->ifr_mtu > HN_MTU_MAX) { 3738 error = EINVAL; 3739 break; 3740 } 3741 3742 HN_LOCK(sc); 3743 3744 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3745 HN_UNLOCK(sc); 3746 break; 3747 } 3748 3749 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3750 /* Can't change MTU */ 3751 HN_UNLOCK(sc); 3752 error = EOPNOTSUPP; 3753 break; 3754 } 3755 3756 if (if_getmtu(ifp) == ifr->ifr_mtu) { 3757 HN_UNLOCK(sc); 3758 break; 3759 } 3760 3761 if (hn_xpnt_vf_isready(sc)) { 3762 vf_ifp = sc->hn_vf_ifp; 3763 ifr_vf = *ifr; 3764 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), 3765 sizeof(ifr_vf.ifr_name)); 3766 error = ifhwioctl(SIOCSIFMTU,vf_ifp, 3767 (caddr_t)&ifr_vf, curthread); 3768 if (error) { 3769 HN_UNLOCK(sc); 3770 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3771 if_name(vf_ifp), ifr->ifr_mtu, error); 3772 break; 3773 } 3774 } 3775 3776 /* 3777 * Suspend this interface before the synthetic parts 3778 * are ripped. 3779 */ 3780 hn_suspend(sc); 3781 3782 /* 3783 * Detach the synthetics parts, i.e. NVS and RNDIS. 3784 */ 3785 hn_synth_detach(sc); 3786 3787 /* 3788 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3789 * with the new MTU setting. 3790 */ 3791 error = hn_synth_attach(sc, ifr->ifr_mtu); 3792 if (error) { 3793 HN_UNLOCK(sc); 3794 break; 3795 } 3796 3797 error = hn_rndis_get_mtu(sc, &mtu); 3798 if (error) 3799 mtu = ifr->ifr_mtu; 3800 else if (bootverbose) 3801 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3802 3803 /* 3804 * Commit the requested MTU, after the synthetic parts 3805 * have been successfully attached. 3806 */ 3807 if (mtu >= ifr->ifr_mtu) { 3808 mtu = ifr->ifr_mtu; 3809 } else { 3810 if_printf(ifp, "fixup mtu %d -> %u\n", 3811 ifr->ifr_mtu, mtu); 3812 } 3813 if_setmtu(ifp, mtu); 3814 3815 /* 3816 * Synthetic parts' reattach may change the chimney 3817 * sending size; update it. 3818 */ 3819 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3820 hn_set_chim_size(sc, sc->hn_chim_szmax); 3821 3822 /* 3823 * Make sure that various parameters based on MTU are 3824 * still valid, after the MTU change. 3825 */ 3826 hn_mtu_change_fixup(sc); 3827 3828 /* 3829 * All done! Resume the interface now. 3830 */ 3831 hn_resume(sc); 3832 3833 if ((sc->hn_flags & HN_FLAG_RXVF) || 3834 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3835 /* 3836 * Since we have reattached the NVS part, 3837 * change the datapath to VF again; in case 3838 * that it is lost, after the NVS was detached. 3839 */ 3840 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3841 } 3842 3843 HN_UNLOCK(sc); 3844 break; 3845 3846 case SIOCSIFFLAGS: 3847 HN_LOCK(sc); 3848 3849 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3850 HN_UNLOCK(sc); 3851 break; 3852 } 3853 3854 if (hn_xpnt_vf_isready(sc)) 3855 hn_xpnt_vf_saveifflags(sc); 3856 3857 if (if_getflags(ifp) & IFF_UP) { 3858 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3859 /* 3860 * Caller meight hold mutex, e.g. 3861 * bpf; use busy-wait for the RNDIS 3862 * reply. 
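 * HN_NO_SLEEPING below keeps the RNDIS request path from sleeping.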
3863 */ 3864 HN_NO_SLEEPING(sc); 3865 hn_rxfilter_config(sc); 3866 HN_SLEEPING_OK(sc); 3867 3868 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3869 error = hn_xpnt_vf_iocsetflags(sc); 3870 } else { 3871 hn_init_locked(sc); 3872 } 3873 } else { 3874 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 3875 hn_stop(sc, false); 3876 } 3877 sc->hn_if_flags = if_getflags(ifp); 3878 3879 HN_UNLOCK(sc); 3880 break; 3881 3882 case SIOCSIFCAP: 3883 HN_LOCK(sc); 3884 3885 if (hn_xpnt_vf_isready(sc)) { 3886 ifr_vf = *ifr; 3887 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), 3888 sizeof(ifr_vf.ifr_name)); 3889 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3890 HN_UNLOCK(sc); 3891 break; 3892 } 3893 3894 /* 3895 * Fix up requested capabilities w/ supported capabilities, 3896 * since the supported capabilities could have been changed. 3897 */ 3898 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ 3899 if_getcapenable(ifp); 3900 3901 if (mask & IFCAP_TXCSUM) { 3902 if_togglecapenable(ifp, IFCAP_TXCSUM); 3903 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 3904 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); 3905 else 3906 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); 3907 } 3908 if (mask & IFCAP_TXCSUM_IPV6) { 3909 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); 3910 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 3911 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); 3912 else 3913 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); 3914 } 3915 3916 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3917 if (mask & IFCAP_RXCSUM) 3918 if_togglecapenable(ifp, IFCAP_RXCSUM); 3919 #ifdef foo 3920 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3921 if (mask & IFCAP_RXCSUM_IPV6) 3922 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); 3923 #endif 3924 3925 if (mask & IFCAP_LRO) 3926 if_togglecapenable(ifp, IFCAP_LRO); 3927 3928 if (mask & IFCAP_TSO4) { 3929 if_togglecapenable(ifp, IFCAP_TSO4); 3930 if (if_getcapenable(ifp) & IFCAP_TSO4) 3931 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 3932 else 3933 if_sethwassistbits(ifp, 0, CSUM_IP_TSO); 3934 } 3935 if (mask & IFCAP_TSO6) { 3936 if_togglecapenable(ifp, IFCAP_TSO6); 3937 if (if_getcapenable(ifp) & IFCAP_TSO6) 3938 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 3939 else 3940 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); 3941 } 3942 3943 HN_UNLOCK(sc); 3944 break; 3945 3946 case SIOCADDMULTI: 3947 case SIOCDELMULTI: 3948 HN_LOCK(sc); 3949 3950 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3951 HN_UNLOCK(sc); 3952 break; 3953 } 3954 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3955 /* 3956 * Multicast uses mutex; use busy-wait for 3957 * the RNDIS reply. 3958 */ 3959 HN_NO_SLEEPING(sc); 3960 hn_rxfilter_config(sc); 3961 HN_SLEEPING_OK(sc); 3962 } 3963 3964 /* XXX vlan(4) style mcast addr maintenance */ 3965 if (hn_xpnt_vf_isready(sc)) { 3966 int old_if_flags; 3967 3968 old_if_flags = if_getflags(sc->hn_vf_ifp); 3969 hn_xpnt_vf_saveifflags(sc); 3970 3971 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3972 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & 3973 IFF_ALLMULTI)) 3974 error = hn_xpnt_vf_iocsetflags(sc); 3975 } 3976 3977 HN_UNLOCK(sc); 3978 break; 3979 3980 case SIOCSIFMEDIA: 3981 case SIOCGIFMEDIA: 3982 HN_LOCK(sc); 3983 if (hn_xpnt_vf_isready(sc)) { 3984 /* 3985 * SIOCGIFMEDIA expects ifmediareq, so don't 3986 * create and pass ifr_vf to the VF here; just 3987 * replace the ifr_name. 
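 * The ifr_name is switched to the VF only for the duration of
 * the ioctl and restored before returning to the caller.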
3988 */ 3989 vf_ifp = sc->hn_vf_ifp; 3990 strlcpy(ifr->ifr_name, if_name(vf_ifp), 3991 sizeof(ifr->ifr_name)); 3992 error = ifhwioctl(cmd, vf_ifp, data, curthread); 3993 /* Restore the ifr_name. */ 3994 strlcpy(ifr->ifr_name, if_name(ifp), 3995 sizeof(ifr->ifr_name)); 3996 HN_UNLOCK(sc); 3997 break; 3998 } 3999 HN_UNLOCK(sc); 4000 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4001 break; 4002 4003 case SIOCGIFRSSHASH: 4004 ifrh = (struct ifrsshash *)data; 4005 HN_LOCK(sc); 4006 if (sc->hn_rx_ring_inuse == 1) { 4007 HN_UNLOCK(sc); 4008 ifrh->ifrh_func = RSS_FUNC_NONE; 4009 ifrh->ifrh_types = 0; 4010 break; 4011 } 4012 4013 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4014 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4015 else 4016 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4017 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4018 HN_UNLOCK(sc); 4019 break; 4020 4021 case SIOCGIFRSSKEY: 4022 ifrk = (struct ifrsskey *)data; 4023 HN_LOCK(sc); 4024 if (sc->hn_rx_ring_inuse == 1) { 4025 HN_UNLOCK(sc); 4026 ifrk->ifrk_func = RSS_FUNC_NONE; 4027 ifrk->ifrk_keylen = 0; 4028 break; 4029 } 4030 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4031 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4032 else 4033 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4034 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4035 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4036 NDIS_HASH_KEYSIZE_TOEPLITZ); 4037 HN_UNLOCK(sc); 4038 break; 4039 4040 default: 4041 error = ether_ioctl(ifp, cmd, data); 4042 break; 4043 } 4044 return (error); 4045 } 4046 4047 static void 4048 hn_stop(struct hn_softc *sc, bool detaching) 4049 { 4050 if_t ifp = sc->hn_ifp; 4051 int i; 4052 4053 HN_LOCK_ASSERT(sc); 4054 4055 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4056 ("synthetic parts were not attached")); 4057 4058 /* Clear RUNNING bit ASAP. */ 4059 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 4060 4061 /* Disable polling. */ 4062 hn_polling(sc, 0); 4063 4064 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4065 KASSERT(sc->hn_vf_ifp != NULL, 4066 ("%s: VF is not attached", if_name(ifp))); 4067 4068 /* Mark transparent mode VF as disabled. */ 4069 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4070 4071 /* 4072 * NOTE: 4073 * Datapath setting must happen _before_ bringing 4074 * the VF down. 4075 */ 4076 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4077 4078 /* 4079 * Bring the VF down. 4080 */ 4081 hn_xpnt_vf_saveifflags(sc); 4082 if_setflagbits(ifp, 0, IFF_UP); 4083 hn_xpnt_vf_iocsetflags(sc); 4084 } 4085 4086 /* Suspend data transfers. */ 4087 hn_suspend_data(sc); 4088 4089 /* Clear OACTIVE bit. */ 4090 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4091 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4092 sc->hn_tx_ring[i].hn_oactive = 0; 4093 4094 /* 4095 * If the non-transparent mode VF is active, make sure 4096 * that the RX filter still allows packet reception. 4097 */ 4098 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4099 hn_rxfilter_config(sc); 4100 } 4101 4102 static void 4103 hn_init_locked(struct hn_softc *sc) 4104 { 4105 if_t ifp = sc->hn_ifp; 4106 int i; 4107 4108 HN_LOCK_ASSERT(sc); 4109 4110 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4111 return; 4112 4113 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 4114 return; 4115 4116 /* Configure RX filter */ 4117 hn_rxfilter_config(sc); 4118 4119 /* Clear OACTIVE bit. */ 4120 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4121 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4122 sc->hn_tx_ring[i].hn_oactive = 0; 4123 4124 /* Clear TX 'suspended' bit. 
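 * hn_resume_tx() clears hn_suspended on all in-use TX rings so
 * that hn_xmit()/hn_start_locked() may transmit again.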
*/ 4125 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4126 4127 if (hn_xpnt_vf_isready(sc)) { 4128 /* Initialize transparent VF. */ 4129 hn_xpnt_vf_init(sc); 4130 } 4131 4132 /* Everything is ready; unleash! */ 4133 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); 4134 4135 /* Re-enable polling if requested. */ 4136 if (sc->hn_pollhz > 0) 4137 hn_polling(sc, sc->hn_pollhz); 4138 } 4139 4140 static void 4141 hn_init(void *xsc) 4142 { 4143 struct hn_softc *sc = xsc; 4144 4145 HN_LOCK(sc); 4146 hn_init_locked(sc); 4147 HN_UNLOCK(sc); 4148 } 4149 4150 static int 4151 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4152 { 4153 struct hn_softc *sc = arg1; 4154 unsigned int lenlim; 4155 int error; 4156 4157 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4158 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4159 if (error || req->newptr == NULL) 4160 return error; 4161 4162 HN_LOCK(sc); 4163 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4164 lenlim > TCP_LRO_LENGTH_MAX) { 4165 HN_UNLOCK(sc); 4166 return EINVAL; 4167 } 4168 hn_set_lro_lenlim(sc, lenlim); 4169 HN_UNLOCK(sc); 4170 4171 return 0; 4172 } 4173 4174 static int 4175 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4176 { 4177 struct hn_softc *sc = arg1; 4178 int ackcnt, error, i; 4179 4180 /* 4181 * lro_ackcnt_lim is append count limit, 4182 * +1 to turn it into aggregation limit. 4183 */ 4184 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4185 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4186 if (error || req->newptr == NULL) 4187 return error; 4188 4189 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4190 return EINVAL; 4191 4192 /* 4193 * Convert aggregation limit back to append 4194 * count limit. 4195 */ 4196 --ackcnt; 4197 HN_LOCK(sc); 4198 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4199 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4200 HN_UNLOCK(sc); 4201 return 0; 4202 } 4203 4204 static int 4205 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4206 { 4207 struct hn_softc *sc = arg1; 4208 int hcsum = arg2; 4209 int on, error, i; 4210 4211 on = 0; 4212 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4213 on = 1; 4214 4215 error = sysctl_handle_int(oidp, &on, 0, req); 4216 if (error || req->newptr == NULL) 4217 return error; 4218 4219 HN_LOCK(sc); 4220 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4221 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4222 4223 if (on) 4224 rxr->hn_trust_hcsum |= hcsum; 4225 else 4226 rxr->hn_trust_hcsum &= ~hcsum; 4227 } 4228 HN_UNLOCK(sc); 4229 return 0; 4230 } 4231 4232 static int 4233 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4234 { 4235 struct hn_softc *sc = arg1; 4236 int chim_size, error; 4237 4238 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4239 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4240 if (error || req->newptr == NULL) 4241 return error; 4242 4243 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4244 return EINVAL; 4245 4246 HN_LOCK(sc); 4247 hn_set_chim_size(sc, chim_size); 4248 HN_UNLOCK(sc); 4249 return 0; 4250 } 4251 4252 static int 4253 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4254 { 4255 struct hn_softc *sc = arg1; 4256 int ofs = arg2, i, error; 4257 struct hn_rx_ring *rxr; 4258 uint64_t stat; 4259 4260 stat = 0; 4261 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4262 rxr = &sc->hn_rx_ring[i]; 4263 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4264 } 4265 4266 error = sysctl_handle_64(oidp, &stat, 0, req); 4267 if (error || req->newptr == NULL) 4268 return error; 4269 4270 /* Zero out this stat. 
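 * Reaching here means the sysctl was written to; a write of any
 * value resets the counter on every RX ring.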
*/ 4271 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4272 rxr = &sc->hn_rx_ring[i]; 4273 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4274 } 4275 return 0; 4276 } 4277 4278 static int 4279 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4280 { 4281 struct hn_softc *sc = arg1; 4282 int ofs = arg2, i, error; 4283 struct hn_rx_ring *rxr; 4284 u_long stat; 4285 4286 stat = 0; 4287 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4288 rxr = &sc->hn_rx_ring[i]; 4289 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4290 } 4291 4292 error = sysctl_handle_long(oidp, &stat, 0, req); 4293 if (error || req->newptr == NULL) 4294 return error; 4295 4296 /* Zero out this stat. */ 4297 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4298 rxr = &sc->hn_rx_ring[i]; 4299 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4300 } 4301 return 0; 4302 } 4303 4304 static int 4305 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4306 { 4307 struct hn_softc *sc = arg1; 4308 int ofs = arg2, i, error; 4309 struct hn_tx_ring *txr; 4310 u_long stat; 4311 4312 stat = 0; 4313 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4314 txr = &sc->hn_tx_ring[i]; 4315 stat += *((u_long *)((uint8_t *)txr + ofs)); 4316 } 4317 4318 error = sysctl_handle_long(oidp, &stat, 0, req); 4319 if (error || req->newptr == NULL) 4320 return error; 4321 4322 /* Zero out this stat. */ 4323 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4324 txr = &sc->hn_tx_ring[i]; 4325 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4326 } 4327 return 0; 4328 } 4329 4330 static int 4331 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4332 { 4333 struct hn_softc *sc = arg1; 4334 int ofs = arg2, i, error, conf; 4335 struct hn_tx_ring *txr; 4336 4337 txr = &sc->hn_tx_ring[0]; 4338 conf = *((int *)((uint8_t *)txr + ofs)); 4339 4340 error = sysctl_handle_int(oidp, &conf, 0, req); 4341 if (error || req->newptr == NULL) 4342 return error; 4343 4344 HN_LOCK(sc); 4345 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4346 txr = &sc->hn_tx_ring[i]; 4347 *((int *)((uint8_t *)txr + ofs)) = conf; 4348 } 4349 HN_UNLOCK(sc); 4350 4351 return 0; 4352 } 4353 4354 static int 4355 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4356 { 4357 struct hn_softc *sc = arg1; 4358 int error, size; 4359 4360 size = sc->hn_agg_size; 4361 error = sysctl_handle_int(oidp, &size, 0, req); 4362 if (error || req->newptr == NULL) 4363 return (error); 4364 4365 HN_LOCK(sc); 4366 sc->hn_agg_size = size; 4367 hn_set_txagg(sc); 4368 HN_UNLOCK(sc); 4369 4370 return (0); 4371 } 4372 4373 static int 4374 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4375 { 4376 struct hn_softc *sc = arg1; 4377 int error, pkts; 4378 4379 pkts = sc->hn_agg_pkts; 4380 error = sysctl_handle_int(oidp, &pkts, 0, req); 4381 if (error || req->newptr == NULL) 4382 return (error); 4383 4384 HN_LOCK(sc); 4385 sc->hn_agg_pkts = pkts; 4386 hn_set_txagg(sc); 4387 HN_UNLOCK(sc); 4388 4389 return (0); 4390 } 4391 4392 static int 4393 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4394 { 4395 struct hn_softc *sc = arg1; 4396 int pkts; 4397 4398 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4399 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4400 } 4401 4402 static int 4403 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4404 { 4405 struct hn_softc *sc = arg1; 4406 int align; 4407 4408 align = sc->hn_tx_ring[0].hn_agg_align; 4409 return (sysctl_handle_int(oidp, &align, 0, req)); 4410 } 4411 4412 static void 4413 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4414 { 4415 if (pollhz == 0) 4416 vmbus_chan_poll_disable(chan); 4417 else 4418 vmbus_chan_poll_enable(chan, pollhz); 4419 } 4420 4421 static void 4422 
hn_polling(struct hn_softc *sc, u_int pollhz) 4423 { 4424 int nsubch = sc->hn_rx_ring_inuse - 1; 4425 4426 HN_LOCK_ASSERT(sc); 4427 4428 if (nsubch > 0) { 4429 struct vmbus_channel **subch; 4430 int i; 4431 4432 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4433 for (i = 0; i < nsubch; ++i) 4434 hn_chan_polling(subch[i], pollhz); 4435 vmbus_subchan_rel(subch, nsubch); 4436 } 4437 hn_chan_polling(sc->hn_prichan, pollhz); 4438 } 4439 4440 static int 4441 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4442 { 4443 struct hn_softc *sc = arg1; 4444 int pollhz, error; 4445 4446 pollhz = sc->hn_pollhz; 4447 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4448 if (error || req->newptr == NULL) 4449 return (error); 4450 4451 if (pollhz != 0 && 4452 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4453 return (EINVAL); 4454 4455 HN_LOCK(sc); 4456 if (sc->hn_pollhz != pollhz) { 4457 sc->hn_pollhz = pollhz; 4458 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && 4459 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4460 hn_polling(sc, sc->hn_pollhz); 4461 } 4462 HN_UNLOCK(sc); 4463 4464 return (0); 4465 } 4466 4467 static int 4468 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4469 { 4470 struct hn_softc *sc = arg1; 4471 char verstr[16]; 4472 4473 snprintf(verstr, sizeof(verstr), "%u.%u", 4474 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4475 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4476 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4477 } 4478 4479 static int 4480 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4481 { 4482 struct hn_softc *sc = arg1; 4483 char caps_str[128]; 4484 uint32_t caps; 4485 4486 HN_LOCK(sc); 4487 caps = sc->hn_caps; 4488 HN_UNLOCK(sc); 4489 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4490 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4491 } 4492 4493 static int 4494 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4495 { 4496 struct hn_softc *sc = arg1; 4497 char assist_str[128]; 4498 uint32_t hwassist; 4499 4500 HN_LOCK(sc); 4501 hwassist = if_gethwassist(sc->hn_ifp); 4502 HN_UNLOCK(sc); 4503 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4504 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4505 } 4506 4507 static int 4508 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4509 { 4510 struct hn_softc *sc = arg1; 4511 char filter_str[128]; 4512 uint32_t filter; 4513 4514 HN_LOCK(sc); 4515 filter = sc->hn_rx_filter; 4516 HN_UNLOCK(sc); 4517 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4518 NDIS_PACKET_TYPES); 4519 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4520 } 4521 4522 static int 4523 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4524 { 4525 struct hn_softc *sc = arg1; 4526 uint32_t mtu; 4527 int error; 4528 HN_LOCK(sc); 4529 error = hn_rndis_get_mtu(sc, &mtu); 4530 if (error) { 4531 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4532 goto back; 4533 } 4534 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4535 if (error || req->newptr == NULL) 4536 goto back; 4537 4538 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4539 if (error) 4540 goto back; 4541 error = hn_rndis_reconf_offload(sc, mtu); 4542 back: 4543 HN_UNLOCK(sc); 4544 return (error); 4545 } 4546 #ifndef RSS 4547 4548 static int 4549 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4550 { 4551 struct hn_softc *sc = arg1; 4552 int error; 4553 4554 HN_LOCK(sc); 4555 4556 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4557 if (error || req->newptr == NULL) 
4558 goto back; 4559 4560 if ((sc->hn_flags & HN_FLAG_RXVF) || 4561 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4562 /* 4563 * RSS key is synchronized w/ VF's, don't allow users 4564 * to change it. 4565 */ 4566 error = EBUSY; 4567 goto back; 4568 } 4569 4570 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4571 if (error) 4572 goto back; 4573 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4574 4575 if (sc->hn_rx_ring_inuse > 1) { 4576 error = hn_rss_reconfig(sc); 4577 } else { 4578 /* Not RSS capable, at least for now; just save the RSS key. */ 4579 error = 0; 4580 } 4581 back: 4582 HN_UNLOCK(sc); 4583 return (error); 4584 } 4585 4586 static int 4587 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4588 { 4589 struct hn_softc *sc = arg1; 4590 int error; 4591 4592 HN_LOCK(sc); 4593 4594 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4595 if (error || req->newptr == NULL) 4596 goto back; 4597 4598 /* 4599 * Don't allow RSS indirect table change, if this interface is not 4600 * RSS capable currently. 4601 */ 4602 if (sc->hn_rx_ring_inuse == 1) { 4603 error = EOPNOTSUPP; 4604 goto back; 4605 } 4606 4607 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4608 if (error) 4609 goto back; 4610 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4611 4612 hn_rss_ind_fixup(sc); 4613 error = hn_rss_reconfig(sc); 4614 back: 4615 HN_UNLOCK(sc); 4616 return (error); 4617 } 4618 4619 #endif /* !RSS */ 4620 4621 static int 4622 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4623 { 4624 struct hn_softc *sc = arg1; 4625 char hash_str[128]; 4626 uint32_t hash; 4627 4628 HN_LOCK(sc); 4629 hash = sc->hn_rss_hash; 4630 HN_UNLOCK(sc); 4631 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4632 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4633 } 4634 4635 static int 4636 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4637 { 4638 struct hn_softc *sc = arg1; 4639 char hash_str[128]; 4640 uint32_t hash; 4641 4642 HN_LOCK(sc); 4643 hash = sc->hn_rss_hcap; 4644 HN_UNLOCK(sc); 4645 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4646 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4647 } 4648 4649 static int 4650 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4651 { 4652 struct hn_softc *sc = arg1; 4653 char hash_str[128]; 4654 uint32_t hash; 4655 4656 HN_LOCK(sc); 4657 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4658 HN_UNLOCK(sc); 4659 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4660 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4661 } 4662 4663 static int 4664 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4665 { 4666 struct hn_softc *sc = arg1; 4667 char vf_name[IFNAMSIZ + 1]; 4668 if_t vf_ifp; 4669 4670 HN_LOCK(sc); 4671 vf_name[0] = '\0'; 4672 vf_ifp = sc->hn_vf_ifp; 4673 if (vf_ifp != NULL) 4674 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4675 HN_UNLOCK(sc); 4676 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4677 } 4678 4679 static int 4680 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4681 { 4682 struct hn_softc *sc = arg1; 4683 char vf_name[IFNAMSIZ + 1]; 4684 if_t vf_ifp; 4685 4686 HN_LOCK(sc); 4687 vf_name[0] = '\0'; 4688 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4689 if (vf_ifp != NULL) 4690 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4691 HN_UNLOCK(sc); 4692 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4693 } 4694 4695 static int 4696 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4697 { 4698 struct rm_priotracker pt; 4699 struct sbuf *sb; 4700 
int error, i; 4701 bool first; 4702 4703 error = sysctl_wire_old_buffer(req, 0); 4704 if (error != 0) 4705 return (error); 4706 4707 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4708 if (sb == NULL) 4709 return (ENOMEM); 4710 4711 rm_rlock(&hn_vfmap_lock, &pt); 4712 4713 first = true; 4714 for (i = 0; i < hn_vfmap_size; ++i) { 4715 struct epoch_tracker et; 4716 if_t ifp; 4717 4718 if (hn_vfmap[i] == NULL) 4719 continue; 4720 4721 NET_EPOCH_ENTER(et); 4722 ifp = ifnet_byindex(i); 4723 if (ifp != NULL) { 4724 if (first) 4725 sbuf_printf(sb, "%s", if_name(ifp)); 4726 else 4727 sbuf_printf(sb, " %s", if_name(ifp)); 4728 first = false; 4729 } 4730 NET_EPOCH_EXIT(et); 4731 } 4732 4733 rm_runlock(&hn_vfmap_lock, &pt); 4734 4735 error = sbuf_finish(sb); 4736 sbuf_delete(sb); 4737 return (error); 4738 } 4739 4740 static int 4741 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4742 { 4743 struct rm_priotracker pt; 4744 struct sbuf *sb; 4745 int error, i; 4746 bool first; 4747 4748 error = sysctl_wire_old_buffer(req, 0); 4749 if (error != 0) 4750 return (error); 4751 4752 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4753 if (sb == NULL) 4754 return (ENOMEM); 4755 4756 rm_rlock(&hn_vfmap_lock, &pt); 4757 4758 first = true; 4759 for (i = 0; i < hn_vfmap_size; ++i) { 4760 struct epoch_tracker et; 4761 if_t ifp, hn_ifp; 4762 4763 hn_ifp = hn_vfmap[i]; 4764 if (hn_ifp == NULL) 4765 continue; 4766 4767 NET_EPOCH_ENTER(et); 4768 ifp = ifnet_byindex(i); 4769 if (ifp != NULL) { 4770 if (first) { 4771 sbuf_printf(sb, "%s:%s", if_name(ifp), 4772 if_name(hn_ifp)); 4773 } else { 4774 sbuf_printf(sb, " %s:%s", if_name(ifp), 4775 if_name(hn_ifp)); 4776 } 4777 first = false; 4778 } 4779 NET_EPOCH_EXIT(et); 4780 } 4781 4782 rm_runlock(&hn_vfmap_lock, &pt); 4783 4784 error = sbuf_finish(sb); 4785 sbuf_delete(sb); 4786 return (error); 4787 } 4788 4789 static int 4790 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4791 { 4792 struct hn_softc *sc = arg1; 4793 int error, onoff = 0; 4794 4795 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4796 onoff = 1; 4797 error = sysctl_handle_int(oidp, &onoff, 0, req); 4798 if (error || req->newptr == NULL) 4799 return (error); 4800 4801 HN_LOCK(sc); 4802 /* NOTE: hn_vf_lock for hn_transmit() */ 4803 rm_wlock(&sc->hn_vf_lock); 4804 if (onoff) 4805 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4806 else 4807 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4808 rm_wunlock(&sc->hn_vf_lock); 4809 HN_UNLOCK(sc); 4810 4811 return (0); 4812 } 4813 4814 static int 4815 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4816 { 4817 struct hn_softc *sc = arg1; 4818 int enabled = 0; 4819 4820 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4821 enabled = 1; 4822 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4823 } 4824 4825 static int 4826 hn_check_iplen(const struct mbuf *m, int hoff) 4827 { 4828 const struct ip *ip; 4829 int len, iphlen, iplen; 4830 const struct tcphdr *th; 4831 int thoff; /* TCP data offset */ 4832 4833 len = hoff + sizeof(struct ip); 4834 4835 /* The packet must be at least the size of an IP header. */ 4836 if (m->m_pkthdr.len < len) 4837 return IPPROTO_DONE; 4838 4839 /* The fixed IP header must reside completely in the first mbuf. */ 4840 if (m->m_len < len) 4841 return IPPROTO_DONE; 4842 4843 ip = mtodo(m, hoff); 4844 4845 /* Bound check the packet's stated IP header length. */ 4846 iphlen = ip->ip_hl << 2; 4847 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4848 return IPPROTO_DONE; 4849 4850 /* The full IP header must reside completely in the one mbuf. 
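 * i.e. the first mbuf must cover hoff plus the header length
 * claimed by ip_hl.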
*/ 4851 if (m->m_len < hoff + iphlen) 4852 return IPPROTO_DONE; 4853 4854 iplen = ntohs(ip->ip_len); 4855 4856 /* 4857 * Check that the amount of data in the buffers is as 4858 * at least much as the IP header would have us expect. 4859 */ 4860 if (m->m_pkthdr.len < hoff + iplen) 4861 return IPPROTO_DONE; 4862 4863 /* 4864 * Ignore IP fragments. 4865 */ 4866 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4867 return IPPROTO_DONE; 4868 4869 /* 4870 * The TCP/IP or UDP/IP header must be entirely contained within 4871 * the first fragment of a packet. 4872 */ 4873 switch (ip->ip_p) { 4874 case IPPROTO_TCP: 4875 if (iplen < iphlen + sizeof(struct tcphdr)) 4876 return IPPROTO_DONE; 4877 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4878 return IPPROTO_DONE; 4879 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4880 thoff = th->th_off << 2; 4881 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4882 return IPPROTO_DONE; 4883 if (m->m_len < hoff + iphlen + thoff) 4884 return IPPROTO_DONE; 4885 break; 4886 case IPPROTO_UDP: 4887 if (iplen < iphlen + sizeof(struct udphdr)) 4888 return IPPROTO_DONE; 4889 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4890 return IPPROTO_DONE; 4891 break; 4892 default: 4893 if (iplen < iphlen) 4894 return IPPROTO_DONE; 4895 break; 4896 } 4897 return ip->ip_p; 4898 } 4899 4900 static void 4901 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4902 { 4903 const struct ether_header *eh; 4904 uint16_t etype; 4905 int hoff; 4906 4907 hoff = sizeof(*eh); 4908 /* Checked at the beginning of this function. */ 4909 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4910 4911 eh = mtod(m_new, const struct ether_header *); 4912 etype = ntohs(eh->ether_type); 4913 if (etype == ETHERTYPE_VLAN) { 4914 const struct ether_vlan_header *evl; 4915 4916 hoff = sizeof(*evl); 4917 if (m_new->m_len < hoff) 4918 return; 4919 evl = mtod(m_new, const struct ether_vlan_header *); 4920 etype = ntohs(evl->evl_proto); 4921 } 4922 *l3proto = etype; 4923 4924 if (etype == ETHERTYPE_IP) 4925 *l4proto = hn_check_iplen(m_new, hoff); 4926 else 4927 *l4proto = IPPROTO_DONE; 4928 } 4929 4930 static int 4931 hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4932 { 4933 struct sysctl_oid_list *child; 4934 struct sysctl_ctx_list *ctx; 4935 device_t dev = sc->hn_dev; 4936 #if defined(INET) || defined(INET6) 4937 int lroent_cnt; 4938 #endif 4939 int i; 4940 4941 /* 4942 * Create RXBUF for reception. 4943 * 4944 * NOTE: 4945 * - It is shared by all channels. 4946 * - A large enough buffer is allocated, certain version of NVSes 4947 * may further limit the usable space. 
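 * - It is freed in hn_destroy_rx_data(), unless the host still
 *   holds a reference to it (HN_FLAG_RXBUF_REF).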
4948 */ 4949 sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 4950 0ul, ~0ul, PAGE_SIZE, 0); 4951 if (sc->hn_rxbuf == NULL) { 4952 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4953 return (ENOMEM); 4954 } 4955 4956 sc->hn_rx_ring_cnt = ring_cnt; 4957 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4958 4959 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4960 M_DEVBUF, M_WAITOK | M_ZERO); 4961 4962 #if defined(INET) || defined(INET6) 4963 lroent_cnt = hn_lro_entry_count; 4964 if (lroent_cnt < TCP_LRO_ENTRIES) 4965 lroent_cnt = TCP_LRO_ENTRIES; 4966 if (bootverbose) 4967 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4968 #endif /* INET || INET6 */ 4969 4970 ctx = device_get_sysctl_ctx(dev); 4971 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4972 4973 /* Create dev.hn.UNIT.rx sysctl tree */ 4974 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4975 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4976 4977 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4978 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4979 4980 rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, 4981 M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); 4982 if (rxr->hn_br == NULL) { 4983 device_printf(dev, "allocate bufring failed\n"); 4984 return (ENOMEM); 4985 } 4986 4987 if (hn_trust_hosttcp) 4988 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4989 if (hn_trust_hostudp) 4990 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4991 if (hn_trust_hostip) 4992 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4993 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4994 rxr->hn_ifp = sc->hn_ifp; 4995 if (i < sc->hn_tx_ring_cnt) 4996 rxr->hn_txr = &sc->hn_tx_ring[i]; 4997 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4998 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4999 rxr->hn_rx_idx = i; 5000 rxr->hn_rxbuf = sc->hn_rxbuf; 5001 5002 /* 5003 * Initialize LRO. 
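 * Each RX ring gets its own LRO control block; the length and
 * ACK-append limits start at the driver defaults and can be
 * tuned later via the lro_length_lim and lro_ackcnt_lim sysctls.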
5004 */ 5005 #if defined(INET) || defined(INET6) 5006 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5007 hn_lro_mbufq_depth); 5008 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5009 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5010 #endif /* INET || INET6 */ 5011 5012 if (sc->hn_rx_sysctl_tree != NULL) { 5013 char name[16]; 5014 5015 /* 5016 * Create per RX ring sysctl tree: 5017 * dev.hn.UNIT.rx.RINGID 5018 */ 5019 snprintf(name, sizeof(name), "%d", i); 5020 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5021 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5022 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5023 5024 if (rxr->hn_rx_sysctl_tree != NULL) { 5025 SYSCTL_ADD_ULONG(ctx, 5026 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5027 OID_AUTO, "packets", 5028 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5029 "# of packets received"); 5030 SYSCTL_ADD_ULONG(ctx, 5031 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5032 OID_AUTO, "rss_pkts", 5033 CTLFLAG_RW | CTLFLAG_STATS, 5034 &rxr->hn_rss_pkts, 5035 "# of packets w/ RSS info received"); 5036 SYSCTL_ADD_ULONG(ctx, 5037 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5038 OID_AUTO, "rsc_pkts", 5039 CTLFLAG_RW | CTLFLAG_STATS, 5040 &rxr->hn_rsc_pkts, 5041 "# of RSC packets received"); 5042 SYSCTL_ADD_ULONG(ctx, 5043 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5044 OID_AUTO, "rsc_drop", 5045 CTLFLAG_RW | CTLFLAG_STATS, 5046 &rxr->hn_rsc_drop, 5047 "# of RSC fragments dropped"); 5048 SYSCTL_ADD_INT(ctx, 5049 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5050 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5051 &rxr->hn_pktbuf_len, 0, 5052 "Temporary channel packet buffer length"); 5053 } 5054 } 5055 } 5056 5057 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5058 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5059 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5060 hn_rx_stat_u64_sysctl, 5061 "LU", "LRO queued"); 5062 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5063 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5064 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5065 hn_rx_stat_u64_sysctl, 5066 "LU", "LRO flushed"); 5067 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5068 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5069 __offsetof(struct hn_rx_ring, hn_lro_tried), 5070 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5071 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5072 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5073 hn_lro_lenlim_sysctl, "IU", 5074 "Max # of data bytes to be aggregated by LRO"); 5075 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5076 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5077 hn_lro_ackcnt_sysctl, "I", 5078 "Max # of ACKs to be aggregated by LRO"); 5079 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5080 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5081 hn_trust_hcsum_sysctl, "I", 5082 "Trust tcp segment verification on host side, " 5083 "when csum info is missing"); 5084 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5085 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5086 hn_trust_hcsum_sysctl, "I", 5087 "Trust udp datagram verification on host side, " 5088 "when csum info is missing"); 5089 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5090 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5091 hn_trust_hcsum_sysctl, "I", 5092 "Trust ip packet verification on host side, " 5093 "when csum info is missing"); 5094 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5095 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5096 __offsetof(struct hn_rx_ring, hn_csum_ip), 5097 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5098 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5099 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5100 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5101 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5102 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5103 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5104 __offsetof(struct hn_rx_ring, hn_csum_udp), 5105 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5106 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5107 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5108 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5109 hn_rx_stat_ulong_sysctl, "LU", 5110 "# of packets that we trust host's csum verification"); 5111 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5112 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5113 __offsetof(struct hn_rx_ring, hn_small_pkts), 5114 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5115 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5116 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5117 __offsetof(struct hn_rx_ring, hn_ack_failed), 5118 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5119 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5120 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5121 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5122 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5123 5124 return (0); 5125 } 5126 5127 static void 5128 hn_destroy_rx_data(struct hn_softc *sc) 5129 { 5130 int i; 5131 5132 if (sc->hn_rxbuf != NULL) { 5133 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5134 contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); 5135 else 5136 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5137 sc->hn_rxbuf = NULL; 5138 } 5139 5140 if (sc->hn_rx_ring_cnt == 0) 5141 return; 5142 5143 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5144 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5145 5146 if (rxr->hn_br == NULL) 5147 continue; 5148 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5149 contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, 5150 M_DEVBUF); 5151 } else { 5152 device_printf(sc->hn_dev, 5153 "%dth channel bufring is referenced", i); 5154 } 5155 rxr->hn_br = NULL; 5156 5157 #if defined(INET) || defined(INET6) 5158 tcp_lro_free(&rxr->hn_lro); 5159 #endif 5160 free(rxr->hn_pktbuf, M_DEVBUF); 5161 } 5162 free(sc->hn_rx_ring, M_DEVBUF); 5163 sc->hn_rx_ring = NULL; 5164 5165 sc->hn_rx_ring_cnt = 0; 5166 sc->hn_rx_ring_inuse = 0; 5167 } 5168 5169 static int 5170 hn_tx_ring_create(struct hn_softc *sc, int id) 5171 { 5172 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5173 device_t dev = sc->hn_dev; 5174 bus_dma_tag_t parent_dtag; 5175 int error, i; 5176 5177 txr->hn_sc = sc; 5178 txr->hn_tx_idx = id; 5179 5180 #ifndef HN_USE_TXDESC_BUFRING 5181 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5182 #endif 5183 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5184 5185 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5186 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5187 M_DEVBUF, M_WAITOK | M_ZERO); 5188 #ifndef HN_USE_TXDESC_BUFRING 5189 SLIST_INIT(&txr->hn_txlist); 5190 #else 5191 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5192 M_WAITOK, &txr->hn_tx_lock); 5193 #endif 5194 5195 if (hn_tx_taskq_mode 
== HN_TX_TASKQ_M_EVTTQ) { 5196 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5197 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5198 } else { 5199 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5200 } 5201 5202 #ifdef HN_IFSTART_SUPPORT 5203 if (hn_use_if_start) { 5204 txr->hn_txeof = hn_start_txeof; 5205 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5206 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5207 } else 5208 #endif 5209 { 5210 int br_depth; 5211 5212 txr->hn_txeof = hn_xmit_txeof; 5213 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5214 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5215 5216 br_depth = hn_get_txswq_depth(txr); 5217 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5218 M_WAITOK, &txr->hn_tx_lock); 5219 } 5220 5221 txr->hn_direct_tx_size = hn_direct_tx_size; 5222 5223 /* 5224 * Always schedule transmission instead of trying to do direct 5225 * transmission. This one gives the best performance so far. 5226 */ 5227 txr->hn_sched_tx = 1; 5228 5229 parent_dtag = bus_get_dma_tag(dev); 5230 5231 /* DMA tag for RNDIS packet messages. */ 5232 error = bus_dma_tag_create(parent_dtag, /* parent */ 5233 HN_RNDIS_PKT_ALIGN, /* alignment */ 5234 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5235 BUS_SPACE_MAXADDR, /* lowaddr */ 5236 BUS_SPACE_MAXADDR, /* highaddr */ 5237 NULL, NULL, /* filter, filterarg */ 5238 HN_RNDIS_PKT_LEN, /* maxsize */ 5239 1, /* nsegments */ 5240 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5241 0, /* flags */ 5242 NULL, /* lockfunc */ 5243 NULL, /* lockfuncarg */ 5244 &txr->hn_tx_rndis_dtag); 5245 if (error) { 5246 device_printf(dev, "failed to create rndis dmatag\n"); 5247 return error; 5248 } 5249 5250 /* DMA tag for data. */ 5251 error = bus_dma_tag_create(parent_dtag, /* parent */ 5252 1, /* alignment */ 5253 HN_TX_DATA_BOUNDARY, /* boundary */ 5254 BUS_SPACE_MAXADDR, /* lowaddr */ 5255 BUS_SPACE_MAXADDR, /* highaddr */ 5256 NULL, NULL, /* filter, filterarg */ 5257 HN_TX_DATA_MAXSIZE, /* maxsize */ 5258 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5259 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5260 0, /* flags */ 5261 NULL, /* lockfunc */ 5262 NULL, /* lockfuncarg */ 5263 &txr->hn_tx_data_dtag); 5264 if (error) { 5265 device_printf(dev, "failed to create data dmatag\n"); 5266 return error; 5267 } 5268 5269 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5270 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5271 5272 txd->txr = txr; 5273 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5274 STAILQ_INIT(&txd->agg_list); 5275 5276 /* 5277 * Allocate and load RNDIS packet message. 5278 */ 5279 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5280 (void **)&txd->rndis_pkt, 5281 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5282 &txd->rndis_pkt_dmap); 5283 if (error) { 5284 device_printf(dev, 5285 "failed to allocate rndis_packet_msg, %d\n", i); 5286 return error; 5287 } 5288 5289 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5290 txd->rndis_pkt_dmap, 5291 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5292 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5293 BUS_DMA_NOWAIT); 5294 if (error) { 5295 device_printf(dev, 5296 "failed to load rndis_packet_msg, %d\n", i); 5297 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5298 txd->rndis_pkt, txd->rndis_pkt_dmap); 5299 return error; 5300 } 5301 5302 /* DMA map for TX data. 
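 * One map per descriptor; it is loaded on demand for packets
 * that are sent by scatter/gather instead of the chimney buffer.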
*/ 5303 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5304 &txd->data_dmap); 5305 if (error) { 5306 device_printf(dev, 5307 "failed to allocate tx data dmamap\n"); 5308 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5309 txd->rndis_pkt_dmap); 5310 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5311 txd->rndis_pkt, txd->rndis_pkt_dmap); 5312 return error; 5313 } 5314 5315 /* All set, put it to list */ 5316 txd->flags |= HN_TXD_FLAG_ONLIST; 5317 #ifndef HN_USE_TXDESC_BUFRING 5318 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5319 #else 5320 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5321 #endif 5322 } 5323 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5324 5325 if (sc->hn_tx_sysctl_tree != NULL) { 5326 struct sysctl_oid_list *child; 5327 struct sysctl_ctx_list *ctx; 5328 char name[16]; 5329 5330 /* 5331 * Create per TX ring sysctl tree: 5332 * dev.hn.UNIT.tx.RINGID 5333 */ 5334 ctx = device_get_sysctl_ctx(dev); 5335 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5336 5337 snprintf(name, sizeof(name), "%d", id); 5338 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5339 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5340 5341 if (txr->hn_tx_sysctl_tree != NULL) { 5342 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5343 5344 #ifdef HN_DEBUG 5345 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5346 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5347 "# of available TX descs"); 5348 #endif 5349 #ifdef HN_IFSTART_SUPPORT 5350 if (!hn_use_if_start) 5351 #endif 5352 { 5353 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5354 CTLFLAG_RD, &txr->hn_oactive, 0, 5355 "over active"); 5356 } 5357 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5358 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5359 "# of packets transmitted"); 5360 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5361 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5362 "# of sends"); 5363 } 5364 } 5365 5366 return 0; 5367 } 5368 5369 static void 5370 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5371 { 5372 struct hn_tx_ring *txr = txd->txr; 5373 5374 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5375 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5376 5377 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5378 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5379 txd->rndis_pkt_dmap); 5380 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5381 } 5382 5383 static void 5384 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5385 { 5386 5387 KASSERT(txd->refs == 0 || txd->refs == 1, 5388 ("invalid txd refs %d", txd->refs)); 5389 5390 /* Aggregated txds will be freed by their aggregating txd. */ 5391 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5392 int freed __diagused; 5393 5394 freed = hn_txdesc_put(txr, txd); 5395 KASSERT(freed, ("can't free txdesc")); 5396 } 5397 } 5398 5399 static void 5400 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5401 { 5402 int i; 5403 5404 if (txr->hn_txdesc == NULL) 5405 return; 5406 5407 /* 5408 * NOTE: 5409 * Because the freeing of aggregated txds will be deferred 5410 * to the aggregating txd, two passes are used here: 5411 * - The first pass GCes any pending txds. This GC is necessary, 5412 * since if the channels are revoked, hypervisor will not 5413 * deliver send-done for all pending txds. 5414 * - The second pass frees the busdma stuffs, i.e. after all txds 5415 * were freed. 
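 *   Destroying the DMA maps while a txd is still outstanding
 *   would trip the assertions in hn_txdesc_dmamap_destroy().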
5416 */ 5417 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5418 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5419 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5420 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5421 5422 if (txr->hn_tx_data_dtag != NULL) 5423 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5424 if (txr->hn_tx_rndis_dtag != NULL) 5425 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5426 5427 #ifdef HN_USE_TXDESC_BUFRING 5428 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5429 #endif 5430 5431 free(txr->hn_txdesc, M_DEVBUF); 5432 txr->hn_txdesc = NULL; 5433 5434 if (txr->hn_mbuf_br != NULL) 5435 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5436 5437 #ifndef HN_USE_TXDESC_BUFRING 5438 mtx_destroy(&txr->hn_txlist_spin); 5439 #endif 5440 mtx_destroy(&txr->hn_tx_lock); 5441 } 5442 5443 static int 5444 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5445 { 5446 struct sysctl_oid_list *child; 5447 struct sysctl_ctx_list *ctx; 5448 int i; 5449 5450 /* 5451 * Create TXBUF for chimney sending. 5452 * 5453 * NOTE: It is shared by all channels. 5454 */ 5455 sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 5456 0ul, ~0ul, PAGE_SIZE, 0); 5457 if (sc->hn_chim == NULL) { 5458 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5459 return (ENOMEM); 5460 } 5461 5462 sc->hn_tx_ring_cnt = ring_cnt; 5463 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5464 5465 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5466 M_DEVBUF, M_WAITOK | M_ZERO); 5467 5468 ctx = device_get_sysctl_ctx(sc->hn_dev); 5469 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5470 5471 /* Create dev.hn.UNIT.tx sysctl tree */ 5472 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5473 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5474 5475 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5476 int error; 5477 5478 error = hn_tx_ring_create(sc, i); 5479 if (error) 5480 return error; 5481 } 5482 5483 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5484 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5485 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5486 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5487 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5488 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5489 __offsetof(struct hn_tx_ring, hn_send_failed), 5490 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5491 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5492 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5493 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5494 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5495 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5496 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5497 __offsetof(struct hn_tx_ring, hn_flush_failed), 5498 hn_tx_stat_ulong_sysctl, "LU", 5499 "# of packet transmission aggregation flush failure"); 5500 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5501 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5502 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5503 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5504 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5505 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5506 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5507 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5508 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5509 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 
5510 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5511 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5512 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5513 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5514 "# of total TX descs"); 5515 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5516 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5517 "Chimney send packet size upper boundary"); 5518 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5519 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5520 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5521 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5522 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5523 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5524 hn_tx_conf_int_sysctl, "I", 5525 "Size of the packet for direct transmission"); 5526 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5527 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5528 __offsetof(struct hn_tx_ring, hn_sched_tx), 5529 hn_tx_conf_int_sysctl, "I", 5530 "Always schedule transmission " 5531 "instead of doing direct transmission"); 5532 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5533 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5534 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5535 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5536 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5537 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5538 "Applied packet transmission aggregation size"); 5539 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5540 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5541 hn_txagg_pktmax_sysctl, "I", 5542 "Applied packet transmission aggregation packets"); 5543 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5544 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5545 hn_txagg_align_sysctl, "I", 5546 "Applied packet transmission aggregation alignment"); 5547 5548 return 0; 5549 } 5550 5551 static void 5552 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5553 { 5554 int i; 5555 5556 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5557 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5558 } 5559 5560 static void 5561 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5562 { 5563 if_t ifp = sc->hn_ifp; 5564 u_int hw_tsomax; 5565 int tso_minlen; 5566 5567 HN_LOCK_ASSERT(sc); 5568 5569 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5570 return; 5571 5572 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5573 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5574 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5575 5576 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5577 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5578 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5579 5580 if (tso_maxlen < tso_minlen) 5581 tso_maxlen = tso_minlen; 5582 else if (tso_maxlen > IP_MAXPACKET) 5583 tso_maxlen = IP_MAXPACKET; 5584 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5585 tso_maxlen = sc->hn_ndis_tso_szmax; 5586 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5587 5588 if (hn_xpnt_vf_isready(sc)) { 5589 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) 5590 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); 5591 } 5592 if_sethwtsomax(ifp, hw_tsomax); 5593 if (bootverbose) 5594 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); 5595 } 5596 5597 static void 5598 hn_fixup_tx_data(struct hn_softc *sc) 5599 { 5600 uint64_t csum_assist; 5601 int i; 5602 5603 hn_set_chim_size(sc, sc->hn_chim_szmax); 5604 if (hn_tx_chimney_size > 0 && 5605 
hn_tx_chimney_size < sc->hn_chim_szmax) 5606 hn_set_chim_size(sc, hn_tx_chimney_size); 5607 5608 csum_assist = 0; 5609 if (sc->hn_caps & HN_CAP_IPCS) 5610 csum_assist |= CSUM_IP; 5611 if (sc->hn_caps & HN_CAP_TCP4CS) 5612 csum_assist |= CSUM_IP_TCP; 5613 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5614 csum_assist |= CSUM_IP_UDP; 5615 if (sc->hn_caps & HN_CAP_TCP6CS) 5616 csum_assist |= CSUM_IP6_TCP; 5617 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5618 csum_assist |= CSUM_IP6_UDP; 5619 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5620 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5621 5622 if (sc->hn_caps & HN_CAP_HASHVAL) { 5623 /* 5624 * Support HASHVAL pktinfo on TX path. 5625 */ 5626 if (bootverbose) 5627 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5628 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5629 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5630 } 5631 } 5632 5633 static void 5634 hn_fixup_rx_data(struct hn_softc *sc) 5635 { 5636 5637 if (sc->hn_caps & HN_CAP_UDPHASH) { 5638 int i; 5639 5640 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5641 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5642 } 5643 } 5644 5645 static void 5646 hn_destroy_tx_data(struct hn_softc *sc) 5647 { 5648 int i; 5649 5650 if (sc->hn_chim != NULL) { 5651 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5652 contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); 5653 } else { 5654 device_printf(sc->hn_dev, 5655 "chimney sending buffer is referenced"); 5656 } 5657 sc->hn_chim = NULL; 5658 } 5659 5660 if (sc->hn_tx_ring_cnt == 0) 5661 return; 5662 5663 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5664 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5665 5666 free(sc->hn_tx_ring, M_DEVBUF); 5667 sc->hn_tx_ring = NULL; 5668 5669 sc->hn_tx_ring_cnt = 0; 5670 sc->hn_tx_ring_inuse = 0; 5671 } 5672 5673 #ifdef HN_IFSTART_SUPPORT 5674 5675 static void 5676 hn_start_taskfunc(void *xtxr, int pending __unused) 5677 { 5678 struct hn_tx_ring *txr = xtxr; 5679 5680 mtx_lock(&txr->hn_tx_lock); 5681 hn_start_locked(txr, 0); 5682 mtx_unlock(&txr->hn_tx_lock); 5683 } 5684 5685 static int 5686 hn_start_locked(struct hn_tx_ring *txr, int len) 5687 { 5688 struct hn_softc *sc = txr->hn_sc; 5689 if_t ifp = sc->hn_ifp; 5690 int sched = 0; 5691 5692 KASSERT(hn_use_if_start, 5693 ("hn_start_locked is called, when if_start is disabled")); 5694 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5695 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5696 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5697 5698 if (__predict_false(txr->hn_suspended)) 5699 return (0); 5700 5701 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5702 IFF_DRV_RUNNING) 5703 return (0); 5704 5705 while (!if_sendq_empty(ifp)) { 5706 struct hn_txdesc *txd; 5707 struct mbuf *m_head; 5708 int error; 5709 5710 m_head = if_dequeue(ifp); 5711 if (m_head == NULL) 5712 break; 5713 5714 if (len > 0 && m_head->m_pkthdr.len > len) { 5715 /* 5716 * This sending could be time consuming; let callers 5717 * dispatch this packet sending (and sending of any 5718 * following up packets) to tx taskqueue. 
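 * The mbuf is put back at the head of the send queue and sched
 * is set so that the caller requeues hn_tx_task.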
5719 */ 5720 if_sendq_prepend(ifp, m_head); 5721 sched = 1; 5722 break; 5723 } 5724 5725 #if defined(INET6) || defined(INET) 5726 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5727 m_head = hn_tso_fixup(m_head); 5728 if (__predict_false(m_head == NULL)) { 5729 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5730 continue; 5731 } 5732 } else if (m_head->m_pkthdr.csum_flags & 5733 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5734 m_head = hn_set_hlen(m_head); 5735 if (__predict_false(m_head == NULL)) { 5736 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5737 continue; 5738 } 5739 } 5740 #endif 5741 5742 txd = hn_txdesc_get(txr); 5743 if (txd == NULL) { 5744 txr->hn_no_txdescs++; 5745 if_sendq_prepend(ifp, m_head); 5746 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); 5747 break; 5748 } 5749 5750 error = hn_encap(ifp, txr, txd, &m_head); 5751 if (error) { 5752 /* Both txd and m_head are freed */ 5753 KASSERT(txr->hn_agg_txd == NULL, 5754 ("encap failed w/ pending aggregating txdesc")); 5755 continue; 5756 } 5757 5758 if (txr->hn_agg_pktleft == 0) { 5759 if (txr->hn_agg_txd != NULL) { 5760 KASSERT(m_head == NULL, 5761 ("pending mbuf for aggregating txdesc")); 5762 error = hn_flush_txagg(ifp, txr); 5763 if (__predict_false(error)) { 5764 if_setdrvflagbits(ifp, 5765 IFF_DRV_OACTIVE, 0); 5766 break; 5767 } 5768 } else { 5769 KASSERT(m_head != NULL, ("mbuf was freed")); 5770 error = hn_txpkt(ifp, txr, txd); 5771 if (__predict_false(error)) { 5772 /* txd is freed, but m_head is not */ 5773 if_sendq_prepend(ifp, m_head); 5774 if_setdrvflagbits(ifp, 5775 IFF_DRV_OACTIVE, 0); 5776 break; 5777 } 5778 } 5779 } 5780 #ifdef INVARIANTS 5781 else { 5782 KASSERT(txr->hn_agg_txd != NULL, 5783 ("no aggregating txdesc")); 5784 KASSERT(m_head == NULL, 5785 ("pending mbuf for aggregating txdesc")); 5786 } 5787 #endif 5788 } 5789 5790 /* Flush pending aggerated transmission. */ 5791 if (txr->hn_agg_txd != NULL) 5792 hn_flush_txagg(ifp, txr); 5793 return (sched); 5794 } 5795 5796 static void 5797 hn_start(if_t ifp) 5798 { 5799 struct hn_softc *sc = if_getsoftc(ifp); 5800 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5801 5802 if (txr->hn_sched_tx) 5803 goto do_sched; 5804 5805 if (mtx_trylock(&txr->hn_tx_lock)) { 5806 int sched; 5807 5808 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5809 mtx_unlock(&txr->hn_tx_lock); 5810 if (!sched) 5811 return; 5812 } 5813 do_sched: 5814 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5815 } 5816 5817 static void 5818 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5819 { 5820 struct hn_tx_ring *txr = xtxr; 5821 5822 mtx_lock(&txr->hn_tx_lock); 5823 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); 5824 hn_start_locked(txr, 0); 5825 mtx_unlock(&txr->hn_tx_lock); 5826 } 5827 5828 static void 5829 hn_start_txeof(struct hn_tx_ring *txr) 5830 { 5831 struct hn_softc *sc = txr->hn_sc; 5832 if_t ifp = sc->hn_ifp; 5833 5834 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5835 5836 if (txr->hn_sched_tx) 5837 goto do_sched; 5838 5839 if (mtx_trylock(&txr->hn_tx_lock)) { 5840 int sched; 5841 5842 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5843 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5844 mtx_unlock(&txr->hn_tx_lock); 5845 if (sched) { 5846 taskqueue_enqueue(txr->hn_tx_taskq, 5847 &txr->hn_tx_task); 5848 } 5849 } else { 5850 do_sched: 5851 /* 5852 * Release the OACTIVE earlier, with the hope, that 5853 * others could catch up. The task will clear the 5854 * flag again with the hn_tx_lock to avoid possible 5855 * races. 
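 * hn_start_txeof_taskfunc() re-clears OACTIVE under hn_tx_lock
 * before restarting transmission.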
5856 */ 5857 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5858 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5859 } 5860 } 5861 5862 #endif /* HN_IFSTART_SUPPORT */ 5863 5864 static int 5865 hn_xmit(struct hn_tx_ring *txr, int len) 5866 { 5867 struct hn_softc *sc = txr->hn_sc; 5868 if_t ifp = sc->hn_ifp; 5869 struct mbuf *m_head; 5870 int sched = 0; 5871 5872 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5873 #ifdef HN_IFSTART_SUPPORT 5874 KASSERT(hn_use_if_start == 0, 5875 ("hn_xmit is called, when if_start is enabled")); 5876 #endif 5877 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5878 5879 if (__predict_false(txr->hn_suspended)) 5880 return (0); 5881 5882 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5883 return (0); 5884 5885 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5886 struct hn_txdesc *txd; 5887 int error; 5888 5889 if (len > 0 && m_head->m_pkthdr.len > len) { 5890 /* 5891 * This sending could be time consuming; let callers 5892 * dispatch this packet sending (and sending of any 5893 * following up packets) to tx taskqueue. 5894 */ 5895 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5896 sched = 1; 5897 break; 5898 } 5899 5900 txd = hn_txdesc_get(txr); 5901 if (txd == NULL) { 5902 txr->hn_no_txdescs++; 5903 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5904 txr->hn_oactive = 1; 5905 break; 5906 } 5907 5908 error = hn_encap(ifp, txr, txd, &m_head); 5909 if (error) { 5910 /* Both txd and m_head are freed; discard */ 5911 KASSERT(txr->hn_agg_txd == NULL, 5912 ("encap failed w/ pending aggregating txdesc")); 5913 drbr_advance(ifp, txr->hn_mbuf_br); 5914 continue; 5915 } 5916 5917 if (txr->hn_agg_pktleft == 0) { 5918 if (txr->hn_agg_txd != NULL) { 5919 KASSERT(m_head == NULL, 5920 ("pending mbuf for aggregating txdesc")); 5921 error = hn_flush_txagg(ifp, txr); 5922 if (__predict_false(error)) { 5923 txr->hn_oactive = 1; 5924 break; 5925 } 5926 } else { 5927 KASSERT(m_head != NULL, ("mbuf was freed")); 5928 error = hn_txpkt(ifp, txr, txd); 5929 if (__predict_false(error)) { 5930 /* txd is freed, but m_head is not */ 5931 drbr_putback(ifp, txr->hn_mbuf_br, 5932 m_head); 5933 txr->hn_oactive = 1; 5934 break; 5935 } 5936 } 5937 } 5938 #ifdef INVARIANTS 5939 else { 5940 KASSERT(txr->hn_agg_txd != NULL, 5941 ("no aggregating txdesc")); 5942 KASSERT(m_head == NULL, 5943 ("pending mbuf for aggregating txdesc")); 5944 } 5945 #endif 5946 5947 /* Sent */ 5948 drbr_advance(ifp, txr->hn_mbuf_br); 5949 } 5950 5951 /* Flush pending aggerated transmission. */ 5952 if (txr->hn_agg_txd != NULL) 5953 hn_flush_txagg(ifp, txr); 5954 return (sched); 5955 } 5956 5957 static int 5958 hn_transmit(if_t ifp, struct mbuf *m) 5959 { 5960 struct hn_softc *sc = if_getsoftc(ifp); 5961 struct hn_tx_ring *txr; 5962 int error, idx = 0; 5963 5964 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5965 struct rm_priotracker pt; 5966 5967 rm_rlock(&sc->hn_vf_lock, &pt); 5968 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5969 struct mbuf *m_bpf = NULL; 5970 int obytes, omcast; 5971 5972 obytes = m->m_pkthdr.len; 5973 omcast = (m->m_flags & M_MCAST) != 0; 5974 5975 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5976 if (bpf_peers_present_if(ifp)) { 5977 m_bpf = m_copypacket(m, M_NOWAIT); 5978 if (m_bpf == NULL) { 5979 /* 5980 * Failed to grab a shallow 5981 * copy; tap now. 
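 * Tapping before the VF transmit is less accurate (the packet
 * may still fail to go out), but better than not tapping it
 * at all.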
5982 */ 5983 ETHER_BPF_MTAP(ifp, m); 5984 } 5985 } 5986 } else { 5987 ETHER_BPF_MTAP(ifp, m); 5988 } 5989 5990 error = if_transmit(sc->hn_vf_ifp, m); 5991 rm_runlock(&sc->hn_vf_lock, &pt); 5992 5993 if (m_bpf != NULL) { 5994 if (!error) 5995 ETHER_BPF_MTAP(ifp, m_bpf); 5996 m_freem(m_bpf); 5997 } 5998 5999 if (error == ENOBUFS) { 6000 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6001 } else if (error) { 6002 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6003 } else { 6004 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6005 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6006 if (omcast) { 6007 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6008 omcast); 6009 } 6010 } 6011 return (error); 6012 } 6013 rm_runlock(&sc->hn_vf_lock, &pt); 6014 } 6015 6016 #if defined(INET6) || defined(INET) 6017 /* 6018 * Perform TSO packet header fixup or get l2/l3 header length now, 6019 * since packet headers should be cache-hot. 6020 */ 6021 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6022 m = hn_tso_fixup(m); 6023 if (__predict_false(m == NULL)) { 6024 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6025 return EIO; 6026 } 6027 } else if (m->m_pkthdr.csum_flags & 6028 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6029 m = hn_set_hlen(m); 6030 if (__predict_false(m == NULL)) { 6031 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6032 return EIO; 6033 } 6034 } 6035 #endif 6036 6037 /* 6038 * Select the TX ring based on flowid 6039 */ 6040 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6041 #ifdef RSS 6042 uint32_t bid; 6043 6044 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6045 &bid) == 0) 6046 idx = bid % sc->hn_tx_ring_inuse; 6047 else 6048 #endif 6049 { 6050 #if defined(INET6) || defined(INET) 6051 int tcpsyn = 0; 6052 6053 if (m->m_pkthdr.len < 128 && 6054 (m->m_pkthdr.csum_flags & 6055 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6056 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6057 m = hn_check_tcpsyn(m, &tcpsyn); 6058 if (__predict_false(m == NULL)) { 6059 if_inc_counter(ifp, 6060 IFCOUNTER_OERRORS, 1); 6061 return (EIO); 6062 } 6063 } 6064 #else 6065 const int tcpsyn = 0; 6066 #endif 6067 if (tcpsyn) 6068 idx = 0; 6069 else 6070 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6071 } 6072 } 6073 txr = &sc->hn_tx_ring[idx]; 6074 6075 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6076 if (error) { 6077 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6078 return error; 6079 } 6080 6081 if (txr->hn_oactive) 6082 return 0; 6083 6084 if (txr->hn_sched_tx) 6085 goto do_sched; 6086 6087 if (mtx_trylock(&txr->hn_tx_lock)) { 6088 int sched; 6089 6090 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6091 mtx_unlock(&txr->hn_tx_lock); 6092 if (!sched) 6093 return 0; 6094 } 6095 do_sched: 6096 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6097 return 0; 6098 } 6099 6100 static void 6101 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6102 { 6103 struct mbuf *m; 6104 6105 mtx_lock(&txr->hn_tx_lock); 6106 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6107 m_freem(m); 6108 mtx_unlock(&txr->hn_tx_lock); 6109 } 6110 6111 static void 6112 hn_xmit_qflush(if_t ifp) 6113 { 6114 struct hn_softc *sc = if_getsoftc(ifp); 6115 struct rm_priotracker pt; 6116 int i; 6117 6118 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6119 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6120 if_qflush(ifp); 6121 6122 rm_rlock(&sc->hn_vf_lock, &pt); 6123 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6124 if_qflush(sc->hn_vf_ifp); 6125 rm_runlock(&sc->hn_vf_lock, &pt); 6126 } 6127 6128 static void 6129 hn_xmit_txeof(struct hn_tx_ring *txr) 6130 { 6131 6132 if 
(txr->hn_sched_tx) 6133 goto do_sched; 6134 6135 if (mtx_trylock(&txr->hn_tx_lock)) { 6136 int sched; 6137 6138 txr->hn_oactive = 0; 6139 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6140 mtx_unlock(&txr->hn_tx_lock); 6141 if (sched) { 6142 taskqueue_enqueue(txr->hn_tx_taskq, 6143 &txr->hn_tx_task); 6144 } 6145 } else { 6146 do_sched: 6147 /* 6148 * Release the oactive earlier, with the hope, that 6149 * others could catch up. The task will clear the 6150 * oactive again with the hn_tx_lock to avoid possible 6151 * races. 6152 */ 6153 txr->hn_oactive = 0; 6154 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6155 } 6156 } 6157 6158 static void 6159 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6160 { 6161 struct hn_tx_ring *txr = xtxr; 6162 6163 mtx_lock(&txr->hn_tx_lock); 6164 hn_xmit(txr, 0); 6165 mtx_unlock(&txr->hn_tx_lock); 6166 } 6167 6168 static void 6169 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6170 { 6171 struct hn_tx_ring *txr = xtxr; 6172 6173 mtx_lock(&txr->hn_tx_lock); 6174 txr->hn_oactive = 0; 6175 hn_xmit(txr, 0); 6176 mtx_unlock(&txr->hn_tx_lock); 6177 } 6178 6179 static int 6180 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6181 { 6182 struct vmbus_chan_br cbr; 6183 struct hn_rx_ring *rxr; 6184 struct hn_tx_ring *txr = NULL; 6185 int idx, error; 6186 6187 idx = vmbus_chan_subidx(chan); 6188 6189 /* 6190 * Link this channel to RX/TX ring. 6191 */ 6192 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6193 ("invalid channel index %d, should > 0 && < %d", 6194 idx, sc->hn_rx_ring_inuse)); 6195 rxr = &sc->hn_rx_ring[idx]; 6196 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6197 ("RX ring %d already attached", idx)); 6198 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6199 rxr->hn_chan = chan; 6200 6201 if (bootverbose) { 6202 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6203 idx, vmbus_chan_id(chan)); 6204 } 6205 6206 if (idx < sc->hn_tx_ring_inuse) { 6207 txr = &sc->hn_tx_ring[idx]; 6208 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6209 ("TX ring %d already attached", idx)); 6210 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6211 6212 txr->hn_chan = chan; 6213 if (bootverbose) { 6214 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6215 idx, vmbus_chan_id(chan)); 6216 } 6217 } 6218 6219 /* Bind this channel to a proper CPU. */ 6220 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6221 6222 /* 6223 * Open this channel 6224 */ 6225 cbr.cbr = rxr->hn_br; 6226 cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); 6227 cbr.cbr_txsz = HN_TXBR_SIZE; 6228 cbr.cbr_rxsz = HN_RXBR_SIZE; 6229 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6230 if (error) { 6231 if (error == EISCONN) { 6232 if_printf(sc->hn_ifp, "bufring is connected after " 6233 "chan%u open failure\n", vmbus_chan_id(chan)); 6234 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6235 } else { 6236 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6237 vmbus_chan_id(chan), error); 6238 } 6239 } 6240 return (error); 6241 } 6242 6243 static void 6244 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6245 { 6246 struct hn_rx_ring *rxr; 6247 int idx, error; 6248 6249 idx = vmbus_chan_subidx(chan); 6250 6251 /* 6252 * Link this channel to RX/TX ring. 
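	 * (Here this actually unlinks the channel from the RX/TX ring
	 * selected by its sub-channel index; cf. hn_chan_attach().)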
 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 and < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}

static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
	 * table entries.
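	 * The RX ring count offered by the host caps how many channels
	 * are worth requesting below.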
6378 */ 6379 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6380 if (error) { 6381 /* No RSS; this is benign. */ 6382 *nsubch = 0; 6383 return (0); 6384 } 6385 if (bootverbose) { 6386 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6387 rxr_cnt, nchan); 6388 } 6389 6390 if (nchan > rxr_cnt) 6391 nchan = rxr_cnt; 6392 if (nchan == 1) { 6393 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6394 *nsubch = 0; 6395 return (0); 6396 } 6397 6398 /* 6399 * Allocate sub-channels from NVS. 6400 */ 6401 *nsubch = nchan - 1; 6402 error = hn_nvs_alloc_subchans(sc, nsubch); 6403 if (error || *nsubch == 0) { 6404 /* Failed to allocate sub-channels. */ 6405 *nsubch = 0; 6406 return (0); 6407 } 6408 6409 /* 6410 * Wait for all sub-channels to become ready before moving on. 6411 */ 6412 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6413 vmbus_subchan_rel(subchans, *nsubch); 6414 return (0); 6415 } 6416 6417 static bool 6418 hn_synth_attachable(const struct hn_softc *sc) 6419 { 6420 int i; 6421 6422 if (sc->hn_flags & HN_FLAG_ERRORS) 6423 return (false); 6424 6425 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6426 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6427 6428 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6429 return (false); 6430 } 6431 return (true); 6432 } 6433 6434 /* 6435 * Make sure that the RX filter is zero after the successful 6436 * RNDIS initialization. 6437 * 6438 * NOTE: 6439 * Under certain conditions on certain versions of Hyper-V, 6440 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6441 * after the successful RNDIS initialization, which breaks 6442 * the assumption of any following code (well, it breaks the 6443 * RNDIS API contract actually). Clear the RNDIS rxfilter 6444 * explicitly, drain packets sneaking through, and drain the 6445 * interrupt taskqueues scheduled due to the stealth packets. 6446 */ 6447 static void 6448 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6449 { 6450 6451 hn_disable_rx(sc); 6452 hn_drain_rxtx(sc, nchan); 6453 } 6454 6455 static int 6456 hn_synth_attach(struct hn_softc *sc, int mtu) 6457 { 6458 #define ATTACHED_NVS 0x0002 6459 #define ATTACHED_RNDIS 0x0004 6460 6461 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6462 int error, nsubch, nchan = 1, i, rndis_inited; 6463 uint32_t old_caps, attached = 0; 6464 6465 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6466 ("synthetic parts were attached")); 6467 6468 if (!hn_synth_attachable(sc)) 6469 return (ENXIO); 6470 6471 /* Save capabilities for later verification. */ 6472 old_caps = sc->hn_caps; 6473 sc->hn_caps = 0; 6474 6475 /* Clear RSS stuffs. */ 6476 sc->hn_rss_ind_size = 0; 6477 sc->hn_rss_hash = 0; 6478 sc->hn_rss_hcap = 0; 6479 6480 /* 6481 * Attach the primary channel _before_ attaching NVS and RNDIS. 6482 */ 6483 error = hn_chan_attach(sc, sc->hn_prichan); 6484 if (error) 6485 goto failed; 6486 6487 /* 6488 * Attach NVS. 6489 */ 6490 error = hn_nvs_attach(sc, mtu); 6491 if (error) 6492 goto failed; 6493 attached |= ATTACHED_NVS; 6494 6495 /* 6496 * Attach RNDIS _after_ NVS is attached. 6497 */ 6498 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6499 if (rndis_inited) 6500 attached |= ATTACHED_RNDIS; 6501 if (error) 6502 goto failed; 6503 6504 /* 6505 * Make sure capabilities are not changed. 
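	 * hn_caps was cleared above and is re-derived during NVS/RNDIS
	 * attach; on a re-attach (device_is_attached()) any mismatch
	 * with the saved old_caps is treated as fatal.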
6506 */ 6507 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6508 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6509 old_caps, sc->hn_caps); 6510 error = ENXIO; 6511 goto failed; 6512 } 6513 6514 /* 6515 * Allocate sub-channels for multi-TX/RX rings. 6516 * 6517 * NOTE: 6518 * The # of RX rings that can be used is equivalent to the # of 6519 * channels to be requested. 6520 */ 6521 nsubch = sc->hn_rx_ring_cnt - 1; 6522 error = hn_synth_alloc_subchans(sc, &nsubch); 6523 if (error) 6524 goto failed; 6525 /* NOTE: _Full_ synthetic parts detach is required now. */ 6526 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6527 6528 /* 6529 * Set the # of TX/RX rings that could be used according to 6530 * the # of channels that NVS offered. 6531 */ 6532 nchan = nsubch + 1; 6533 hn_set_ring_inuse(sc, nchan); 6534 if (nchan == 1) { 6535 /* Only the primary channel can be used; done */ 6536 goto back; 6537 } 6538 6539 /* 6540 * Attach the sub-channels. 6541 * 6542 * NOTE: hn_set_ring_inuse() _must_ have been called. 6543 */ 6544 error = hn_attach_subchans(sc); 6545 if (error) 6546 goto failed; 6547 6548 /* 6549 * Configure RSS key and indirect table _after_ all sub-channels 6550 * are attached. 6551 */ 6552 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6553 /* 6554 * RSS key is not set yet; set it to the default RSS key. 6555 */ 6556 if (bootverbose) 6557 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6558 #ifdef RSS 6559 rss_getkey(rss->rss_key); 6560 #else 6561 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6562 #endif 6563 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6564 } 6565 6566 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6567 /* 6568 * RSS indirect table is not set yet; set it up in round- 6569 * robin fashion. 6570 */ 6571 if (bootverbose) { 6572 if_printf(sc->hn_ifp, "setup default RSS indirect " 6573 "table\n"); 6574 } 6575 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6576 uint32_t subidx; 6577 6578 #ifdef RSS 6579 subidx = rss_get_indirection_to_bucket(i); 6580 #else 6581 subidx = i; 6582 #endif 6583 rss->rss_ind[i] = subidx % nchan; 6584 } 6585 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6586 } else { 6587 /* 6588 * # of usable channels may be changed, so we have to 6589 * make sure that all entries in RSS indirect table 6590 * are valid. 6591 * 6592 * NOTE: hn_set_ring_inuse() _must_ have been called. 6593 */ 6594 hn_rss_ind_fixup(sc); 6595 } 6596 6597 sc->hn_rss_hash = sc->hn_rss_hcap; 6598 if ((sc->hn_flags & HN_FLAG_RXVF) || 6599 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6600 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6601 hn_vf_rss_fixup(sc, false); 6602 } 6603 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6604 if (error) 6605 goto failed; 6606 back: 6607 /* 6608 * Fixup transmission aggregation setup. 6609 */ 6610 hn_set_txagg(sc); 6611 hn_rndis_init_fixat(sc, nchan); 6612 return (0); 6613 6614 failed: 6615 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6616 hn_rndis_init_fixat(sc, nchan); 6617 hn_synth_detach(sc); 6618 } else { 6619 if (attached & ATTACHED_RNDIS) { 6620 hn_rndis_init_fixat(sc, nchan); 6621 hn_rndis_detach(sc); 6622 } 6623 if (attached & ATTACHED_NVS) 6624 hn_nvs_detach(sc); 6625 hn_chan_detach(sc, sc->hn_prichan); 6626 /* Restore old capabilities. */ 6627 sc->hn_caps = old_caps; 6628 } 6629 return (error); 6630 6631 #undef ATTACHED_RNDIS 6632 #undef ATTACHED_NVS 6633 } 6634 6635 /* 6636 * NOTE: 6637 * The interface must have been suspended though hn_suspend(), before 6638 * this function get called. 
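 * Otherwise TX/RX could still be active on the channels while RNDIS,
 * NVS and the GPADLs below are being torn down.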
6639 */ 6640 static void 6641 hn_synth_detach(struct hn_softc *sc) 6642 { 6643 6644 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6645 ("synthetic parts were not attached")); 6646 6647 /* Detach the RNDIS first. */ 6648 hn_rndis_detach(sc); 6649 6650 /* Detach NVS. */ 6651 hn_nvs_detach(sc); 6652 6653 /* Detach all of the channels. */ 6654 hn_detach_allchans(sc); 6655 6656 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6657 /* 6658 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6659 */ 6660 int error; 6661 6662 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6663 sc->hn_rxbuf_gpadl); 6664 if (error) { 6665 if_printf(sc->hn_ifp, 6666 "rxbuf gpadl disconn failed: %d\n", error); 6667 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6668 } 6669 sc->hn_rxbuf_gpadl = 0; 6670 } 6671 6672 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6673 /* 6674 * Host is post-Win2016, disconnect chimney sending buffer from 6675 * primary channel here. 6676 */ 6677 int error; 6678 6679 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6680 sc->hn_chim_gpadl); 6681 if (error) { 6682 if_printf(sc->hn_ifp, 6683 "chim gpadl disconn failed: %d\n", error); 6684 sc->hn_flags |= HN_FLAG_CHIM_REF; 6685 } 6686 sc->hn_chim_gpadl = 0; 6687 } 6688 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6689 } 6690 6691 static void 6692 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6693 { 6694 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6695 ("invalid ring count %d", ring_cnt)); 6696 6697 if (sc->hn_tx_ring_cnt > ring_cnt) 6698 sc->hn_tx_ring_inuse = ring_cnt; 6699 else 6700 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6701 sc->hn_rx_ring_inuse = ring_cnt; 6702 6703 #ifdef RSS 6704 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6705 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6706 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6707 rss_getnumbuckets()); 6708 } 6709 #endif 6710 6711 if (bootverbose) { 6712 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6713 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6714 } 6715 } 6716 6717 static void 6718 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6719 { 6720 6721 /* 6722 * NOTE: 6723 * The TX bufring will not be drained by the hypervisor, 6724 * if the primary channel is revoked. 6725 */ 6726 while (!vmbus_chan_rx_empty(chan) || 6727 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6728 !vmbus_chan_tx_empty(chan))) 6729 pause("waitch", 1); 6730 vmbus_chan_intr_drain(chan); 6731 } 6732 6733 static void 6734 hn_disable_rx(struct hn_softc *sc) 6735 { 6736 6737 /* 6738 * Disable RX by clearing RX filter forcefully. 6739 */ 6740 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6741 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6742 6743 /* 6744 * Give RNDIS enough time to flush all pending data packets. 6745 */ 6746 pause("waitrx", (200 * hz) / 1000); 6747 } 6748 6749 /* 6750 * NOTE: 6751 * RX/TX _must_ have been suspended/disabled, before this function 6752 * is called. 6753 */ 6754 static void 6755 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6756 { 6757 struct vmbus_channel **subch = NULL; 6758 int nsubch; 6759 6760 /* 6761 * Drain RX/TX bufrings and interrupts. 
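	 * Sub-channels, if any, are drained first; the primary channel
	 * is drained last.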
6762 */ 6763 nsubch = nchan - 1; 6764 if (nsubch > 0) 6765 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6766 6767 if (subch != NULL) { 6768 int i; 6769 6770 for (i = 0; i < nsubch; ++i) 6771 hn_chan_drain(sc, subch[i]); 6772 } 6773 hn_chan_drain(sc, sc->hn_prichan); 6774 6775 if (subch != NULL) 6776 vmbus_subchan_rel(subch, nsubch); 6777 } 6778 6779 static void 6780 hn_suspend_data(struct hn_softc *sc) 6781 { 6782 struct hn_tx_ring *txr; 6783 int i; 6784 6785 HN_LOCK_ASSERT(sc); 6786 6787 /* 6788 * Suspend TX. 6789 */ 6790 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6791 txr = &sc->hn_tx_ring[i]; 6792 6793 mtx_lock(&txr->hn_tx_lock); 6794 txr->hn_suspended = 1; 6795 mtx_unlock(&txr->hn_tx_lock); 6796 /* No one is able send more packets now. */ 6797 6798 /* 6799 * Wait for all pending sends to finish. 6800 * 6801 * NOTE: 6802 * We will _not_ receive all pending send-done, if the 6803 * primary channel is revoked. 6804 */ 6805 while (hn_tx_ring_pending(txr) && 6806 !vmbus_chan_is_revoked(sc->hn_prichan)) 6807 pause("hnwtx", 1 /* 1 tick */); 6808 } 6809 6810 /* 6811 * Disable RX. 6812 */ 6813 hn_disable_rx(sc); 6814 6815 /* 6816 * Drain RX/TX. 6817 */ 6818 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6819 6820 /* 6821 * Drain any pending TX tasks. 6822 * 6823 * NOTE: 6824 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6825 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6826 */ 6827 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6828 txr = &sc->hn_tx_ring[i]; 6829 6830 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6831 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6832 } 6833 } 6834 6835 static void 6836 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6837 { 6838 6839 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6840 } 6841 6842 static void 6843 hn_suspend_mgmt(struct hn_softc *sc) 6844 { 6845 struct task task; 6846 6847 HN_LOCK_ASSERT(sc); 6848 6849 /* 6850 * Make sure that hn_mgmt_taskq0 can nolonger be accessed 6851 * through hn_mgmt_taskq. 6852 */ 6853 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6854 vmbus_chan_run_task(sc->hn_prichan, &task); 6855 6856 /* 6857 * Make sure that all pending management tasks are completed. 6858 */ 6859 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6860 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6861 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6862 } 6863 6864 static void 6865 hn_suspend(struct hn_softc *sc) 6866 { 6867 6868 /* Disable polling. */ 6869 hn_polling(sc, 0); 6870 6871 /* 6872 * If the non-transparent mode VF is activated, the synthetic 6873 * device is receiving packets, so the data path of the 6874 * synthetic device must be suspended. 6875 */ 6876 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 6877 (sc->hn_flags & HN_FLAG_RXVF)) 6878 hn_suspend_data(sc); 6879 hn_suspend_mgmt(sc); 6880 } 6881 6882 static void 6883 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6884 { 6885 int i; 6886 6887 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6888 ("invalid TX ring count %d", tx_ring_cnt)); 6889 6890 for (i = 0; i < tx_ring_cnt; ++i) { 6891 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6892 6893 mtx_lock(&txr->hn_tx_lock); 6894 txr->hn_suspended = 0; 6895 mtx_unlock(&txr->hn_tx_lock); 6896 } 6897 } 6898 6899 static void 6900 hn_resume_data(struct hn_softc *sc) 6901 { 6902 int i; 6903 6904 HN_LOCK_ASSERT(sc); 6905 6906 /* 6907 * Re-enable RX. 
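	 * This reprograms the RNDIS RX filter, which hn_disable_rx()
	 * forced to NDIS_PACKET_TYPE_NONE during suspend.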
 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which is more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device has to receive packets, so the data path of the
	 * synthetic device must be resumed.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_resume_data(sc);

	/*
	 * Don't resume link status change if VF is attached/activated.
	 * - In the non-transparent VF mode, the synthetic device marks
	 *   link down until the VF is deactivated; i.e. VF is down.
	 * - In transparent VF mode, VF's media status is used until
	 *   the VF is detached.
	 */
	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
		hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
	 */
	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
		hn_polling(sc, sc->hn_pollhz);
}

static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
	case RNDIS_STATUS_LINK_SPEED_CHANGE:
		/* Not really useful; ignore.
*/ 7015 break; 7016 7017 case RNDIS_STATUS_NETWORK_CHANGE: 7018 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7019 if (dlen < ofs + msg->rm_stbuflen || 7020 msg->rm_stbuflen < sizeof(uint32_t)) { 7021 if_printf(sc->hn_ifp, "network changed\n"); 7022 } else { 7023 uint32_t change; 7024 7025 memcpy(&change, ((const uint8_t *)msg) + ofs, 7026 sizeof(change)); 7027 if_printf(sc->hn_ifp, "network changed, change %u\n", 7028 change); 7029 } 7030 hn_change_network(sc); 7031 break; 7032 7033 default: 7034 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7035 msg->rm_status); 7036 break; 7037 } 7038 } 7039 7040 static int 7041 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7042 { 7043 const struct rndis_pktinfo *pi = info_data; 7044 uint32_t mask = 0; 7045 7046 while (info_dlen != 0) { 7047 const void *data; 7048 uint32_t dlen; 7049 7050 if (__predict_false(info_dlen < sizeof(*pi))) 7051 return (EINVAL); 7052 if (__predict_false(info_dlen < pi->rm_size)) 7053 return (EINVAL); 7054 info_dlen -= pi->rm_size; 7055 7056 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7057 return (EINVAL); 7058 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7059 return (EINVAL); 7060 dlen = pi->rm_size - pi->rm_pktinfooffset; 7061 data = pi->rm_data; 7062 7063 if (pi->rm_internal == 1) { 7064 switch (pi->rm_type) { 7065 case NDIS_PKTINFO_IT_PKTINFO_ID: 7066 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7067 return (EINVAL); 7068 info->pktinfo_id = 7069 (const struct packet_info_id *)data; 7070 mask |= HN_RXINFO_PKTINFO_ID; 7071 break; 7072 7073 default: 7074 goto next; 7075 } 7076 } else { 7077 switch (pi->rm_type) { 7078 case NDIS_PKTINFO_TYPE_VLAN: 7079 if (__predict_false(dlen 7080 < NDIS_VLAN_INFO_SIZE)) 7081 return (EINVAL); 7082 info->vlan_info = (const uint32_t *)data; 7083 mask |= HN_RXINFO_VLAN; 7084 break; 7085 7086 case NDIS_PKTINFO_TYPE_CSUM: 7087 if (__predict_false(dlen 7088 < NDIS_RXCSUM_INFO_SIZE)) 7089 return (EINVAL); 7090 info->csum_info = (const uint32_t *)data; 7091 mask |= HN_RXINFO_CSUM; 7092 break; 7093 7094 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7095 if (__predict_false(dlen 7096 < HN_NDIS_HASH_VALUE_SIZE)) 7097 return (EINVAL); 7098 info->hash_value = (const uint32_t *)data; 7099 mask |= HN_RXINFO_HASHVAL; 7100 break; 7101 7102 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7103 if (__predict_false(dlen 7104 < HN_NDIS_HASH_INFO_SIZE)) 7105 return (EINVAL); 7106 info->hash_info = (const uint32_t *)data; 7107 mask |= HN_RXINFO_HASHINF; 7108 break; 7109 7110 default: 7111 goto next; 7112 } 7113 } 7114 7115 if (mask == HN_RXINFO_ALL) { 7116 /* All found; done */ 7117 break; 7118 } 7119 next: 7120 pi = (const struct rndis_pktinfo *) 7121 ((const uint8_t *)pi + pi->rm_size); 7122 } 7123 7124 /* 7125 * Final fixup. 7126 * - If there is no hash value, invalidate the hash info. 
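	 *   (The hash type carried in hash_info is not useful without
	 *   the accompanying hash value.)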
 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = NULL;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

static __inline void
hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
    uint32_t len, struct hn_rxinfo *info)
{
	uint32_t cnt = rxr->rsc.cnt;

	if (cnt) {
		rxr->rsc.pktlen += len;
	} else {
		rxr->rsc.vlan_info = info->vlan_info;
		rxr->rsc.csum_info = info->csum_info;
		rxr->rsc.hash_info = info->hash_info;
		rxr->rsc.hash_value = info->hash_value;
		rxr->rsc.pktlen = len;
	}

	rxr->rsc.frag_data[cnt] = data;
	rxr->rsc.frag_len[cnt] = len;
	rxr->rsc.cnt++;
}

static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;
	bool rsc_more = false;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
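	 * OOB data is not normally generated by Hyper-V; it is only
	 * validated for overlaps below and never consumed.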
7238 */ 7239 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7240 int oob_off, oob_len; 7241 7242 if_printf(rxr->hn_ifp, "got oobdata\n"); 7243 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7244 oob_len = pkt->rm_oobdatalen; 7245 7246 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7247 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7248 "oob overflow, msglen %u, oob abs %d len %d\n", 7249 pkt->rm_len, oob_off, oob_len); 7250 return; 7251 } 7252 7253 /* 7254 * Check against data. 7255 */ 7256 if (hn_rndis_check_overlap(oob_off, oob_len, 7257 data_off, data_len)) { 7258 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7259 "oob overlaps data, oob abs %d len %d, " 7260 "data abs %d len %d\n", 7261 oob_off, oob_len, data_off, data_len); 7262 return; 7263 } 7264 7265 /* 7266 * Check against pktinfo. 7267 */ 7268 if (pktinfo_len != 0 && 7269 hn_rndis_check_overlap(oob_off, oob_len, 7270 pktinfo_off, pktinfo_len)) { 7271 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7272 "oob overlaps pktinfo, oob abs %d len %d, " 7273 "pktinfo abs %d len %d\n", 7274 oob_off, oob_len, pktinfo_off, pktinfo_len); 7275 return; 7276 } 7277 } 7278 7279 /* 7280 * Check per-packet-info coverage and find useful per-packet-info. 7281 */ 7282 info.vlan_info = NULL; 7283 info.csum_info = NULL; 7284 info.hash_info = NULL; 7285 info.pktinfo_id = NULL; 7286 7287 if (__predict_true(pktinfo_len != 0)) { 7288 bool overlap; 7289 int error; 7290 7291 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7292 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7293 "pktinfo overflow, msglen %u, " 7294 "pktinfo abs %d len %d\n", 7295 pkt->rm_len, pktinfo_off, pktinfo_len); 7296 return; 7297 } 7298 7299 /* 7300 * Check packet info coverage. 7301 */ 7302 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7303 data_off, data_len); 7304 if (__predict_false(overlap)) { 7305 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7306 "pktinfo overlap data, pktinfo abs %d len %d, " 7307 "data abs %d len %d\n", 7308 pktinfo_off, pktinfo_len, data_off, data_len); 7309 return; 7310 } 7311 7312 /* 7313 * Find useful per-packet-info. 
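	 * hn_rndis_rxinfo() extracts the VLAN, checksum, hash value/info
	 * and RSC packet-info id fields, whichever are present.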
7314 */ 7315 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7316 pktinfo_len, &info); 7317 if (__predict_false(error)) { 7318 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7319 "pktinfo\n"); 7320 return; 7321 } 7322 } 7323 7324 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7325 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7326 "data overflow, msglen %u, data abs %d len %d\n", 7327 pkt->rm_len, data_off, data_len); 7328 return; 7329 } 7330 7331 /* Identify RSC fragments, drop invalid packets */ 7332 if ((info.pktinfo_id != NULL) && 7333 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7334 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7335 rxr->rsc.cnt = 0; 7336 rxr->hn_rsc_pkts++; 7337 } else if (rxr->rsc.cnt == 0) 7338 goto drop; 7339 7340 rsc_more = true; 7341 7342 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7343 rsc_more = false; 7344 7345 if (rsc_more && rxr->rsc.is_last) 7346 goto drop; 7347 } else { 7348 rxr->rsc.cnt = 0; 7349 } 7350 7351 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7352 goto drop; 7353 7354 /* Store data in per rx ring structure */ 7355 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7356 data_len, &info); 7357 7358 if (rsc_more) 7359 return; 7360 7361 hn_rxpkt(rxr); 7362 rxr->rsc.cnt = 0; 7363 return; 7364 drop: 7365 rxr->hn_rsc_drop++; 7366 return; 7367 } 7368 7369 static __inline void 7370 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7371 { 7372 const struct rndis_msghdr *hdr; 7373 7374 if (__predict_false(dlen < sizeof(*hdr))) { 7375 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7376 return; 7377 } 7378 hdr = data; 7379 7380 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7381 /* Hot data path. */ 7382 hn_rndis_rx_data(rxr, data, dlen); 7383 /* Done! */ 7384 return; 7385 } 7386 7387 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7388 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); 7389 else 7390 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); 7391 } 7392 7393 static void 7394 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7395 { 7396 const struct hn_nvs_hdr *hdr; 7397 7398 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7399 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7400 return; 7401 } 7402 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7403 7404 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7405 /* Useless; ignore */ 7406 return; 7407 } 7408 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7409 } 7410 7411 static void 7412 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7413 const struct vmbus_chanpkt_hdr *pkt) 7414 { 7415 struct hn_nvs_sendctx *sndc; 7416 7417 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7418 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7419 VMBUS_CHANPKT_DATALEN(pkt)); 7420 /* 7421 * NOTE: 7422 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7423 * its callback. 
7424 */ 7425 } 7426 7427 static void 7428 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7429 const struct vmbus_chanpkt_hdr *pkthdr) 7430 { 7431 struct epoch_tracker et; 7432 const struct vmbus_chanpkt_rxbuf *pkt; 7433 const struct hn_nvs_hdr *nvs_hdr; 7434 int count, i, hlen; 7435 7436 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7437 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7438 return; 7439 } 7440 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7441 7442 /* Make sure that this is a RNDIS message. */ 7443 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7444 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7445 nvs_hdr->nvs_type); 7446 return; 7447 } 7448 7449 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7450 if (__predict_false(hlen < sizeof(*pkt))) { 7451 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7452 return; 7453 } 7454 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7455 7456 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7457 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7458 pkt->cp_rxbuf_id); 7459 return; 7460 } 7461 7462 count = pkt->cp_rxbuf_cnt; 7463 if (__predict_false(hlen < 7464 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7465 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7466 return; 7467 } 7468 7469 NET_EPOCH_ENTER(et); 7470 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7471 for (i = 0; i < count; ++i) { 7472 int ofs, len; 7473 7474 ofs = pkt->cp_rxbuf[i].rb_ofs; 7475 len = pkt->cp_rxbuf[i].rb_len; 7476 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7477 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7478 "ofs %d, len %d\n", i, ofs, len); 7479 continue; 7480 } 7481 7482 rxr->rsc.is_last = (i == (count - 1)); 7483 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7484 } 7485 NET_EPOCH_EXIT(et); 7486 7487 /* 7488 * Ack the consumed RXBUF associated w/ this channel packet, 7489 * so that this RXBUF can be recycled by the hypervisor. 7490 */ 7491 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7492 } 7493 7494 static void 7495 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7496 uint64_t tid) 7497 { 7498 struct hn_nvs_rndis_ack ack; 7499 int retries, error; 7500 7501 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7502 ack.nvs_status = HN_NVS_STATUS_OK; 7503 7504 retries = 0; 7505 again: 7506 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7507 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7508 if (__predict_false(error == EAGAIN)) { 7509 /* 7510 * NOTE: 7511 * This should _not_ happen in real world, since the 7512 * consumption of the TX bufring from the TX path is 7513 * controlled. 7514 */ 7515 if (rxr->hn_ack_failed == 0) 7516 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7517 rxr->hn_ack_failed++; 7518 retries++; 7519 if (retries < 10) { 7520 DELAY(100); 7521 goto again; 7522 } 7523 /* RXBUF leaks! */ 7524 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7525 } 7526 } 7527 7528 static void 7529 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7530 { 7531 struct hn_rx_ring *rxr = xrxr; 7532 struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); 7533 7534 for (;;) { 7535 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7536 int error, pktlen; 7537 7538 pktlen = rxr->hn_pktbuf_len; 7539 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7540 if (__predict_false(error == ENOBUFS)) { 7541 void *nbuf; 7542 int nlen; 7543 7544 /* 7545 * Expand channel packet buffer. 
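			 * The buffer is grown by doubling until it can
			 * hold the pending packet, whose required length
			 * was returned in pktlen.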
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}

static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used, "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routine "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
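	 * Unknown values fall back to HN_TX_TASKQ_M_INDEP.  The shared
	 * "hn_tx" taskqueues below are only created for
	 * HN_TX_TASKQ_M_GLOBAL, and only when running on Hyper-V.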
7635 */ 7636 switch (hn_tx_taskq_mode) { 7637 case HN_TX_TASKQ_M_INDEP: 7638 case HN_TX_TASKQ_M_GLOBAL: 7639 case HN_TX_TASKQ_M_EVTTQ: 7640 break; 7641 default: 7642 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7643 break; 7644 } 7645 7646 if (vm_guest != VM_GUEST_HV) 7647 return; 7648 7649 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7650 return; 7651 7652 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7653 M_DEVBUF, M_WAITOK); 7654 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7655 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7656 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7657 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7658 "hn tx%d", i); 7659 } 7660 } 7661 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7662 7663 static void 7664 hn_sysuninit(void *arg __unused) 7665 { 7666 7667 if (hn_tx_taskque != NULL) { 7668 int i; 7669 7670 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7671 taskqueue_free(hn_tx_taskque[i]); 7672 free(hn_tx_taskque, M_DEVBUF); 7673 } 7674 7675 if (hn_vfmap != NULL) 7676 free(hn_vfmap, M_DEVBUF); 7677 rm_destroy(&hn_vfmap_lock); 7678 7679 counter_u64_free(hn_udpcs_fixup); 7680 } 7681 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7682