1 /*- 2 * Copyright (c) 2010-2012 Citrix Inc. 3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. 4 * Copyright (c) 2012 NetApp Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /*- 30 * Copyright (c) 2004-2006 Kip Macy 31 * All rights reserved. 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 
53 */ 54 55 #include <sys/cdefs.h> 56 __FBSDID("$FreeBSD$"); 57 58 #include "opt_hn.h" 59 #include "opt_inet6.h" 60 #include "opt_inet.h" 61 #include "opt_rss.h" 62 63 #include <sys/param.h> 64 #include <sys/systm.h> 65 #include <sys/bus.h> 66 #include <sys/counter.h> 67 #include <sys/kernel.h> 68 #include <sys/limits.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/module.h> 72 #include <sys/queue.h> 73 #include <sys/lock.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/socket.h> 80 #include <sys/sockio.h> 81 #include <sys/sx.h> 82 #include <sys/sysctl.h> 83 #include <sys/taskqueue.h> 84 #include <sys/buf_ring.h> 85 #include <sys/eventhandler.h> 86 #include <sys/epoch.h> 87 88 #include <vm/vm.h> 89 #include <vm/vm_extern.h> 90 #include <vm/pmap.h> 91 92 #include <machine/atomic.h> 93 #include <machine/in_cksum.h> 94 95 #include <net/bpf.h> 96 #include <net/ethernet.h> 97 #include <net/if.h> 98 #include <net/if_dl.h> 99 #include <net/if_media.h> 100 #include <net/if_types.h> 101 #include <net/if_var.h> 102 #include <net/rndis.h> 103 #ifdef RSS 104 #include <net/rss_config.h> 105 #endif 106 107 #include <netinet/in_systm.h> 108 #include <netinet/in.h> 109 #include <netinet/ip.h> 110 #include <netinet/ip6.h> 111 #include <netinet/tcp.h> 112 #include <netinet/tcp_lro.h> 113 #include <netinet/udp.h> 114 115 #include <dev/hyperv/include/hyperv.h> 116 #include <dev/hyperv/include/hyperv_busdma.h> 117 #include <dev/hyperv/include/vmbus.h> 118 #include <dev/hyperv/include/vmbus_xact.h> 119 120 #include <dev/hyperv/netvsc/ndis.h> 121 #include <dev/hyperv/netvsc/if_hnreg.h> 122 #include <dev/hyperv/netvsc/if_hnvar.h> 123 #include <dev/hyperv/netvsc/hn_nvs.h> 124 #include <dev/hyperv/netvsc/hn_rndis.h> 125 126 #include "vmbus_if.h" 127 128 #define HN_IFSTART_SUPPORT 129 130 #define HN_RING_CNT_DEF_MAX 8 131 132 #define HN_VFMAP_SIZE_DEF 8 133 134 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ 135 136 /* YYY should get it from the underlying channel */ 137 #define HN_TX_DESC_CNT 512 138 139 #define HN_RNDIS_PKT_LEN \ 140 (sizeof(struct rndis_packet_msg) + \ 141 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ 142 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ 143 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ 144 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) 145 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE 146 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE 147 148 #define HN_TX_DATA_BOUNDARY PAGE_SIZE 149 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET 150 #define HN_TX_DATA_SEGSIZE PAGE_SIZE 151 /* -1 for RNDIS packet message */ 152 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) 153 154 #define HN_DIRECT_TX_SIZE_DEF 128 155 156 #define HN_EARLY_TXEOF_THRESH 8 157 158 #define HN_PKTBUF_LEN_DEF (16 * 1024) 159 160 #define HN_LROENT_CNT_DEF 128 161 162 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) 163 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) 164 /* YYY 2*MTU is a bit rough, but should be good enough. 
*/ 165 #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) 166 167 #define HN_LRO_ACKCNT_DEF 1 168 169 #define HN_LOCK_INIT(sc) \ 170 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) 171 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) 172 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) 173 #define HN_LOCK(sc) \ 174 do { \ 175 while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ 176 /* Relinquish cpu to avoid deadlock */ \ 177 sched_relinquish(curthread); \ 178 DELAY(1000); \ 179 } \ 180 } while (0) 181 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) 182 183 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) 184 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) 185 #define HN_CSUM_IP_HWASSIST(sc) \ 186 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) 187 #define HN_CSUM_IP6_HWASSIST(sc) \ 188 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) 189 190 #define HN_PKTSIZE_MIN(align) \ 191 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ 192 HN_RNDIS_PKT_LEN, (align)) 193 #define HN_PKTSIZE(m, align) \ 194 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) 195 196 #ifdef RSS 197 #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) 198 #else 199 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) 200 #endif 201 202 struct hn_txdesc { 203 #ifndef HN_USE_TXDESC_BUFRING 204 SLIST_ENTRY(hn_txdesc) link; 205 #endif 206 STAILQ_ENTRY(hn_txdesc) agg_link; 207 208 /* Aggregated txdescs, in sending order. */ 209 STAILQ_HEAD(, hn_txdesc) agg_list; 210 211 /* The oldest packet, if transmission aggregation happens. */ 212 struct mbuf *m; 213 struct hn_tx_ring *txr; 214 int refs; 215 uint32_t flags; /* HN_TXD_FLAG_ */ 216 struct hn_nvs_sendctx send_ctx; 217 uint32_t chim_index; 218 int chim_size; 219 220 bus_dmamap_t data_dmap; 221 222 bus_addr_t rndis_pkt_paddr; 223 struct rndis_packet_msg *rndis_pkt; 224 bus_dmamap_t rndis_pkt_dmap; 225 }; 226 227 #define HN_TXD_FLAG_ONLIST 0x0001 228 #define HN_TXD_FLAG_DMAMAP 0x0002 229 #define HN_TXD_FLAG_ONAGG 0x0004 230 231 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 232 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 233 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 234 235 struct packet_info_id { 236 uint8_t ver; 237 uint8_t flag; 238 uint16_t pkt_id; 239 }; 240 241 #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) 242 243 244 struct hn_rxinfo { 245 const uint32_t *vlan_info; 246 const uint32_t *csum_info; 247 const uint32_t *hash_info; 248 const uint32_t *hash_value; 249 const struct packet_info_id *pktinfo_id; 250 }; 251 252 struct hn_rxvf_setarg { 253 struct hn_rx_ring *rxr; 254 if_t vf_ifp; 255 }; 256 257 #define HN_RXINFO_VLAN 0x0001 258 #define HN_RXINFO_CSUM 0x0002 259 #define HN_RXINFO_HASHINF 0x0004 260 #define HN_RXINFO_HASHVAL 0x0008 261 #define HN_RXINFO_PKTINFO_ID 0x0010 262 #define HN_RXINFO_ALL \ 263 (HN_RXINFO_VLAN | \ 264 HN_RXINFO_CSUM | \ 265 HN_RXINFO_HASHINF | \ 266 HN_RXINFO_HASHVAL | \ 267 HN_RXINFO_PKTINFO_ID) 268 269 static int hn_probe(device_t); 270 static int hn_attach(device_t); 271 static int hn_detach(device_t); 272 static int hn_shutdown(device_t); 273 static void hn_chan_callback(struct vmbus_channel *, 274 void *); 275 276 static void hn_init(void *); 277 static int hn_ioctl(if_t, u_long, caddr_t); 278 #ifdef HN_IFSTART_SUPPORT 279 static void hn_start(if_t); 280 #endif 281 static int hn_transmit(if_t, struct mbuf *); 282 static void hn_xmit_qflush(if_t); 283 static int hn_ifmedia_upd(if_t); 284 static void hn_ifmedia_sts(if_t, 285 struct 
ifmediareq *); 286 287 static void hn_ifnet_event(void *, if_t, int); 288 static void hn_ifaddr_event(void *, if_t); 289 static void hn_ifnet_attevent(void *, if_t); 290 static void hn_ifnet_detevent(void *, if_t); 291 static void hn_ifnet_lnkevent(void *, if_t, int); 292 293 static bool hn_ismyvf(const struct hn_softc *, 294 const if_t); 295 static void hn_rxvf_change(struct hn_softc *, 296 if_t, bool); 297 static void hn_rxvf_set(struct hn_softc *, if_t); 298 static void hn_rxvf_set_task(void *, int); 299 static void hn_xpnt_vf_input(if_t, struct mbuf *); 300 static int hn_xpnt_vf_iocsetflags(struct hn_softc *); 301 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, 302 struct ifreq *); 303 static void hn_xpnt_vf_saveifflags(struct hn_softc *); 304 static bool hn_xpnt_vf_isready(struct hn_softc *); 305 static void hn_xpnt_vf_setready(struct hn_softc *); 306 static void hn_xpnt_vf_init_taskfunc(void *, int); 307 static void hn_xpnt_vf_init(struct hn_softc *); 308 static void hn_xpnt_vf_setenable(struct hn_softc *); 309 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); 310 static void hn_vf_rss_fixup(struct hn_softc *, bool); 311 static void hn_vf_rss_restore(struct hn_softc *); 312 313 static int hn_rndis_rxinfo(const void *, int, 314 struct hn_rxinfo *); 315 static void hn_rndis_rx_data(struct hn_rx_ring *, 316 const void *, int); 317 static void hn_rndis_rx_status(struct hn_softc *, 318 const void *, int); 319 static void hn_rndis_init_fixat(struct hn_softc *, int); 320 321 static void hn_nvs_handle_notify(struct hn_softc *, 322 const struct vmbus_chanpkt_hdr *); 323 static void hn_nvs_handle_comp(struct hn_softc *, 324 struct vmbus_channel *, 325 const struct vmbus_chanpkt_hdr *); 326 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, 327 struct vmbus_channel *, 328 const struct vmbus_chanpkt_hdr *); 329 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, 330 struct vmbus_channel *, uint64_t); 331 332 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); 333 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); 334 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); 335 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); 336 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); 337 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 338 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); 339 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); 340 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); 341 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); 342 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); 343 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); 344 #ifndef RSS 345 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); 346 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); 347 #endif 348 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); 349 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); 350 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); 351 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); 352 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); 353 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); 354 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); 355 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); 356 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); 357 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); 358 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); 359 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); 360 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); 361 static int 
hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); 362 363 static void hn_stop(struct hn_softc *, bool); 364 static void hn_init_locked(struct hn_softc *); 365 static int hn_chan_attach(struct hn_softc *, 366 struct vmbus_channel *); 367 static void hn_chan_detach(struct hn_softc *, 368 struct vmbus_channel *); 369 static int hn_attach_subchans(struct hn_softc *); 370 static void hn_detach_allchans(struct hn_softc *); 371 static void hn_chan_rollup(struct hn_rx_ring *, 372 struct hn_tx_ring *); 373 static void hn_set_ring_inuse(struct hn_softc *, int); 374 static int hn_synth_attach(struct hn_softc *, int); 375 static void hn_synth_detach(struct hn_softc *); 376 static int hn_synth_alloc_subchans(struct hn_softc *, 377 int *); 378 static bool hn_synth_attachable(const struct hn_softc *); 379 static void hn_suspend(struct hn_softc *); 380 static void hn_suspend_data(struct hn_softc *); 381 static void hn_suspend_mgmt(struct hn_softc *); 382 static void hn_resume(struct hn_softc *); 383 static void hn_resume_data(struct hn_softc *); 384 static void hn_resume_mgmt(struct hn_softc *); 385 static void hn_suspend_mgmt_taskfunc(void *, int); 386 static void hn_chan_drain(struct hn_softc *, 387 struct vmbus_channel *); 388 static void hn_disable_rx(struct hn_softc *); 389 static void hn_drain_rxtx(struct hn_softc *, int); 390 static void hn_polling(struct hn_softc *, u_int); 391 static void hn_chan_polling(struct vmbus_channel *, u_int); 392 static void hn_mtu_change_fixup(struct hn_softc *); 393 394 static void hn_update_link_status(struct hn_softc *); 395 static void hn_change_network(struct hn_softc *); 396 static void hn_link_taskfunc(void *, int); 397 static void hn_netchg_init_taskfunc(void *, int); 398 static void hn_netchg_status_taskfunc(void *, int); 399 static void hn_link_status(struct hn_softc *); 400 401 static int hn_create_rx_data(struct hn_softc *, int); 402 static void hn_destroy_rx_data(struct hn_softc *); 403 static int hn_check_iplen(const struct mbuf *, int); 404 static void hn_rxpkt_proto(const struct mbuf *, int *, int *); 405 static int hn_set_rxfilter(struct hn_softc *, uint32_t); 406 static int hn_rxfilter_config(struct hn_softc *); 407 static int hn_rss_reconfig(struct hn_softc *); 408 static void hn_rss_ind_fixup(struct hn_softc *); 409 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); 410 static int hn_rxpkt(struct hn_rx_ring *); 411 static uint32_t hn_rss_type_fromndis(uint32_t); 412 static uint32_t hn_rss_type_tondis(uint32_t); 413 414 static int hn_tx_ring_create(struct hn_softc *, int); 415 static void hn_tx_ring_destroy(struct hn_tx_ring *); 416 static int hn_create_tx_data(struct hn_softc *, int); 417 static void hn_fixup_tx_data(struct hn_softc *); 418 static void hn_fixup_rx_data(struct hn_softc *); 419 static void hn_destroy_tx_data(struct hn_softc *); 420 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); 421 static void hn_txdesc_gc(struct hn_tx_ring *, 422 struct hn_txdesc *); 423 static int hn_encap(if_t, struct hn_tx_ring *, 424 struct hn_txdesc *, struct mbuf **); 425 static int hn_txpkt(if_t, struct hn_tx_ring *, 426 struct hn_txdesc *); 427 static void hn_set_chim_size(struct hn_softc *, int); 428 static void hn_set_tso_maxsize(struct hn_softc *, int, int); 429 static bool hn_tx_ring_pending(struct hn_tx_ring *); 430 static void hn_tx_ring_qflush(struct hn_tx_ring *); 431 static void hn_resume_tx(struct hn_softc *, int); 432 static void hn_set_txagg(struct hn_softc *); 433 static void *hn_try_txagg(if_t, 434 struct hn_tx_ring *, struct 
hn_txdesc *, 435 int); 436 static int hn_get_txswq_depth(const struct hn_tx_ring *); 437 static void hn_txpkt_done(struct hn_nvs_sendctx *, 438 struct hn_softc *, struct vmbus_channel *, 439 const void *, int); 440 static int hn_txpkt_sglist(struct hn_tx_ring *, 441 struct hn_txdesc *); 442 static int hn_txpkt_chim(struct hn_tx_ring *, 443 struct hn_txdesc *); 444 static int hn_xmit(struct hn_tx_ring *, int); 445 static void hn_xmit_taskfunc(void *, int); 446 static void hn_xmit_txeof(struct hn_tx_ring *); 447 static void hn_xmit_txeof_taskfunc(void *, int); 448 #ifdef HN_IFSTART_SUPPORT 449 static int hn_start_locked(struct hn_tx_ring *, int); 450 static void hn_start_taskfunc(void *, int); 451 static void hn_start_txeof(struct hn_tx_ring *); 452 static void hn_start_txeof_taskfunc(void *, int); 453 #endif 454 455 static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); 456 457 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 458 "Hyper-V network interface"); 459 460 /* Trust tcp segment verification on host side. */ 461 static int hn_trust_hosttcp = 1; 462 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, 463 &hn_trust_hosttcp, 0, 464 "Trust tcp segment verification on host side, " 465 "when csum info is missing (global setting)"); 466 467 /* Trust udp datagrams verification on host side. */ 468 static int hn_trust_hostudp = 1; 469 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, 470 &hn_trust_hostudp, 0, 471 "Trust udp datagram verification on host side, " 472 "when csum info is missing (global setting)"); 473 474 /* Trust ip packets verification on host side. */ 475 static int hn_trust_hostip = 1; 476 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, 477 &hn_trust_hostip, 0, 478 "Trust ip packet verification on host side, " 479 "when csum info is missing (global setting)"); 480 481 /* 482 * Offload UDP/IPv4 checksum. 483 */ 484 static int hn_enable_udp4cs = 1; 485 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, 486 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); 487 488 /* 489 * Offload UDP/IPv6 checksum. 490 */ 491 static int hn_enable_udp6cs = 1; 492 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, 493 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); 494 495 /* Stats. */ 496 static counter_u64_t hn_udpcs_fixup; 497 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, 498 &hn_udpcs_fixup, "# of UDP checksum fixup"); 499 500 /* 501 * See hn_set_hlen(). 502 * 503 * This value is for Azure. For Hyper-V, set this above 504 * 65536 to disable UDP datagram checksum fixup. 
505 */ 506 static int hn_udpcs_fixup_mtu = 1420; 507 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, 508 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); 509 510 /* Limit TSO burst size */ 511 static int hn_tso_maxlen = IP_MAXPACKET; 512 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, 513 &hn_tso_maxlen, 0, "TSO burst limit"); 514 515 /* Limit chimney send size */ 516 static int hn_tx_chimney_size = 0; 517 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, 518 &hn_tx_chimney_size, 0, "Chimney send packet size limit"); 519 520 /* Limit the size of packet for direct transmission */ 521 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; 522 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, 523 &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); 524 525 /* # of LRO entries per RX ring */ 526 #if defined(INET) || defined(INET6) 527 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; 528 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, 529 &hn_lro_entry_count, 0, "LRO entry count"); 530 #endif 531 532 static int hn_tx_taskq_cnt = 1; 533 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, 534 &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); 535 536 #define HN_TX_TASKQ_M_INDEP 0 537 #define HN_TX_TASKQ_M_GLOBAL 1 538 #define HN_TX_TASKQ_M_EVTTQ 2 539 540 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 541 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, 542 &hn_tx_taskq_mode, 0, "TX taskqueue modes: " 543 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); 544 545 #ifndef HN_USE_TXDESC_BUFRING 546 static int hn_use_txdesc_bufring = 0; 547 #else 548 static int hn_use_txdesc_bufring = 1; 549 #endif 550 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, 551 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); 552 553 #ifdef HN_IFSTART_SUPPORT 554 /* Use ifnet.if_start instead of ifnet.if_transmit */ 555 static int hn_use_if_start = 0; 556 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, 557 &hn_use_if_start, 0, "Use if_start TX method"); 558 #endif 559 560 /* # of channels to use */ 561 static int hn_chan_cnt = 0; 562 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, 563 &hn_chan_cnt, 0, 564 "# of channels to use; each channel has one RX ring and one TX ring"); 565 566 /* # of transmit rings to use */ 567 static int hn_tx_ring_cnt = 0; 568 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, 569 &hn_tx_ring_cnt, 0, "# of TX rings to use"); 570 571 /* Software TX ring depth */ 572 static int hn_tx_swq_depth = 0; 573 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, 574 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); 575 576 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ 577 static u_int hn_lro_mbufq_depth = 0; 578 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, 579 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); 580 581 /* Packet transmission aggregation size limit */ 582 static int hn_tx_agg_size = -1; 583 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, 584 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); 585 586 /* Packet transmission aggregation count limit */ 587 static int hn_tx_agg_pkts = -1; 588 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, 589 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); 590 591 /* VF list */ 592 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, 593 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 594 hn_vflist_sysctl,
"A", 595 "VF list"); 596 597 /* VF mapping */ 598 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, 599 CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, 600 hn_vfmap_sysctl, "A", 601 "VF mapping"); 602 603 /* Transparent VF */ 604 static int hn_xpnt_vf = 1; 605 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, 606 &hn_xpnt_vf, 0, "Transparent VF mod"); 607 608 /* Accurate BPF support for Transparent VF */ 609 static int hn_xpnt_vf_accbpf = 0; 610 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, 611 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); 612 613 /* Extra wait for transparent VF attach routing; unit seconds. */ 614 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 615 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, 616 &hn_xpnt_vf_attwait, 0, 617 "Extra wait for transparent VF attach routing; unit: seconds"); 618 619 static u_int hn_cpu_index; /* next CPU for channel */ 620 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ 621 622 static struct rmlock hn_vfmap_lock; 623 static int hn_vfmap_size; 624 static if_t *hn_vfmap; 625 626 #ifndef RSS 627 static const uint8_t 628 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 629 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 630 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 631 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 632 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 633 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 634 }; 635 #endif /* !RSS */ 636 637 static const struct hyperv_guid hn_guid = { 638 .hv_guid = { 639 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 640 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } 641 }; 642 643 static device_method_t hn_methods[] = { 644 /* Device interface */ 645 DEVMETHOD(device_probe, hn_probe), 646 DEVMETHOD(device_attach, hn_attach), 647 DEVMETHOD(device_detach, hn_detach), 648 DEVMETHOD(device_shutdown, hn_shutdown), 649 DEVMETHOD_END 650 }; 651 652 static driver_t hn_driver = { 653 "hn", 654 hn_methods, 655 sizeof(struct hn_softc) 656 }; 657 658 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0); 659 MODULE_VERSION(hn, 1); 660 MODULE_DEPEND(hn, vmbus, 1, 1, 1); 661 662 static void 663 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) 664 { 665 int i; 666 667 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 668 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; 669 } 670 671 static int 672 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) 673 { 674 675 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 676 txd->chim_size == 0, ("invalid rndis sglist txd")); 677 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, 678 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); 679 } 680 681 static int 682 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) 683 { 684 struct hn_nvs_rndis rndis; 685 686 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && 687 txd->chim_size > 0, ("invalid rndis chim txd")); 688 689 rndis.nvs_type = HN_NVS_TYPE_RNDIS; 690 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; 691 rndis.nvs_chim_idx = txd->chim_index; 692 rndis.nvs_chim_sz = txd->chim_size; 693 694 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, 695 &rndis, sizeof(rndis), &txd->send_ctx)); 696 } 697 698 static __inline uint32_t 699 hn_chim_alloc(struct hn_softc *sc) 700 { 701 int i, bmap_cnt = sc->hn_chim_bmap_cnt; 702 u_long *bmap = sc->hn_chim_bmap; 703 uint32_t ret = HN_NVS_CHIM_IDX_INVALID; 704 705 for (i = 0; i < bmap_cnt; ++i) { 706 int idx; 707 708 idx = ffsl(~bmap[i]); 709 if (idx == 0) 710 continue; 
711 712 --idx; /* ffsl is 1-based */ 713 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, 714 ("invalid i %d and idx %d", i, idx)); 715 716 if (atomic_testandset_long(&bmap[i], idx)) 717 continue; 718 719 ret = i * LONG_BIT + idx; 720 break; 721 } 722 return (ret); 723 } 724 725 static __inline void 726 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) 727 { 728 u_long mask; 729 uint32_t idx; 730 731 idx = chim_idx / LONG_BIT; 732 KASSERT(idx < sc->hn_chim_bmap_cnt, 733 ("invalid chimney index 0x%x", chim_idx)); 734 735 mask = 1UL << (chim_idx % LONG_BIT); 736 KASSERT(sc->hn_chim_bmap[idx] & mask, 737 ("index bitmap 0x%lx, chimney index %u, " 738 "bitmap idx %d, bitmask 0x%lx", 739 sc->hn_chim_bmap[idx], chim_idx, idx, mask)); 740 741 atomic_clear_long(&sc->hn_chim_bmap[idx], mask); 742 } 743 744 #if defined(INET6) || defined(INET) 745 746 #define PULLUP_HDR(m, len) \ 747 do { \ 748 if (__predict_false((m)->m_len < (len))) { \ 749 (m) = m_pullup((m), (len)); \ 750 if ((m) == NULL) \ 751 return (NULL); \ 752 } \ 753 } while (0) 754 755 /* 756 * NOTE: If this function failed, the m_head would be freed. 757 */ 758 static __inline struct mbuf * 759 hn_tso_fixup(struct mbuf *m_head) 760 { 761 struct ether_vlan_header *evl; 762 struct tcphdr *th; 763 int ehlen; 764 765 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); 766 767 PULLUP_HDR(m_head, sizeof(*evl)); 768 evl = mtod(m_head, struct ether_vlan_header *); 769 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 770 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 771 else 772 ehlen = ETHER_HDR_LEN; 773 m_head->m_pkthdr.l2hlen = ehlen; 774 775 #ifdef INET 776 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 777 struct ip *ip; 778 int iphlen; 779 780 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 781 ip = mtodo(m_head, ehlen); 782 iphlen = ip->ip_hl << 2; 783 m_head->m_pkthdr.l3hlen = iphlen; 784 785 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 786 th = mtodo(m_head, ehlen + iphlen); 787 788 ip->ip_len = 0; 789 ip->ip_sum = 0; 790 th->th_sum = in_pseudo(ip->ip_src.s_addr, 791 ip->ip_dst.s_addr, htons(IPPROTO_TCP)); 792 } 793 #endif 794 #if defined(INET6) && defined(INET) 795 else 796 #endif 797 #ifdef INET6 798 { 799 struct ip6_hdr *ip6; 800 801 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 802 ip6 = mtodo(m_head, ehlen); 803 if (ip6->ip6_nxt != IPPROTO_TCP) { 804 m_freem(m_head); 805 return (NULL); 806 } 807 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 808 809 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); 810 th = mtodo(m_head, ehlen + sizeof(*ip6)); 811 812 ip6->ip6_plen = 0; 813 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); 814 } 815 #endif 816 return (m_head); 817 } 818 819 /* 820 * NOTE: If this function failed, the m_head would be freed. 
821 */ 822 static __inline struct mbuf * 823 hn_set_hlen(struct mbuf *m_head) 824 { 825 const struct ether_vlan_header *evl; 826 int ehlen; 827 828 PULLUP_HDR(m_head, sizeof(*evl)); 829 evl = mtod(m_head, const struct ether_vlan_header *); 830 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) 831 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; 832 else 833 ehlen = ETHER_HDR_LEN; 834 m_head->m_pkthdr.l2hlen = ehlen; 835 836 #ifdef INET 837 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { 838 const struct ip *ip; 839 int iphlen; 840 841 PULLUP_HDR(m_head, ehlen + sizeof(*ip)); 842 ip = mtodo(m_head, ehlen); 843 iphlen = ip->ip_hl << 2; 844 m_head->m_pkthdr.l3hlen = iphlen; 845 846 /* 847 * UDP checksum offload does not work in Azure, if the 848 * following conditions are met: 849 * - sizeof(IP hdr + UDP hdr + payload) > 1420. 850 * - IP_DF is not set in the IP hdr. 851 * 852 * Fall back to software checksum for these UDP datagrams. 853 */ 854 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && 855 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && 856 (ntohs(ip->ip_off) & IP_DF) == 0) { 857 uint16_t off = ehlen + iphlen; 858 859 counter_u64_add(hn_udpcs_fixup, 1); 860 PULLUP_HDR(m_head, off + sizeof(struct udphdr)); 861 *(uint16_t *)(m_head->m_data + off + 862 m_head->m_pkthdr.csum_data) = in_cksum_skip( 863 m_head, m_head->m_pkthdr.len, off); 864 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; 865 } 866 } 867 #endif 868 #if defined(INET6) && defined(INET) 869 else 870 #endif 871 #ifdef INET6 872 { 873 const struct ip6_hdr *ip6; 874 875 PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); 876 ip6 = mtodo(m_head, ehlen); 877 if (ip6->ip6_nxt != IPPROTO_TCP && 878 ip6->ip6_nxt != IPPROTO_UDP) { 879 m_freem(m_head); 880 return (NULL); 881 } 882 m_head->m_pkthdr.l3hlen = sizeof(*ip6); 883 } 884 #endif 885 return (m_head); 886 } 887 888 /* 889 * NOTE: If this function failed, the m_head would be freed. 890 */ 891 static __inline struct mbuf * 892 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) 893 { 894 const struct tcphdr *th; 895 int ehlen, iphlen; 896 897 *tcpsyn = 0; 898 ehlen = m_head->m_pkthdr.l2hlen; 899 iphlen = m_head->m_pkthdr.l3hlen; 900 901 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); 902 th = mtodo(m_head, ehlen + iphlen); 903 if (th->th_flags & TH_SYN) 904 *tcpsyn = 1; 905 return (m_head); 906 } 907 908 #undef PULLUP_HDR 909 910 #endif /* INET6 || INET */ 911 912 static int 913 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) 914 { 915 int error = 0; 916 917 HN_LOCK_ASSERT(sc); 918 919 if (sc->hn_rx_filter != filter) { 920 error = hn_rndis_set_rxfilter(sc, filter); 921 if (!error) 922 sc->hn_rx_filter = filter; 923 } 924 return (error); 925 } 926 927 static int 928 hn_rxfilter_config(struct hn_softc *sc) 929 { 930 if_t ifp = sc->hn_ifp; 931 uint32_t filter; 932 933 HN_LOCK_ASSERT(sc); 934 935 /* 936 * If the non-transparent mode VF is activated, we don't know how 937 * its RX filter is configured, so stick the synthetic device in 938 * the promiscuous mode.
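 * hn_rxvf_change() re-runs this filter configuration when the VF is
 * activated or deactivated.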
939 */ 940 if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { 941 filter = NDIS_PACKET_TYPE_PROMISCUOUS; 942 } else { 943 filter = NDIS_PACKET_TYPE_DIRECTED; 944 if (if_getflags(ifp) & IFF_BROADCAST) 945 filter |= NDIS_PACKET_TYPE_BROADCAST; 946 /* TODO: support multicast list */ 947 if ((if_getflags(ifp) & IFF_ALLMULTI) || 948 !if_maddr_empty(ifp)) 949 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; 950 } 951 return (hn_set_rxfilter(sc, filter)); 952 } 953 954 static void 955 hn_set_txagg(struct hn_softc *sc) 956 { 957 uint32_t size, pkts; 958 int i; 959 960 /* 961 * Setup aggregation size. 962 */ 963 if (sc->hn_agg_size < 0) 964 size = UINT32_MAX; 965 else 966 size = sc->hn_agg_size; 967 968 if (sc->hn_rndis_agg_size < size) 969 size = sc->hn_rndis_agg_size; 970 971 /* NOTE: We only aggregate packets using chimney sending buffers. */ 972 if (size > (uint32_t)sc->hn_chim_szmax) 973 size = sc->hn_chim_szmax; 974 975 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { 976 /* Disable */ 977 size = 0; 978 pkts = 0; 979 goto done; 980 } 981 982 /* NOTE: Type of the per TX ring setting is 'int'. */ 983 if (size > INT_MAX) 984 size = INT_MAX; 985 986 /* 987 * Setup aggregation packet count. 988 */ 989 if (sc->hn_agg_pkts < 0) 990 pkts = UINT32_MAX; 991 else 992 pkts = sc->hn_agg_pkts; 993 994 if (sc->hn_rndis_agg_pkts < pkts) 995 pkts = sc->hn_rndis_agg_pkts; 996 997 if (pkts <= 1) { 998 /* Disable */ 999 size = 0; 1000 pkts = 0; 1001 goto done; 1002 } 1003 1004 /* NOTE: Type of the per TX ring setting is 'short'. */ 1005 if (pkts > SHRT_MAX) 1006 pkts = SHRT_MAX; 1007 1008 done: 1009 /* NOTE: Type of the per TX ring setting is 'short'. */ 1010 if (sc->hn_rndis_agg_align > SHRT_MAX) { 1011 /* Disable */ 1012 size = 0; 1013 pkts = 0; 1014 } 1015 1016 if (bootverbose) { 1017 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", 1018 size, pkts, sc->hn_rndis_agg_align); 1019 } 1020 1021 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 1022 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 1023 1024 mtx_lock(&txr->hn_tx_lock); 1025 txr->hn_agg_szmax = size; 1026 txr->hn_agg_pktmax = pkts; 1027 txr->hn_agg_align = sc->hn_rndis_agg_align; 1028 mtx_unlock(&txr->hn_tx_lock); 1029 } 1030 } 1031 1032 static int 1033 hn_get_txswq_depth(const struct hn_tx_ring *txr) 1034 { 1035 1036 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); 1037 if (hn_tx_swq_depth < txr->hn_txdesc_cnt) 1038 return txr->hn_txdesc_cnt; 1039 return hn_tx_swq_depth; 1040 } 1041 1042 static int 1043 hn_rss_reconfig(struct hn_softc *sc) 1044 { 1045 int error; 1046 1047 HN_LOCK_ASSERT(sc); 1048 1049 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1050 return (ENXIO); 1051 1052 /* 1053 * Disable RSS first. 1054 * 1055 * NOTE: 1056 * Direct reconfiguration by setting the UNCHG flags does 1057 * _not_ work properly. 1058 */ 1059 if (bootverbose) 1060 if_printf(sc->hn_ifp, "disable RSS\n"); 1061 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); 1062 if (error) { 1063 if_printf(sc->hn_ifp, "RSS disable failed\n"); 1064 return (error); 1065 } 1066 1067 /* 1068 * Reenable the RSS w/ the updated RSS key or indirect 1069 * table. 
1070 */ 1071 if (bootverbose) 1072 if_printf(sc->hn_ifp, "reconfig RSS\n"); 1073 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 1074 if (error) { 1075 if_printf(sc->hn_ifp, "RSS reconfig failed\n"); 1076 return (error); 1077 } 1078 return (0); 1079 } 1080 1081 static void 1082 hn_rss_ind_fixup(struct hn_softc *sc) 1083 { 1084 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 1085 int i, nchan; 1086 1087 nchan = sc->hn_rx_ring_inuse; 1088 KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); 1089 1090 /* 1091 * Check indirect table to make sure that all channels in it 1092 * can be used. 1093 */ 1094 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 1095 if (rss->rss_ind[i] >= nchan) { 1096 if_printf(sc->hn_ifp, 1097 "RSS indirect table %d fixup: %u -> %d\n", 1098 i, rss->rss_ind[i], nchan - 1); 1099 rss->rss_ind[i] = nchan - 1; 1100 } 1101 } 1102 } 1103 1104 static int 1105 hn_ifmedia_upd(if_t ifp __unused) 1106 { 1107 1108 return EOPNOTSUPP; 1109 } 1110 1111 static void 1112 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) 1113 { 1114 struct hn_softc *sc = if_getsoftc(ifp); 1115 1116 ifmr->ifm_status = IFM_AVALID; 1117 ifmr->ifm_active = IFM_ETHER; 1118 1119 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { 1120 ifmr->ifm_active |= IFM_NONE; 1121 return; 1122 } 1123 ifmr->ifm_status |= IFM_ACTIVE; 1124 ifmr->ifm_active |= IFM_10G_T | IFM_FDX; 1125 } 1126 1127 static void 1128 hn_rxvf_set_task(void *xarg, int pending __unused) 1129 { 1130 struct hn_rxvf_setarg *arg = xarg; 1131 1132 arg->rxr->hn_rxvf_ifp = arg->vf_ifp; 1133 } 1134 1135 static void 1136 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) 1137 { 1138 struct hn_rx_ring *rxr; 1139 struct hn_rxvf_setarg arg; 1140 struct task task; 1141 int i; 1142 1143 HN_LOCK_ASSERT(sc); 1144 1145 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); 1146 1147 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 1148 rxr = &sc->hn_rx_ring[i]; 1149 1150 if (i < sc->hn_rx_ring_inuse) { 1151 arg.rxr = rxr; 1152 arg.vf_ifp = vf_ifp; 1153 vmbus_chan_run_task(rxr->hn_chan, &task); 1154 } else { 1155 rxr->hn_rxvf_ifp = vf_ifp; 1156 } 1157 } 1158 } 1159 1160 static bool 1161 hn_ismyvf(const struct hn_softc *sc, const if_t ifp) 1162 { 1163 if_t hn_ifp; 1164 1165 hn_ifp = sc->hn_ifp; 1166 1167 if (ifp == hn_ifp) 1168 return (false); 1169 1170 if (if_getalloctype(ifp) != IFT_ETHER) 1171 return (false); 1172 1173 /* Ignore lagg/vlan interfaces */ 1174 if (strcmp(if_getdname(ifp), "lagg") == 0 || 1175 strcmp(if_getdname(ifp), "vlan") == 0) 1176 return (false); 1177 1178 /* 1179 * During detach events if_getifaddr(ifp) might be NULL. 
1180 * Make sure the bcmp() below doesn't panic on that: 1181 */ 1182 if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) 1183 return (false); 1184 1185 if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) 1186 return (false); 1187 1188 return (true); 1189 } 1190 1191 static void 1192 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) 1193 { 1194 if_t hn_ifp; 1195 1196 HN_LOCK(sc); 1197 1198 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1199 goto out; 1200 1201 if (!hn_ismyvf(sc, ifp)) 1202 goto out; 1203 hn_ifp = sc->hn_ifp; 1204 1205 if (rxvf) { 1206 if (sc->hn_flags & HN_FLAG_RXVF) 1207 goto out; 1208 1209 sc->hn_flags |= HN_FLAG_RXVF; 1210 hn_rxfilter_config(sc); 1211 } else { 1212 if (!(sc->hn_flags & HN_FLAG_RXVF)) 1213 goto out; 1214 1215 sc->hn_flags &= ~HN_FLAG_RXVF; 1216 if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) 1217 hn_rxfilter_config(sc); 1218 else 1219 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); 1220 } 1221 1222 hn_nvs_set_datapath(sc, 1223 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); 1224 1225 hn_rxvf_set(sc, rxvf ? ifp : NULL); 1226 1227 if (rxvf) { 1228 hn_vf_rss_fixup(sc, true); 1229 hn_suspend_mgmt(sc); 1230 sc->hn_link_flags &= 1231 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); 1232 if_link_state_change(hn_ifp, LINK_STATE_DOWN); 1233 } else { 1234 hn_vf_rss_restore(sc); 1235 hn_resume_mgmt(sc); 1236 } 1237 1238 devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), 1239 rxvf ? "VF_UP" : "VF_DOWN", NULL); 1240 1241 if (bootverbose) { 1242 if_printf(hn_ifp, "datapath is switched %s %s\n", 1243 rxvf ? "to" : "from", if_name(ifp)); 1244 } 1245 out: 1246 HN_UNLOCK(sc); 1247 } 1248 1249 static void 1250 hn_ifnet_event(void *arg, if_t ifp, int event) 1251 { 1252 1253 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) 1254 return; 1255 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); 1256 } 1257 1258 static void 1259 hn_ifaddr_event(void *arg, if_t ifp) 1260 { 1261 1262 hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); 1263 } 1264 1265 static int 1266 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) 1267 { 1268 if_t ifp, vf_ifp; 1269 uint64_t tmp; 1270 int error; 1271 1272 HN_LOCK_ASSERT(sc); 1273 ifp = sc->hn_ifp; 1274 vf_ifp = sc->hn_vf_ifp; 1275 1276 /* 1277 * Fix up requested capabilities w/ supported capabilities, 1278 * since the supported capabilities could have been changed. 1279 */ 1280 ifr->ifr_reqcap &= if_getcapabilities(ifp); 1281 /* Pass SIOCSIFCAP to VF. */ 1282 error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread); 1283 1284 /* 1285 * NOTE: 1286 * The error will be propagated to the callers, however, it 1287 * is _not_ useful here. 1288 */ 1289 1290 /* 1291 * Merge VF's enabled capabilities. 
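 * The checksum/TSO bits in if_hwassist are then recomputed from what
 * the VF can actually offload.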
1292 */ 1293 if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp)); 1294 1295 tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc); 1296 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 1297 if_sethwassistbits(ifp, tmp, 0); 1298 else 1299 if_sethwassistbits(ifp, 0, tmp); 1300 1301 tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc); 1302 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 1303 if_sethwassistbits(ifp, tmp, 0); 1304 else 1305 if_sethwassistbits(ifp, 0, tmp); 1306 1307 tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO; 1308 if (if_getcapenable(ifp) & IFCAP_TSO4) 1309 if_sethwassistbits(ifp, tmp, 0); 1310 else 1311 if_sethwassistbits(ifp, 0, tmp); 1312 1313 tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO; 1314 if (if_getcapenable(ifp) & IFCAP_TSO6) 1315 if_sethwassistbits(ifp, tmp, 0); 1316 else 1317 if_sethwassistbits(ifp, 0, tmp); 1318 1319 return (error); 1320 } 1321 1322 static int 1323 hn_xpnt_vf_iocsetflags(struct hn_softc *sc) 1324 { 1325 if_t vf_ifp; 1326 struct ifreq ifr; 1327 1328 HN_LOCK_ASSERT(sc); 1329 vf_ifp = sc->hn_vf_ifp; 1330 1331 memset(&ifr, 0, sizeof(ifr)); 1332 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1333 ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff; 1334 ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16; 1335 return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread)); 1336 } 1337 1338 static void 1339 hn_xpnt_vf_saveifflags(struct hn_softc *sc) 1340 { 1341 if_t ifp = sc->hn_ifp; 1342 int allmulti = 0; 1343 1344 HN_LOCK_ASSERT(sc); 1345 1346 /* XXX vlan(4) style mcast addr maintenance */ 1347 if (!if_maddr_empty(ifp)) 1348 allmulti = IFF_ALLMULTI; 1349 1350 /* Always set the VF's if_flags */ 1351 if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti); 1352 } 1353 1354 static void 1355 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m) 1356 { 1357 struct rm_priotracker pt; 1358 if_t hn_ifp = NULL; 1359 struct mbuf *mn; 1360 1361 /* 1362 * XXX racy, if hn(4) ever detached. 1363 */ 1364 rm_rlock(&hn_vfmap_lock, &pt); 1365 if (if_getindex(vf_ifp) < hn_vfmap_size) 1366 hn_ifp = hn_vfmap[if_getindex(vf_ifp)]; 1367 rm_runlock(&hn_vfmap_lock, &pt); 1368 1369 if (hn_ifp != NULL) { 1370 for (mn = m; mn != NULL; mn = mn->m_nextpkt) { 1371 /* 1372 * Allow tapping on the VF. 1373 */ 1374 ETHER_BPF_MTAP(vf_ifp, mn); 1375 1376 /* 1377 * Update VF stats. 1378 */ 1379 if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) { 1380 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, 1381 mn->m_pkthdr.len); 1382 } 1383 /* 1384 * XXX IFCOUNTER_IMCAST 1385 * This stat updating is kinda invasive, since it 1386 * requires two checks on the mbuf: the length check 1387 * and the ethernet header check. As of this writing, 1388 * all multicast packets go directly to hn(4), which 1389 * makes imcast stat updating in the VF pointless. 1390 */ 1391 1392 /* 1393 * Fix up rcvif and increase hn(4)'s ipackets. 1394 */ 1395 mn->m_pkthdr.rcvif = hn_ifp; 1396 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 1397 } 1398 /* 1399 * Go through hn(4)'s if_input. 1400 */ 1401 if_input(hn_ifp, m); 1402 } else { 1403 /* 1404 * In the middle of the transition; free this 1405 * mbuf chain.
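 * (The VF ifindex is not, or is no longer, mapped to an hn(4)
 * interface in hn_vfmap.)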
1406 */ 1407 while (m != NULL) { 1408 mn = m->m_nextpkt; 1409 m->m_nextpkt = NULL; 1410 m_freem(m); 1411 m = mn; 1412 } 1413 } 1414 } 1415 1416 static void 1417 hn_mtu_change_fixup(struct hn_softc *sc) 1418 { 1419 if_t ifp; 1420 1421 HN_LOCK_ASSERT(sc); 1422 ifp = sc->hn_ifp; 1423 1424 hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); 1425 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) 1426 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); 1427 } 1428 1429 static uint32_t 1430 hn_rss_type_fromndis(uint32_t rss_hash) 1431 { 1432 uint32_t types = 0; 1433 1434 if (rss_hash & NDIS_HASH_IPV4) 1435 types |= RSS_TYPE_IPV4; 1436 if (rss_hash & NDIS_HASH_TCP_IPV4) 1437 types |= RSS_TYPE_TCP_IPV4; 1438 if (rss_hash & NDIS_HASH_IPV6) 1439 types |= RSS_TYPE_IPV6; 1440 if (rss_hash & NDIS_HASH_IPV6_EX) 1441 types |= RSS_TYPE_IPV6_EX; 1442 if (rss_hash & NDIS_HASH_TCP_IPV6) 1443 types |= RSS_TYPE_TCP_IPV6; 1444 if (rss_hash & NDIS_HASH_TCP_IPV6_EX) 1445 types |= RSS_TYPE_TCP_IPV6_EX; 1446 if (rss_hash & NDIS_HASH_UDP_IPV4_X) 1447 types |= RSS_TYPE_UDP_IPV4; 1448 return (types); 1449 } 1450 1451 static uint32_t 1452 hn_rss_type_tondis(uint32_t types) 1453 { 1454 uint32_t rss_hash = 0; 1455 1456 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, 1457 ("UDP6 and UDP6EX are not supported")); 1458 1459 if (types & RSS_TYPE_IPV4) 1460 rss_hash |= NDIS_HASH_IPV4; 1461 if (types & RSS_TYPE_TCP_IPV4) 1462 rss_hash |= NDIS_HASH_TCP_IPV4; 1463 if (types & RSS_TYPE_IPV6) 1464 rss_hash |= NDIS_HASH_IPV6; 1465 if (types & RSS_TYPE_IPV6_EX) 1466 rss_hash |= NDIS_HASH_IPV6_EX; 1467 if (types & RSS_TYPE_TCP_IPV6) 1468 rss_hash |= NDIS_HASH_TCP_IPV6; 1469 if (types & RSS_TYPE_TCP_IPV6_EX) 1470 rss_hash |= NDIS_HASH_TCP_IPV6_EX; 1471 if (types & RSS_TYPE_UDP_IPV4) 1472 rss_hash |= NDIS_HASH_UDP_IPV4_X; 1473 return (rss_hash); 1474 } 1475 1476 static void 1477 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) 1478 { 1479 int i; 1480 1481 HN_LOCK_ASSERT(sc); 1482 1483 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1484 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; 1485 } 1486 1487 static void 1488 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) 1489 { 1490 if_t ifp, vf_ifp; 1491 struct ifrsshash ifrh; 1492 struct ifrsskey ifrk; 1493 int error; 1494 uint32_t my_types, diff_types, mbuf_types = 0; 1495 1496 HN_LOCK_ASSERT(sc); 1497 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1498 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1499 1500 if (sc->hn_rx_ring_inuse == 1) { 1501 /* No RSS on synthetic parts; done. */ 1502 return; 1503 } 1504 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { 1505 /* Synthetic parts do not support Toeplitz; done. */ 1506 return; 1507 } 1508 1509 ifp = sc->hn_ifp; 1510 vf_ifp = sc->hn_vf_ifp; 1511 1512 /* 1513 * Extract VF's RSS key. Only 40 bytes key for Toeplitz is 1514 * supported. 
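 * (NDIS_HASH_KEYSIZE_TOEPLITZ bytes; any other key length is rejected
 * below.)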
1515 */ 1516 memset(&ifrk, 0, sizeof(ifrk)); 1517 strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name)); 1518 error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread); 1519 if (error) { 1520 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", 1521 if_name(vf_ifp), error); 1522 goto done; 1523 } 1524 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { 1525 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1526 if_name(vf_ifp), ifrk.ifrk_func); 1527 goto done; 1528 } 1529 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { 1530 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", 1531 if_name(vf_ifp), ifrk.ifrk_keylen); 1532 goto done; 1533 } 1534 1535 /* 1536 * Extract VF's RSS hash. Only Toeplitz is supported. 1537 */ 1538 memset(&ifrh, 0, sizeof(ifrh)); 1539 strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name)); 1540 error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread); 1541 if (error) { 1542 if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n", 1543 if_name(vf_ifp), error); 1544 goto done; 1545 } 1546 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { 1547 if_printf(ifp, "%s RSS function %u is not Toeplitz\n", 1548 if_name(vf_ifp), ifrh.ifrh_func); 1549 goto done; 1550 } 1551 1552 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); 1553 if ((ifrh.ifrh_types & my_types) == 0) { 1554 /* This disables RSS; ignore it then */ 1555 if_printf(ifp, "%s intersection of RSS types failed. " 1556 "VF %#x, mine %#x\n", if_name(vf_ifp), 1557 ifrh.ifrh_types, my_types); 1558 goto done; 1559 } 1560 1561 diff_types = my_types ^ ifrh.ifrh_types; 1562 my_types &= ifrh.ifrh_types; 1563 mbuf_types = my_types; 1564 1565 /* 1566 * Detect RSS hash value/type conflicts. 1567 * 1568 * NOTE: 1569 * We don't disable the hash type, but stop delivering the hash 1570 * value/type through mbufs on the RX path. 1571 * 1572 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple 1573 * hash is delivered with type of TCP_IPV4. This means if 1574 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at 1575 * least to hn_mbuf_hash. However, given that _all_ of the 1576 * NICs implement TCP_IPV4, this will _not_ impose any issues 1577 * here. 1578 */ 1579 if ((my_types & RSS_TYPE_IPV4) && 1580 (diff_types & ifrh.ifrh_types & 1581 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { 1582 /* Conflict; disable IPV4 hash type/value delivery. */ 1583 if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); 1584 mbuf_types &= ~RSS_TYPE_IPV4; 1585 } 1586 if ((my_types & RSS_TYPE_IPV6) && 1587 (diff_types & ifrh.ifrh_types & 1588 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1589 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1590 RSS_TYPE_IPV6_EX))) { 1591 /* Conflict; disable IPV6 hash type/value delivery. */ 1592 if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); 1593 mbuf_types &= ~RSS_TYPE_IPV6; 1594 } 1595 if ((my_types & RSS_TYPE_IPV6_EX) && 1596 (diff_types & ifrh.ifrh_types & 1597 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | 1598 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | 1599 RSS_TYPE_IPV6))) { 1600 /* Conflict; disable IPV6_EX hash type/value delivery. */ 1601 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); 1602 mbuf_types &= ~RSS_TYPE_IPV6_EX; 1603 } 1604 if ((my_types & RSS_TYPE_TCP_IPV6) && 1605 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { 1606 /* Conflict; disable TCP_IPV6 hash type/value delivery.
*/ 1607 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); 1608 mbuf_types &= ~RSS_TYPE_TCP_IPV6; 1609 } 1610 if ((my_types & RSS_TYPE_TCP_IPV6_EX) && 1611 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { 1612 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ 1613 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); 1614 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; 1615 } 1616 if ((my_types & RSS_TYPE_UDP_IPV6) && 1617 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { 1618 /* Conflict; disable UDP_IPV6 hash type/value delivery. */ 1619 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); 1620 mbuf_types &= ~RSS_TYPE_UDP_IPV6; 1621 } 1622 if ((my_types & RSS_TYPE_UDP_IPV6_EX) && 1623 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { 1624 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ 1625 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); 1626 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; 1627 } 1628 1629 /* 1630 * Indirect table does not matter. 1631 */ 1632 1633 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | 1634 hn_rss_type_tondis(my_types); 1635 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); 1636 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 1637 1638 if (reconf) { 1639 error = hn_rss_reconfig(sc); 1640 if (error) { 1641 /* XXX roll-back? */ 1642 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); 1643 /* XXX keep going. */ 1644 } 1645 } 1646 done: 1647 /* Hash deliverability for mbufs. */ 1648 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); 1649 } 1650 1651 static void 1652 hn_vf_rss_restore(struct hn_softc *sc) 1653 { 1654 1655 HN_LOCK_ASSERT(sc); 1656 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 1657 ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); 1658 1659 if (sc->hn_rx_ring_inuse == 1) 1660 goto done; 1661 1662 /* 1663 * Restore hash types. Key does _not_ matter. 1664 */ 1665 if (sc->hn_rss_hash != sc->hn_rss_hcap) { 1666 int error; 1667 1668 sc->hn_rss_hash = sc->hn_rss_hcap; 1669 error = hn_rss_reconfig(sc); 1670 if (error) { 1671 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", 1672 error); 1673 /* XXX keep going. */ 1674 } 1675 } 1676 done: 1677 /* Hash deliverability for mbufs. */ 1678 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); 1679 } 1680 1681 static void 1682 hn_xpnt_vf_setready(struct hn_softc *sc) 1683 { 1684 if_t ifp, vf_ifp; 1685 struct ifreq ifr; 1686 1687 HN_LOCK_ASSERT(sc); 1688 ifp = sc->hn_ifp; 1689 vf_ifp = sc->hn_vf_ifp; 1690 1691 /* 1692 * Mark the VF ready. 1693 */ 1694 sc->hn_vf_rdytick = 0; 1695 1696 /* 1697 * Save information for restoration. 1698 */ 1699 sc->hn_saved_caps = if_getcapabilities(ifp); 1700 sc->hn_saved_tsomax = if_gethwtsomax(ifp); 1701 sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); 1702 sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); 1703 1704 /* 1705 * Intersect supported/enabled capabilities. 1706 * 1707 * NOTE: 1708 * if_hwassist is not changed here. 1709 */ 1710 if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); 1711 if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); 1712 1713 /* 1714 * Fix TSO settings. 
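 * Clamp the synthetic interface's TSO limits down to the VF's limits.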
1715 */ 1716 if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) 1717 if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); 1718 if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) 1719 if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); 1720 if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) 1721 if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); 1722 1723 /* 1724 * Change VF's enabled capabilities. 1725 */ 1726 memset(&ifr, 0, sizeof(ifr)); 1727 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1728 ifr.ifr_reqcap = if_getcapenable(ifp); 1729 hn_xpnt_vf_iocsetcaps(sc, &ifr); 1730 1731 if (if_getmtu(ifp) != ETHERMTU) { 1732 int error; 1733 1734 /* 1735 * Change VF's MTU. 1736 */ 1737 memset(&ifr, 0, sizeof(ifr)); 1738 strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); 1739 ifr.ifr_mtu = if_getmtu(ifp); 1740 error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); 1741 if (error) { 1742 if_printf(ifp, "%s SIOCSIFMTU %u failed\n", 1743 if_name(vf_ifp), if_getmtu(ifp)); 1744 if (if_getmtu(ifp) > ETHERMTU) { 1745 if_printf(ifp, "change MTU to %d\n", ETHERMTU); 1746 1747 /* 1748 * XXX 1749 * No need to adjust the synthetic parts' MTU; 1750 * failure of the adjustment will cause us 1751 * infinite headache. 1752 */ 1753 if_setmtu(ifp, ETHERMTU); 1754 hn_mtu_change_fixup(sc); 1755 } 1756 } 1757 } 1758 } 1759 1760 static bool 1761 hn_xpnt_vf_isready(struct hn_softc *sc) 1762 { 1763 1764 HN_LOCK_ASSERT(sc); 1765 1766 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) 1767 return (false); 1768 1769 if (sc->hn_vf_rdytick == 0) 1770 return (true); 1771 1772 if (sc->hn_vf_rdytick > ticks) 1773 return (false); 1774 1775 /* Mark VF as ready. */ 1776 hn_xpnt_vf_setready(sc); 1777 return (true); 1778 } 1779 1780 static void 1781 hn_xpnt_vf_setenable(struct hn_softc *sc) 1782 { 1783 int i; 1784 1785 HN_LOCK_ASSERT(sc); 1786 1787 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1788 rm_wlock(&sc->hn_vf_lock); 1789 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; 1790 rm_wunlock(&sc->hn_vf_lock); 1791 1792 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1793 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; 1794 } 1795 1796 static void 1797 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) 1798 { 1799 int i; 1800 1801 HN_LOCK_ASSERT(sc); 1802 1803 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1804 rm_wlock(&sc->hn_vf_lock); 1805 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; 1806 if (clear_vf) 1807 sc->hn_vf_ifp = NULL; 1808 rm_wunlock(&sc->hn_vf_lock); 1809 1810 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 1811 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; 1812 } 1813 1814 static void 1815 hn_xpnt_vf_init(struct hn_softc *sc) 1816 { 1817 int error; 1818 1819 HN_LOCK_ASSERT(sc); 1820 1821 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1822 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1823 1824 if (bootverbose) { 1825 if_printf(sc->hn_ifp, "try bringing up %s\n", 1826 if_name(sc->hn_vf_ifp)); 1827 } 1828 1829 /* 1830 * Bring the VF up. 1831 */ 1832 hn_xpnt_vf_saveifflags(sc); 1833 if_setflagbits(sc->hn_ifp, IFF_UP, 0); 1834 error = hn_xpnt_vf_iocsetflags(sc); 1835 if (error) { 1836 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", 1837 if_name(sc->hn_vf_ifp), error); 1838 return; 1839 } 1840 1841 /* 1842 * NOTE: 1843 * Datapath setting must happen _after_ bringing the VF up. 
1844 */ 1845 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 1846 1847 /* 1848 * NOTE: 1849 * Fix up RSS related bits _after_ the VF is brought up, since 1850 * many VFs generate their RSS key during initialization. 1851 */ 1852 hn_vf_rss_fixup(sc, true); 1853 1854 /* Mark transparent mode VF as enabled. */ 1855 hn_xpnt_vf_setenable(sc); 1856 } 1857 1858 static void 1859 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) 1860 { 1861 struct hn_softc *sc = xsc; 1862 1863 HN_LOCK(sc); 1864 1865 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 1866 goto done; 1867 if (sc->hn_vf_ifp == NULL) 1868 goto done; 1869 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 1870 goto done; 1871 1872 if (sc->hn_vf_rdytick != 0) { 1873 /* Mark VF as ready. */ 1874 hn_xpnt_vf_setready(sc); 1875 } 1876 1877 if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) { 1878 /* 1879 * Delayed VF initialization. 1880 */ 1881 if (bootverbose) { 1882 if_printf(sc->hn_ifp, "delayed initialize %s\n", 1883 if_name(sc->hn_vf_ifp)); 1884 } 1885 hn_xpnt_vf_init(sc); 1886 } 1887 done: 1888 HN_UNLOCK(sc); 1889 } 1890 1891 static void 1892 hn_ifnet_attevent(void *xsc, if_t ifp) 1893 { 1894 struct hn_softc *sc = xsc; 1895 1896 HN_LOCK(sc); 1897 1898 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 1899 goto done; 1900 1901 if (!hn_ismyvf(sc, ifp)) 1902 goto done; 1903 1904 if (sc->hn_vf_ifp != NULL) { 1905 if_printf(sc->hn_ifp, "%s was attached as VF\n", 1906 if_name(sc->hn_vf_ifp)); 1907 goto done; 1908 } 1909 1910 if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) { 1911 /* 1912 * ifnet.if_start is _not_ supported by transparent 1913 * mode VF; mainly due to the IFF_DRV_OACTIVE flag. 1914 */ 1915 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " 1916 "in transparent VF mode.\n", if_name(sc->hn_vf_ifp)); 1917 1918 goto done; 1919 } 1920 1921 rm_wlock(&hn_vfmap_lock); 1922 1923 if (if_getindex(ifp) >= hn_vfmap_size) { 1924 if_t *newmap; 1925 int newsize; 1926 1927 newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF; 1928 newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF, 1929 M_WAITOK | M_ZERO); 1930 1931 memcpy(newmap, hn_vfmap, 1932 sizeof(if_t) * hn_vfmap_size); 1933 free(hn_vfmap, M_DEVBUF); 1934 hn_vfmap = newmap; 1935 hn_vfmap_size = newsize; 1936 } 1937 KASSERT(hn_vfmap[if_getindex(ifp)] == NULL, 1938 ("%s: ifindex %d was mapped to %s", 1939 if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); 1940 hn_vfmap[if_getindex(ifp)] = sc->hn_ifp; 1941 1942 rm_wunlock(&hn_vfmap_lock); 1943 1944 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ 1945 rm_wlock(&sc->hn_vf_lock); 1946 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, 1947 ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); 1948 sc->hn_vf_ifp = ifp; 1949 rm_wunlock(&sc->hn_vf_lock); 1950 1951 if (hn_xpnt_vf) { 1952 int wait_ticks; 1953 1954 /* 1955 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. 1956 * Save vf_ifp's current if_input for later restoration. 1957 */ 1958 sc->hn_vf_input = if_getinputfn(ifp); 1959 if_setinputfn(ifp, hn_xpnt_vf_input); 1960 1961 /* 1962 * Stop link status management; use the VF's. 1963 */ 1964 hn_suspend_mgmt(sc); 1965 1966 /* 1967 * Give the VF some time to complete its attach routine.
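 * The wait is hw.hn.vf_xpnt_attwait seconds; the hn_vf_init timeout
 * task finishes the transparent VF setup afterwards.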
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, if_t ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    if_name(sc->hn_ifp)));
		if_setinputfn(ifp, sc->hn_vf_input);
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			if_setcapabilities(ifp, sc->hn_saved_caps);
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
			if_sethwtsomaxsegcount(sc->hn_ifp,
			    sc->hn_saved_tsosegcnt);
			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled.
*/ 2051 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); 2052 2053 rm_wlock(&hn_vfmap_lock); 2054 2055 KASSERT(if_getindex(ifp) < hn_vfmap_size, 2056 ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); 2057 if (hn_vfmap[if_getindex(ifp)] != NULL) { 2058 KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, 2059 ("%s: ifindex %d was mapped to %s", 2060 if_name(ifp), if_getindex(ifp), 2061 if_name(hn_vfmap[if_getindex(ifp)]))); 2062 hn_vfmap[if_getindex(ifp)] = NULL; 2063 } 2064 2065 rm_wunlock(&hn_vfmap_lock); 2066 done: 2067 HN_UNLOCK(sc); 2068 } 2069 2070 static void 2071 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) 2072 { 2073 struct hn_softc *sc = xsc; 2074 2075 if (sc->hn_vf_ifp == ifp) 2076 if_link_state_change(sc->hn_ifp, link_state); 2077 } 2078 2079 static int 2080 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) 2081 { 2082 struct hn_softc *sc = arg1; 2083 unsigned int tsomax; 2084 int error; 2085 2086 tsomax = if_gethwtsomax(sc->hn_ifp); 2087 error = sysctl_handle_int(oidp, &tsomax, 0, req); 2088 return error; 2089 } 2090 2091 static int 2092 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) 2093 { 2094 struct hn_softc *sc = arg1; 2095 unsigned int tsomaxsegcnt; 2096 int error; 2097 2098 tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); 2099 error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); 2100 return error; 2101 } 2102 2103 static int 2104 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) 2105 { 2106 struct hn_softc *sc = arg1; 2107 unsigned int tsomaxsegsz; 2108 int error; 2109 2110 tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); 2111 error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); 2112 return error; 2113 } 2114 2115 static int 2116 hn_probe(device_t dev) 2117 { 2118 2119 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { 2120 device_set_desc(dev, "Hyper-V Network Interface"); 2121 return BUS_PROBE_DEFAULT; 2122 } 2123 return ENXIO; 2124 } 2125 2126 static int 2127 hn_attach(device_t dev) 2128 { 2129 struct hn_softc *sc = device_get_softc(dev); 2130 struct sysctl_oid_list *child; 2131 struct sysctl_ctx_list *ctx; 2132 uint8_t eaddr[ETHER_ADDR_LEN]; 2133 if_t ifp = NULL; 2134 int error, ring_cnt, tx_ring_cnt; 2135 uint32_t mtu; 2136 2137 sc->hn_dev = dev; 2138 sc->hn_prichan = vmbus_get_channel(dev); 2139 HN_LOCK_INIT(sc); 2140 rm_init(&sc->hn_vf_lock, "hnvf"); 2141 if (hn_xpnt_vf && hn_xpnt_vf_accbpf) 2142 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 2143 2144 /* 2145 * Initialize these tunables once. 2146 */ 2147 sc->hn_agg_size = hn_tx_agg_size; 2148 sc->hn_agg_pkts = hn_tx_agg_pkts; 2149 2150 /* 2151 * Setup taskqueue for transmission. 2152 */ 2153 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { 2154 int i; 2155 2156 sc->hn_tx_taskqs = 2157 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 2158 M_DEVBUF, M_WAITOK); 2159 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 2160 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", 2161 M_WAITOK, taskqueue_thread_enqueue, 2162 &sc->hn_tx_taskqs[i]); 2163 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, 2164 "%s tx%d", device_get_nameunit(dev), i); 2165 } 2166 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { 2167 sc->hn_tx_taskqs = hn_tx_taskque; 2168 } 2169 2170 /* 2171 * Setup taskqueue for mangement tasks, e.g. link status. 
2172 */ 2173 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, 2174 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); 2175 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", 2176 device_get_nameunit(dev)); 2177 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); 2178 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); 2179 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, 2180 hn_netchg_status_taskfunc, sc); 2181 2182 if (hn_xpnt_vf) { 2183 /* 2184 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. 2185 */ 2186 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, 2187 taskqueue_thread_enqueue, &sc->hn_vf_taskq); 2188 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", 2189 device_get_nameunit(dev)); 2190 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, 2191 hn_xpnt_vf_init_taskfunc, sc); 2192 } 2193 2194 /* 2195 * Allocate ifnet and setup its name earlier, so that if_printf 2196 * can be used by functions, which will be called after 2197 * ether_ifattach(). 2198 */ 2199 ifp = sc->hn_ifp = if_alloc(IFT_ETHER); 2200 if_setsoftc(ifp, sc); 2201 if_initname(ifp, device_get_name(dev), device_get_unit(dev)); 2202 2203 /* 2204 * Initialize ifmedia earlier so that it can be unconditionally 2205 * destroyed, if error happened later on. 2206 */ 2207 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); 2208 2209 /* 2210 * Figure out the # of RX rings (ring_cnt) and the # of TX rings 2211 * to use (tx_ring_cnt). 2212 * 2213 * NOTE: 2214 * The # of RX rings to use is same as the # of channels to use. 2215 */ 2216 ring_cnt = hn_chan_cnt; 2217 if (ring_cnt <= 0) { 2218 /* Default */ 2219 ring_cnt = mp_ncpus; 2220 if (ring_cnt > HN_RING_CNT_DEF_MAX) 2221 ring_cnt = HN_RING_CNT_DEF_MAX; 2222 } else if (ring_cnt > mp_ncpus) { 2223 ring_cnt = mp_ncpus; 2224 } 2225 #ifdef RSS 2226 if (ring_cnt > rss_getnumbuckets()) 2227 ring_cnt = rss_getnumbuckets(); 2228 #endif 2229 2230 tx_ring_cnt = hn_tx_ring_cnt; 2231 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) 2232 tx_ring_cnt = ring_cnt; 2233 #ifdef HN_IFSTART_SUPPORT 2234 if (hn_use_if_start) { 2235 /* ifnet.if_start only needs one TX ring. */ 2236 tx_ring_cnt = 1; 2237 } 2238 #endif 2239 2240 /* 2241 * Set the leader CPU for channels. 2242 */ 2243 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; 2244 2245 /* 2246 * Create enough TX/RX rings, even if only limited number of 2247 * channels can be allocated. 2248 */ 2249 error = hn_create_tx_data(sc, tx_ring_cnt); 2250 if (error) 2251 goto failed; 2252 error = hn_create_rx_data(sc, ring_cnt); 2253 if (error) 2254 goto failed; 2255 2256 /* 2257 * Create transaction context for NVS and RNDIS transactions. 2258 */ 2259 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2260 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2261 if (sc->hn_xact == NULL) { 2262 error = ENXIO; 2263 goto failed; 2264 } 2265 2266 /* 2267 * Install orphan handler for the revocation of this device's 2268 * primary channel. 2269 * 2270 * NOTE: 2271 * The processing order is critical here: 2272 * Install the orphan handler, _before_ testing whether this 2273 * device's primary channel has been revoked or not. 2274 */ 2275 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2276 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2277 error = ENXIO; 2278 goto failed; 2279 } 2280 2281 /* 2282 * Attach the synthetic parts, i.e. NVS and RNDIS. 
2283 */ 2284 error = hn_synth_attach(sc, ETHERMTU); 2285 if (error) 2286 goto failed; 2287 2288 error = hn_rndis_get_eaddr(sc, eaddr); 2289 if (error) 2290 goto failed; 2291 2292 error = hn_rndis_get_mtu(sc, &mtu); 2293 if (error) 2294 mtu = ETHERMTU; 2295 else if (bootverbose) 2296 device_printf(dev, "RNDIS mtu %u\n", mtu); 2297 2298 if (sc->hn_rx_ring_inuse > 1) { 2299 /* 2300 * Reduce TCP segment aggregation limit for multiple 2301 * RX rings to increase ACK timeliness. 2302 */ 2303 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2304 } 2305 2306 /* 2307 * Fixup TX/RX stuffs after synthetic parts are attached. 2308 */ 2309 hn_fixup_tx_data(sc); 2310 hn_fixup_rx_data(sc); 2311 2312 ctx = device_get_sysctl_ctx(dev); 2313 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2314 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2315 &sc->hn_nvs_ver, 0, "NVS version"); 2316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2317 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2318 hn_ndis_version_sysctl, "A", "NDIS version"); 2319 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2320 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2321 hn_caps_sysctl, "A", "capabilities"); 2322 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2323 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2324 hn_hwassist_sysctl, "A", "hwassist"); 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", 2326 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, 2327 "IU", "max TSO size"); 2328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", 2329 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, 2330 "IU", "max # of TSO segments"); 2331 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", 2332 CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, 2333 "IU", "max size of TSO segment"); 2334 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2335 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2336 hn_rxfilter_sysctl, "A", "rxfilter"); 2337 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2338 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2339 hn_rss_hash_sysctl, "A", "RSS hash"); 2340 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2341 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2342 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2343 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2344 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2345 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2346 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2347 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2348 #ifndef RSS 2349 /* 2350 * Don't allow RSS key/indirect table changes, if RSS is defined. 
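	 * (The rss_key and rss_ind sysctls below are therefore created
	 * only when the RSS kernel option is not compiled in.)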
2351 */ 2352 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2353 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2354 hn_rss_key_sysctl, "IU", "RSS key"); 2355 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2356 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2357 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2358 #endif 2359 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2360 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2361 "RNDIS offered packet transmission aggregation size limit"); 2362 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2363 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2364 "RNDIS offered packet transmission aggregation count limit"); 2365 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2366 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2367 "RNDIS packet transmission aggregation alignment"); 2368 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2369 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2370 hn_txagg_size_sysctl, "I", 2371 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2372 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2373 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2374 hn_txagg_pkts_sysctl, "I", 2375 "Packet transmission aggregation packets, " 2376 "0 -- disable, -1 -- auto"); 2377 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2378 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2379 hn_polling_sysctl, "I", 2380 "Polling frequency: [100,1000000], 0 disable polling"); 2381 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2382 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2383 hn_vf_sysctl, "A", "Virtual Function's name"); 2384 if (!hn_xpnt_vf) { 2385 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2386 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2387 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2388 } else { 2389 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2390 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2391 hn_xpnt_vf_enabled_sysctl, "I", 2392 "Transparent VF enabled"); 2393 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2394 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2395 hn_xpnt_vf_accbpf_sysctl, "I", 2396 "Accurate BPF for transparent VF"); 2397 } 2398 2399 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", 2400 CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", 2401 "switch to rsc"); 2402 2403 /* 2404 * Setup the ifmedia, which has been initialized earlier. 2405 */ 2406 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2407 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2408 /* XXX ifmedia_set really should do this for us */ 2409 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2410 2411 /* 2412 * Setup the ifnet for this interface. 2413 */ 2414 2415 if_setbaudrate(ifp, IF_Gbps(10)); 2416 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); 2417 if_setioctlfn(ifp, hn_ioctl); 2418 if_setinitfn(ifp, hn_init); 2419 #ifdef HN_IFSTART_SUPPORT 2420 if (hn_use_if_start) { 2421 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2422 2423 if_setstartfn(ifp, hn_start); 2424 if_setsendqlen(ifp, qdepth); 2425 if_setsendqready(ifp); 2426 } else 2427 #endif 2428 { 2429 if_settransmitfn(ifp, hn_transmit); 2430 if_setqflushfn(ifp, hn_xmit_qflush); 2431 } 2432 2433 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); 2434 #ifdef foo 2435 /* We can't diff IPv6 packets from IPv4 packets on RX path. 
*/ 2436 if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); 2437 #endif 2438 if (sc->hn_caps & HN_CAP_VLAN) { 2439 /* XXX not sure about VLAN_MTU. */ 2440 if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); 2441 } 2442 2443 if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); 2444 if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) 2445 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); 2446 if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) 2447 if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); 2448 if (sc->hn_caps & HN_CAP_TSO4) { 2449 if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); 2450 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 2451 } 2452 if (sc->hn_caps & HN_CAP_TSO6) { 2453 if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); 2454 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 2455 } 2456 2457 /* Enable all available capabilities by default. */ 2458 if_setcapenable(ifp, if_getcapabilities(ifp)); 2459 2460 /* 2461 * Disable IPv6 TSO and TXCSUM by default, they still can 2462 * be enabled through SIOCSIFCAP. 2463 */ 2464 if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); 2465 if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); 2466 2467 if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { 2468 /* 2469 * Lock hn_set_tso_maxsize() to simplify its 2470 * internal logic. 2471 */ 2472 HN_LOCK(sc); 2473 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2474 HN_UNLOCK(sc); 2475 if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); 2476 if_sethwtsomaxsegsize(ifp, PAGE_SIZE); 2477 } 2478 2479 ether_ifattach(ifp, eaddr); 2480 2481 if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2482 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2483 if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); 2484 } 2485 if (mtu < ETHERMTU) { 2486 2487 if_setmtu(ifp, mtu); 2488 } 2489 2490 /* Inform the upper layer about the long frame support. */ 2491 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); 2492 2493 /* 2494 * Kick off link status check. 2495 */ 2496 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2497 hn_update_link_status(sc); 2498 2499 if (!hn_xpnt_vf) { 2500 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2501 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2502 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2503 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2504 } else { 2505 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2506 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2507 } 2508 2509 /* 2510 * NOTE: 2511 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2512 * since interface's LLADDR is needed; interface LLADDR is not 2513 * available when ifnet_arrival event is triggered. 2514 */ 2515 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2516 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2517 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2518 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2519 2520 return (0); 2521 failed: 2522 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2523 hn_synth_detach(sc); 2524 hn_detach(dev); 2525 return (error); 2526 } 2527 2528 static int 2529 hn_detach(device_t dev) 2530 { 2531 struct hn_softc *sc = device_get_softc(dev); 2532 if_t ifp = sc->hn_ifp, vf_ifp; 2533 2534 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2535 /* 2536 * In case that the vmbus missed the orphan handler 2537 * installation. 
2538 */ 2539 vmbus_xact_ctx_orphan(sc->hn_xact); 2540 } 2541 2542 if (sc->hn_ifaddr_evthand != NULL) 2543 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); 2544 if (sc->hn_ifnet_evthand != NULL) 2545 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); 2546 if (sc->hn_ifnet_atthand != NULL) { 2547 EVENTHANDLER_DEREGISTER(ether_ifattach_event, 2548 sc->hn_ifnet_atthand); 2549 } 2550 if (sc->hn_ifnet_dethand != NULL) { 2551 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2552 sc->hn_ifnet_dethand); 2553 } 2554 if (sc->hn_ifnet_lnkhand != NULL) 2555 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); 2556 2557 vf_ifp = sc->hn_vf_ifp; 2558 __compiler_membar(); 2559 if (vf_ifp != NULL) 2560 hn_ifnet_detevent(sc, vf_ifp); 2561 2562 if (device_is_attached(dev)) { 2563 HN_LOCK(sc); 2564 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 2565 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 2566 hn_stop(sc, true); 2567 /* 2568 * NOTE: 2569 * hn_stop() only suspends data, so managment 2570 * stuffs have to be suspended manually here. 2571 */ 2572 hn_suspend_mgmt(sc); 2573 hn_synth_detach(sc); 2574 } 2575 HN_UNLOCK(sc); 2576 ether_ifdetach(ifp); 2577 } 2578 2579 ifmedia_removeall(&sc->hn_media); 2580 hn_destroy_rx_data(sc); 2581 hn_destroy_tx_data(sc); 2582 2583 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { 2584 int i; 2585 2586 for (i = 0; i < hn_tx_taskq_cnt; ++i) 2587 taskqueue_free(sc->hn_tx_taskqs[i]); 2588 free(sc->hn_tx_taskqs, M_DEVBUF); 2589 } 2590 taskqueue_free(sc->hn_mgmt_taskq0); 2591 if (sc->hn_vf_taskq != NULL) 2592 taskqueue_free(sc->hn_vf_taskq); 2593 2594 if (sc->hn_xact != NULL) { 2595 /* 2596 * Uninstall the orphan handler _before_ the xact is 2597 * destructed. 2598 */ 2599 vmbus_chan_unset_orphan(sc->hn_prichan); 2600 vmbus_xact_ctx_destroy(sc->hn_xact); 2601 } 2602 2603 if_free(ifp); 2604 2605 HN_LOCK_DESTROY(sc); 2606 rm_destroy(&sc->hn_vf_lock); 2607 return (0); 2608 } 2609 2610 static int 2611 hn_shutdown(device_t dev) 2612 { 2613 2614 return (0); 2615 } 2616 2617 static void 2618 hn_link_status(struct hn_softc *sc) 2619 { 2620 uint32_t link_status; 2621 int error; 2622 2623 error = hn_rndis_get_linkstatus(sc, &link_status); 2624 if (error) { 2625 /* XXX what to do? */ 2626 return; 2627 } 2628 2629 if (link_status == NDIS_MEDIA_STATE_CONNECTED) 2630 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; 2631 else 2632 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2633 if_link_state_change(sc->hn_ifp, 2634 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? 2635 LINK_STATE_UP : LINK_STATE_DOWN); 2636 } 2637 2638 static void 2639 hn_link_taskfunc(void *xsc, int pending __unused) 2640 { 2641 struct hn_softc *sc = xsc; 2642 2643 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 2644 return; 2645 hn_link_status(sc); 2646 } 2647 2648 static void 2649 hn_netchg_init_taskfunc(void *xsc, int pending __unused) 2650 { 2651 struct hn_softc *sc = xsc; 2652 2653 /* Prevent any link status checks from running. */ 2654 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; 2655 2656 /* 2657 * Fake up a [link down --> link up] state change; 5 seconds 2658 * delay is used, which closely simulates miibus reaction 2659 * upon link down event. 
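	 * The link-down is reported immediately below; after the delay,
	 * hn_netchg_status_taskfunc() clears HN_LINK_FLAG_NETCHG and
	 * reports the then-current link state.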
2660 */ 2661 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; 2662 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); 2663 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, 2664 &sc->hn_netchg_status, 5 * hz); 2665 } 2666 2667 static void 2668 hn_netchg_status_taskfunc(void *xsc, int pending __unused) 2669 { 2670 struct hn_softc *sc = xsc; 2671 2672 /* Re-allow link status checks. */ 2673 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; 2674 hn_link_status(sc); 2675 } 2676 2677 static void 2678 hn_update_link_status(struct hn_softc *sc) 2679 { 2680 2681 if (sc->hn_mgmt_taskq != NULL) 2682 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); 2683 } 2684 2685 static void 2686 hn_change_network(struct hn_softc *sc) 2687 { 2688 2689 if (sc->hn_mgmt_taskq != NULL) 2690 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); 2691 } 2692 2693 static __inline int 2694 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, 2695 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) 2696 { 2697 struct mbuf *m = *m_head; 2698 int error; 2699 2700 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); 2701 2702 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, 2703 m, segs, nsegs, BUS_DMA_NOWAIT); 2704 if (error == EFBIG) { 2705 struct mbuf *m_new; 2706 2707 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); 2708 if (m_new == NULL) 2709 return ENOBUFS; 2710 else 2711 *m_head = m = m_new; 2712 txr->hn_tx_collapsed++; 2713 2714 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, 2715 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); 2716 } 2717 if (!error) { 2718 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, 2719 BUS_DMASYNC_PREWRITE); 2720 txd->flags |= HN_TXD_FLAG_DMAMAP; 2721 } 2722 return error; 2723 } 2724 2725 static __inline int 2726 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) 2727 { 2728 2729 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, 2730 ("put an onlist txd %#x", txd->flags)); 2731 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2732 ("put an onagg txd %#x", txd->flags)); 2733 2734 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2735 if (atomic_fetchadd_int(&txd->refs, -1) != 1) 2736 return 0; 2737 2738 if (!STAILQ_EMPTY(&txd->agg_list)) { 2739 struct hn_txdesc *tmp_txd; 2740 2741 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { 2742 int freed __diagused; 2743 2744 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), 2745 ("resursive aggregation on aggregated txdesc")); 2746 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), 2747 ("not aggregated txdesc")); 2748 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2749 ("aggregated txdesc uses dmamap")); 2750 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 2751 ("aggregated txdesc consumes " 2752 "chimney sending buffer")); 2753 KASSERT(tmp_txd->chim_size == 0, 2754 ("aggregated txdesc has non-zero " 2755 "chimney sending size")); 2756 2757 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); 2758 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; 2759 freed = hn_txdesc_put(txr, tmp_txd); 2760 KASSERT(freed, ("failed to free aggregated txdesc")); 2761 } 2762 } 2763 2764 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { 2765 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, 2766 ("chim txd uses dmamap")); 2767 hn_chim_free(txr->hn_sc, txd->chim_index); 2768 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 2769 txd->chim_size = 0; 2770 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { 2771 bus_dmamap_sync(txr->hn_tx_data_dtag, 2772 txd->data_dmap, BUS_DMASYNC_POSTWRITE); 2773 
bus_dmamap_unload(txr->hn_tx_data_dtag, 2774 txd->data_dmap); 2775 txd->flags &= ~HN_TXD_FLAG_DMAMAP; 2776 } 2777 2778 if (txd->m != NULL) { 2779 m_freem(txd->m); 2780 txd->m = NULL; 2781 } 2782 2783 txd->flags |= HN_TXD_FLAG_ONLIST; 2784 #ifndef HN_USE_TXDESC_BUFRING 2785 mtx_lock_spin(&txr->hn_txlist_spin); 2786 KASSERT(txr->hn_txdesc_avail >= 0 && 2787 txr->hn_txdesc_avail < txr->hn_txdesc_cnt, 2788 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2789 txr->hn_txdesc_avail++; 2790 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2791 mtx_unlock_spin(&txr->hn_txlist_spin); 2792 #else /* HN_USE_TXDESC_BUFRING */ 2793 #ifdef HN_DEBUG 2794 atomic_add_int(&txr->hn_txdesc_avail, 1); 2795 #endif 2796 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2797 #endif /* !HN_USE_TXDESC_BUFRING */ 2798 2799 return 1; 2800 } 2801 2802 static __inline struct hn_txdesc * 2803 hn_txdesc_get(struct hn_tx_ring *txr) 2804 { 2805 struct hn_txdesc *txd; 2806 2807 #ifndef HN_USE_TXDESC_BUFRING 2808 mtx_lock_spin(&txr->hn_txlist_spin); 2809 txd = SLIST_FIRST(&txr->hn_txlist); 2810 if (txd != NULL) { 2811 KASSERT(txr->hn_txdesc_avail > 0, 2812 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2813 txr->hn_txdesc_avail--; 2814 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2815 } 2816 mtx_unlock_spin(&txr->hn_txlist_spin); 2817 #else 2818 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2819 #endif 2820 2821 if (txd != NULL) { 2822 #ifdef HN_USE_TXDESC_BUFRING 2823 #ifdef HN_DEBUG 2824 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2825 #endif 2826 #endif /* HN_USE_TXDESC_BUFRING */ 2827 KASSERT(txd->m == NULL && txd->refs == 0 && 2828 STAILQ_EMPTY(&txd->agg_list) && 2829 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2830 txd->chim_size == 0 && 2831 (txd->flags & HN_TXD_FLAG_ONLIST) && 2832 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2833 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2834 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2835 txd->refs = 1; 2836 } 2837 return txd; 2838 } 2839 2840 static __inline void 2841 hn_txdesc_hold(struct hn_txdesc *txd) 2842 { 2843 2844 /* 0->1 transition will never work */ 2845 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2846 atomic_add_int(&txd->refs, 1); 2847 } 2848 2849 static __inline void 2850 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2851 { 2852 2853 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2854 ("recursive aggregation on aggregating txdesc")); 2855 2856 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2857 ("already aggregated")); 2858 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2859 ("recursive aggregation on to-be-aggregated txdesc")); 2860 2861 txd->flags |= HN_TXD_FLAG_ONAGG; 2862 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2863 } 2864 2865 static bool 2866 hn_tx_ring_pending(struct hn_tx_ring *txr) 2867 { 2868 bool pending = false; 2869 2870 #ifndef HN_USE_TXDESC_BUFRING 2871 mtx_lock_spin(&txr->hn_txlist_spin); 2872 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2873 pending = true; 2874 mtx_unlock_spin(&txr->hn_txlist_spin); 2875 #else 2876 if (!buf_ring_full(txr->hn_txdesc_br)) 2877 pending = true; 2878 #endif 2879 return (pending); 2880 } 2881 2882 static __inline void 2883 hn_txeof(struct hn_tx_ring *txr) 2884 { 2885 txr->hn_has_txeof = 0; 2886 txr->hn_txeof(txr); 2887 } 2888 2889 static void 2890 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2891 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2892 { 2893 struct hn_txdesc *txd = sndc->hn_cbarg; 2894 struct 
hn_tx_ring *txr; 2895 2896 txr = txd->txr; 2897 KASSERT(txr->hn_chan == chan, 2898 ("channel mismatch, on chan%u, should be chan%u", 2899 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2900 2901 txr->hn_has_txeof = 1; 2902 hn_txdesc_put(txr, txd); 2903 2904 ++txr->hn_txdone_cnt; 2905 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2906 txr->hn_txdone_cnt = 0; 2907 if (txr->hn_oactive) 2908 hn_txeof(txr); 2909 } 2910 } 2911 2912 static void 2913 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2914 { 2915 #if defined(INET) || defined(INET6) 2916 struct epoch_tracker et; 2917 2918 NET_EPOCH_ENTER(et); 2919 tcp_lro_flush_all(&rxr->hn_lro); 2920 NET_EPOCH_EXIT(et); 2921 #endif 2922 2923 /* 2924 * NOTE: 2925 * 'txr' could be NULL, if multiple channels and 2926 * ifnet.if_start method are enabled. 2927 */ 2928 if (txr == NULL || !txr->hn_has_txeof) 2929 return; 2930 2931 txr->hn_txdone_cnt = 0; 2932 hn_txeof(txr); 2933 } 2934 2935 static __inline uint32_t 2936 hn_rndis_pktmsg_offset(uint32_t ofs) 2937 { 2938 2939 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2940 ("invalid RNDIS packet msg offset %u", ofs)); 2941 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2942 } 2943 2944 static __inline void * 2945 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2946 size_t pi_dlen, uint32_t pi_type) 2947 { 2948 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2949 struct rndis_pktinfo *pi; 2950 2951 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2952 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2953 2954 /* 2955 * Per-packet-info does not move; it only grows. 2956 * 2957 * NOTE: 2958 * rm_pktinfooffset in this phase counts from the beginning 2959 * of rndis_packet_msg. 2960 */ 2961 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2962 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2963 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2964 pkt->rm_pktinfolen); 2965 pkt->rm_pktinfolen += pi_size; 2966 2967 pi->rm_size = pi_size; 2968 pi->rm_type = pi_type; 2969 pi->rm_internal = 0; 2970 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2971 2972 return (pi->rm_data); 2973 } 2974 2975 static __inline int 2976 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) 2977 { 2978 struct hn_txdesc *txd; 2979 struct mbuf *m; 2980 int error, pkts; 2981 2982 txd = txr->hn_agg_txd; 2983 KASSERT(txd != NULL, ("no aggregate txdesc")); 2984 2985 /* 2986 * Since hn_txpkt() will reset this temporary stat, save 2987 * it now, so that oerrors can be updated properly, if 2988 * hn_txpkt() ever fails. 2989 */ 2990 pkts = txr->hn_stat_pkts; 2991 2992 /* 2993 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2994 * failure, save it for later freeing, if hn_txpkt() ever 2995 * fails. 2996 */ 2997 m = txd->m; 2998 error = hn_txpkt(ifp, txr, txd); 2999 if (__predict_false(error)) { 3000 /* txd is freed, but m is not. */ 3001 m_freem(m); 3002 3003 txr->hn_flush_failed++; 3004 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 3005 } 3006 3007 /* Reset all aggregation states. 
*/ 3008 txr->hn_agg_txd = NULL; 3009 txr->hn_agg_szleft = 0; 3010 txr->hn_agg_pktleft = 0; 3011 txr->hn_agg_prevpkt = NULL; 3012 3013 return (error); 3014 } 3015 3016 static void * 3017 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3018 int pktsize) 3019 { 3020 void *chim; 3021 3022 if (txr->hn_agg_txd != NULL) { 3023 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 3024 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 3025 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 3026 int olen; 3027 3028 /* 3029 * Update the previous RNDIS packet's total length, 3030 * it can be increased due to the mandatory alignment 3031 * padding for this RNDIS packet. And update the 3032 * aggregating txdesc's chimney sending buffer size 3033 * accordingly. 3034 * 3035 * XXX 3036 * Zero-out the padding, as required by the RNDIS spec. 3037 */ 3038 olen = pkt->rm_len; 3039 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 3040 agg_txd->chim_size += pkt->rm_len - olen; 3041 3042 /* Link this txdesc to the parent. */ 3043 hn_txdesc_agg(agg_txd, txd); 3044 3045 chim = (uint8_t *)pkt + pkt->rm_len; 3046 /* Save the current packet for later fixup. */ 3047 txr->hn_agg_prevpkt = chim; 3048 3049 txr->hn_agg_pktleft--; 3050 txr->hn_agg_szleft -= pktsize; 3051 if (txr->hn_agg_szleft <= 3052 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3053 /* 3054 * Probably can't aggregate more packets, 3055 * flush this aggregating txdesc proactively. 3056 */ 3057 txr->hn_agg_pktleft = 0; 3058 } 3059 /* Done! */ 3060 return (chim); 3061 } 3062 hn_flush_txagg(ifp, txr); 3063 } 3064 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3065 3066 txr->hn_tx_chimney_tried++; 3067 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3068 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3069 return (NULL); 3070 txr->hn_tx_chimney++; 3071 3072 chim = txr->hn_sc->hn_chim + 3073 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3074 3075 if (txr->hn_agg_pktmax > 1 && 3076 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3077 txr->hn_agg_txd = txd; 3078 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3079 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3080 txr->hn_agg_prevpkt = chim; 3081 } 3082 return (chim); 3083 } 3084 3085 /* 3086 * NOTE: 3087 * If this function fails, then both txd and m_head0 will be freed. 3088 */ 3089 static int 3090 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3091 struct mbuf **m_head0) 3092 { 3093 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3094 int error, nsegs, i; 3095 struct mbuf *m_head = *m_head0; 3096 struct rndis_packet_msg *pkt; 3097 uint32_t *pi_data; 3098 void *chim = NULL; 3099 int pkt_hlen, pkt_size; 3100 3101 pkt = txd->rndis_pkt; 3102 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3103 if (pkt_size < txr->hn_chim_size) { 3104 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3105 if (chim != NULL) 3106 pkt = chim; 3107 } else { 3108 if (txr->hn_agg_txd != NULL) 3109 hn_flush_txagg(ifp, txr); 3110 } 3111 3112 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3113 pkt->rm_len = m_head->m_pkthdr.len; 3114 pkt->rm_dataoffset = 0; 3115 pkt->rm_datalen = m_head->m_pkthdr.len; 3116 pkt->rm_oobdataoffset = 0; 3117 pkt->rm_oobdatalen = 0; 3118 pkt->rm_oobdataelements = 0; 3119 pkt->rm_pktinfooffset = sizeof(*pkt); 3120 pkt->rm_pktinfolen = 0; 3121 pkt->rm_vchandle = 0; 3122 pkt->rm_reserved = 0; 3123 3124 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3125 /* 3126 * Set the hash value for this packet. 
3127 */ 3128 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3129 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3130 3131 if (M_HASHTYPE_ISHASH(m_head)) 3132 /* 3133 * The flowid field contains the hash value host 3134 * set in the rx queue if it is a ip forwarding pkt. 3135 * Set the same hash value so host can send on the 3136 * cpu it was received. 3137 */ 3138 *pi_data = m_head->m_pkthdr.flowid; 3139 else 3140 /* 3141 * Otherwise just put the tx queue index. 3142 */ 3143 *pi_data = txr->hn_tx_idx; 3144 } 3145 3146 if (m_head->m_flags & M_VLANTAG) { 3147 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3148 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3149 *pi_data = NDIS_VLAN_INFO_MAKE( 3150 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3151 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3152 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3153 } 3154 3155 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3156 #if defined(INET6) || defined(INET) 3157 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3158 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3159 #ifdef INET 3160 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3161 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3162 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3163 m_head->m_pkthdr.tso_segsz); 3164 } 3165 #endif 3166 #if defined(INET6) && defined(INET) 3167 else 3168 #endif 3169 #ifdef INET6 3170 { 3171 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3172 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3173 m_head->m_pkthdr.tso_segsz); 3174 } 3175 #endif 3176 #endif /* INET6 || INET */ 3177 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3178 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3179 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3180 if (m_head->m_pkthdr.csum_flags & 3181 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3182 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3183 } else { 3184 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3185 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3186 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3187 } 3188 3189 if (m_head->m_pkthdr.csum_flags & 3190 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3191 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3192 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3193 } else if (m_head->m_pkthdr.csum_flags & 3194 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3195 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3196 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3197 } 3198 } 3199 3200 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3201 /* Fixup RNDIS packet message total length */ 3202 pkt->rm_len += pkt_hlen; 3203 /* Convert RNDIS packet message offsets */ 3204 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3205 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3206 3207 /* 3208 * Fast path: Chimney sending. 
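	 * The RNDIS message and payload are copied into the chimney
	 * (send) buffer and sent via hn_txpkt_chim() without a GPA page
	 * list (hn_gpa_cnt is 0); the slower path below instead builds a
	 * GPA list over the mbuf segments and uses hn_txpkt_sglist().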
3209 */ 3210 if (chim != NULL) { 3211 struct hn_txdesc *tgt_txd = txd; 3212 3213 if (txr->hn_agg_txd != NULL) { 3214 tgt_txd = txr->hn_agg_txd; 3215 #ifdef INVARIANTS 3216 *m_head0 = NULL; 3217 #endif 3218 } 3219 3220 KASSERT(pkt == chim, 3221 ("RNDIS pkt not in chimney sending buffer")); 3222 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3223 ("chimney sending buffer is not used")); 3224 tgt_txd->chim_size += pkt->rm_len; 3225 3226 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3227 ((uint8_t *)chim) + pkt_hlen); 3228 3229 txr->hn_gpa_cnt = 0; 3230 txr->hn_sendpkt = hn_txpkt_chim; 3231 goto done; 3232 } 3233 3234 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3235 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3236 ("chimney buffer is used")); 3237 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3238 3239 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3240 if (__predict_false(error)) { 3241 int freed __diagused; 3242 3243 /* 3244 * This mbuf is not linked w/ the txd yet, so free it now. 3245 */ 3246 m_freem(m_head); 3247 *m_head0 = NULL; 3248 3249 freed = hn_txdesc_put(txr, txd); 3250 KASSERT(freed != 0, 3251 ("fail to free txd upon txdma error")); 3252 3253 txr->hn_txdma_failed++; 3254 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3255 return error; 3256 } 3257 *m_head0 = m_head; 3258 3259 /* +1 RNDIS packet message */ 3260 txr->hn_gpa_cnt = nsegs + 1; 3261 3262 /* send packet with page buffer */ 3263 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3264 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3265 txr->hn_gpa[0].gpa_len = pkt_hlen; 3266 3267 /* 3268 * Fill the page buffers with mbuf info after the page 3269 * buffer for RNDIS packet message. 3270 */ 3271 for (i = 0; i < nsegs; ++i) { 3272 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3273 3274 gpa->gpa_page = atop(segs[i].ds_addr); 3275 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3276 gpa->gpa_len = segs[i].ds_len; 3277 } 3278 3279 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3280 txd->chim_size = 0; 3281 txr->hn_sendpkt = hn_txpkt_sglist; 3282 done: 3283 txd->m = m_head; 3284 3285 /* Set the completion routine */ 3286 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3287 3288 /* Update temporary stats for later use. */ 3289 txr->hn_stat_pkts++; 3290 txr->hn_stat_size += m_head->m_pkthdr.len; 3291 if (m_head->m_flags & M_MCAST) 3292 txr->hn_stat_mcasts++; 3293 3294 return 0; 3295 } 3296 3297 /* 3298 * NOTE: 3299 * If this function fails, then txd will be freed, but the mbuf 3300 * associated w/ the txd will _not_ be freed. 3301 */ 3302 static int 3303 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3304 { 3305 int error, send_failed = 0, has_bpf; 3306 3307 again: 3308 has_bpf = bpf_peers_present(if_getbpf(ifp)); 3309 if (has_bpf) { 3310 /* 3311 * Make sure that this txd and any aggregated txds are not 3312 * freed before ETHER_BPF_MTAP. 
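		 * The extra reference taken by hn_txdesc_hold() below is
		 * dropped by the matching hn_txdesc_put() once the BPF
		 * taps are done.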
3313 */ 3314 hn_txdesc_hold(txd); 3315 } 3316 error = txr->hn_sendpkt(txr, txd); 3317 if (!error) { 3318 if (has_bpf) { 3319 const struct hn_txdesc *tmp_txd; 3320 3321 ETHER_BPF_MTAP(ifp, txd->m); 3322 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3323 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3324 } 3325 3326 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3327 #ifdef HN_IFSTART_SUPPORT 3328 if (!hn_use_if_start) 3329 #endif 3330 { 3331 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3332 txr->hn_stat_size); 3333 if (txr->hn_stat_mcasts != 0) { 3334 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3335 txr->hn_stat_mcasts); 3336 } 3337 } 3338 txr->hn_pkts += txr->hn_stat_pkts; 3339 txr->hn_sends++; 3340 } 3341 if (has_bpf) 3342 hn_txdesc_put(txr, txd); 3343 3344 if (__predict_false(error)) { 3345 int freed __diagused; 3346 3347 /* 3348 * This should "really rarely" happen. 3349 * 3350 * XXX Too many RX to be acked or too many sideband 3351 * commands to run? Ask netvsc_channel_rollup() 3352 * to kick start later. 3353 */ 3354 txr->hn_has_txeof = 1; 3355 if (!send_failed) { 3356 txr->hn_send_failed++; 3357 send_failed = 1; 3358 /* 3359 * Try sending again after set hn_has_txeof; 3360 * in case that we missed the last 3361 * netvsc_channel_rollup(). 3362 */ 3363 goto again; 3364 } 3365 if_printf(ifp, "send failed\n"); 3366 3367 /* 3368 * Caller will perform further processing on the 3369 * associated mbuf, so don't free it in hn_txdesc_put(); 3370 * only unload it from the DMA map in hn_txdesc_put(), 3371 * if it was loaded. 3372 */ 3373 txd->m = NULL; 3374 freed = hn_txdesc_put(txr, txd); 3375 KASSERT(freed != 0, 3376 ("fail to free txd upon send error")); 3377 3378 txr->hn_send_failed++; 3379 } 3380 3381 /* Reset temporary stats, after this sending is done. */ 3382 txr->hn_stat_size = 0; 3383 txr->hn_stat_pkts = 0; 3384 txr->hn_stat_mcasts = 0; 3385 3386 return (error); 3387 } 3388 3389 /* 3390 * Append the specified data to the indicated mbuf chain, 3391 * Extend the mbuf chain if the new data does not fit in 3392 * existing space. 3393 * 3394 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. 3395 * There should be an equivalent in the kernel mbuf code, 3396 * but there does not appear to be one yet. 3397 * 3398 * Differs from m_append() in that additional mbufs are 3399 * allocated with cluster size MJUMPAGESIZE, and filled 3400 * accordingly. 3401 * 3402 * Return the last mbuf in the chain or NULL if failed to 3403 * allocate new mbuf. 3404 */ 3405 static struct mbuf * 3406 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) 3407 { 3408 struct mbuf *m, *n; 3409 int remainder, space; 3410 3411 for (m = m0; m->m_next != NULL; m = m->m_next) 3412 ; 3413 remainder = len; 3414 space = M_TRAILINGSPACE(m); 3415 if (space > 0) { 3416 /* 3417 * Copy into available space. 3418 */ 3419 if (space > remainder) 3420 space = remainder; 3421 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 3422 m->m_len += space; 3423 cp += space; 3424 remainder -= space; 3425 } 3426 while (remainder > 0) { 3427 /* 3428 * Allocate a new mbuf; could check space 3429 * and allocate a cluster instead. 
3430 */ 3431 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); 3432 if (n == NULL) 3433 return NULL; 3434 n->m_len = min(MJUMPAGESIZE, remainder); 3435 bcopy(cp, mtod(n, caddr_t), n->m_len); 3436 cp += n->m_len; 3437 remainder -= n->m_len; 3438 m->m_next = n; 3439 m = n; 3440 } 3441 3442 return m; 3443 } 3444 3445 #if defined(INET) || defined(INET6) 3446 static __inline int 3447 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) 3448 { 3449 if (hn_lro_mbufq_depth) { 3450 tcp_lro_queue_mbuf(lc, m); 3451 return 0; 3452 } 3453 return tcp_lro_rx(lc, m, 0); 3454 } 3455 #endif 3456 3457 static int 3458 hn_rxpkt(struct hn_rx_ring *rxr) 3459 { 3460 if_t ifp, hn_ifp = rxr->hn_ifp; 3461 struct mbuf *m_new, *n; 3462 int size, do_lro = 0, do_csum = 1, is_vf = 0; 3463 int hash_type = M_HASHTYPE_NONE; 3464 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; 3465 int i; 3466 3467 ifp = hn_ifp; 3468 if (rxr->hn_rxvf_ifp != NULL) { 3469 /* 3470 * Non-transparent mode VF; pretend this packet is from 3471 * the VF. 3472 */ 3473 ifp = rxr->hn_rxvf_ifp; 3474 is_vf = 1; 3475 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { 3476 /* Transparent mode VF. */ 3477 is_vf = 1; 3478 } 3479 3480 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { 3481 /* 3482 * NOTE: 3483 * See the NOTE of hn_rndis_init_fixat(). This 3484 * function can be reached, immediately after the 3485 * RNDIS is initialized but before the ifnet is 3486 * setup on the hn_attach() path; drop the unexpected 3487 * packets. 3488 */ 3489 return (0); 3490 } 3491 3492 if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { 3493 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); 3494 return (0); 3495 } 3496 3497 if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { 3498 m_new = m_gethdr(M_NOWAIT, MT_DATA); 3499 if (m_new == NULL) { 3500 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3501 return (0); 3502 } 3503 memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], 3504 rxr->rsc.frag_len[0]); 3505 m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; 3506 } else { 3507 /* 3508 * Get an mbuf with a cluster. For packets 2K or less, 3509 * get a standard 2K cluster. For anything larger, get a 3510 * 4K cluster. Any buffers larger than 4K can cause problems 3511 * if looped around to the Hyper-V TX channel, so avoid them. 
3512 */ 3513 size = MCLBYTES; 3514 if (rxr->rsc.pktlen > MCLBYTES) { 3515 /* 4096 */ 3516 size = MJUMPAGESIZE; 3517 } 3518 3519 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); 3520 if (m_new == NULL) { 3521 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3522 return (0); 3523 } 3524 3525 n = m_new; 3526 for (i = 0; i < rxr->rsc.cnt; i++) { 3527 n = hv_m_append(n, rxr->rsc.frag_len[i], 3528 rxr->rsc.frag_data[i]); 3529 if (n == NULL) { 3530 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); 3531 return (0); 3532 } else { 3533 m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; 3534 } 3535 } 3536 } 3537 if (rxr->rsc.pktlen <= MHLEN) 3538 rxr->hn_small_pkts++; 3539 3540 m_new->m_pkthdr.rcvif = ifp; 3541 3542 if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) 3543 do_csum = 0; 3544 3545 /* receive side checksum offload */ 3546 if (rxr->rsc.csum_info != NULL) { 3547 /* IP csum offload */ 3548 if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { 3549 m_new->m_pkthdr.csum_flags |= 3550 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3551 rxr->hn_csum_ip++; 3552 } 3553 3554 /* TCP/UDP csum offload */ 3555 if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | 3556 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { 3557 m_new->m_pkthdr.csum_flags |= 3558 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3559 m_new->m_pkthdr.csum_data = 0xffff; 3560 if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) 3561 rxr->hn_csum_tcp++; 3562 else 3563 rxr->hn_csum_udp++; 3564 } 3565 3566 /* 3567 * XXX 3568 * As of this write (Oct 28th, 2016), host side will turn 3569 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so 3570 * the do_lro setting here is actually _not_ accurate. We 3571 * depend on the RSS hash type check to reset do_lro. 3572 */ 3573 if ((*(rxr->rsc.csum_info) & 3574 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == 3575 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) 3576 do_lro = 1; 3577 } else { 3578 hn_rxpkt_proto(m_new, &l3proto, &l4proto); 3579 if (l3proto == ETHERTYPE_IP) { 3580 if (l4proto == IPPROTO_TCP) { 3581 if (do_csum && 3582 (rxr->hn_trust_hcsum & 3583 HN_TRUST_HCSUM_TCP)) { 3584 rxr->hn_csum_trusted++; 3585 m_new->m_pkthdr.csum_flags |= 3586 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3587 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3588 m_new->m_pkthdr.csum_data = 0xffff; 3589 } 3590 do_lro = 1; 3591 } else if (l4proto == IPPROTO_UDP) { 3592 if (do_csum && 3593 (rxr->hn_trust_hcsum & 3594 HN_TRUST_HCSUM_UDP)) { 3595 rxr->hn_csum_trusted++; 3596 m_new->m_pkthdr.csum_flags |= 3597 (CSUM_IP_CHECKED | CSUM_IP_VALID | 3598 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 3599 m_new->m_pkthdr.csum_data = 0xffff; 3600 } 3601 } else if (l4proto != IPPROTO_DONE && do_csum && 3602 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { 3603 rxr->hn_csum_trusted++; 3604 m_new->m_pkthdr.csum_flags |= 3605 (CSUM_IP_CHECKED | CSUM_IP_VALID); 3606 } 3607 } 3608 } 3609 3610 if (rxr->rsc.vlan_info != NULL) { 3611 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( 3612 NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), 3613 NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), 3614 NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); 3615 m_new->m_flags |= M_VLANTAG; 3616 } 3617 3618 /* 3619 * If VF is activated (tranparent/non-transparent mode does not 3620 * matter here). 3621 * 3622 * - Disable LRO 3623 * 3624 * hn(4) will only receive broadcast packets, multicast packets, 3625 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these 3626 * packet types. 
3627 * 3628 * For non-transparent, we definitely _cannot_ enable LRO at 3629 * all, since the LRO flush will use hn(4) as the receiving 3630 * interface; i.e. hn_ifp->if_input(hn_ifp, m). 3631 */ 3632 if (is_vf) 3633 do_lro = 0; 3634 3635 /* 3636 * If VF is activated (tranparent/non-transparent mode does not 3637 * matter here), do _not_ mess with unsupported hash types or 3638 * functions. 3639 */ 3640 if (rxr->rsc.hash_info != NULL) { 3641 rxr->hn_rss_pkts++; 3642 m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); 3643 if (!is_vf) 3644 hash_type = M_HASHTYPE_OPAQUE_HASH; 3645 if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == 3646 NDIS_HASH_FUNCTION_TOEPLITZ) { 3647 uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & 3648 rxr->hn_mbuf_hash); 3649 3650 /* 3651 * NOTE: 3652 * do_lro is resetted, if the hash types are not TCP 3653 * related. See the comment in the above csum_flags 3654 * setup section. 3655 */ 3656 switch (type) { 3657 case NDIS_HASH_IPV4: 3658 hash_type = M_HASHTYPE_RSS_IPV4; 3659 do_lro = 0; 3660 break; 3661 3662 case NDIS_HASH_TCP_IPV4: 3663 hash_type = M_HASHTYPE_RSS_TCP_IPV4; 3664 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { 3665 int def_htype = M_HASHTYPE_OPAQUE_HASH; 3666 3667 if (is_vf) 3668 def_htype = M_HASHTYPE_NONE; 3669 3670 /* 3671 * UDP 4-tuple hash is delivered as 3672 * TCP 4-tuple hash. 3673 */ 3674 if (l3proto == ETHERTYPE_MAX) { 3675 hn_rxpkt_proto(m_new, 3676 &l3proto, &l4proto); 3677 } 3678 if (l3proto == ETHERTYPE_IP) { 3679 if (l4proto == IPPROTO_UDP && 3680 (rxr->hn_mbuf_hash & 3681 NDIS_HASH_UDP_IPV4_X)) { 3682 hash_type = 3683 M_HASHTYPE_RSS_UDP_IPV4; 3684 do_lro = 0; 3685 } else if (l4proto != 3686 IPPROTO_TCP) { 3687 hash_type = def_htype; 3688 do_lro = 0; 3689 } 3690 } else { 3691 hash_type = def_htype; 3692 do_lro = 0; 3693 } 3694 } 3695 break; 3696 3697 case NDIS_HASH_IPV6: 3698 hash_type = M_HASHTYPE_RSS_IPV6; 3699 do_lro = 0; 3700 break; 3701 3702 case NDIS_HASH_IPV6_EX: 3703 hash_type = M_HASHTYPE_RSS_IPV6_EX; 3704 do_lro = 0; 3705 break; 3706 3707 case NDIS_HASH_TCP_IPV6: 3708 hash_type = M_HASHTYPE_RSS_TCP_IPV6; 3709 break; 3710 3711 case NDIS_HASH_TCP_IPV6_EX: 3712 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; 3713 break; 3714 } 3715 } 3716 } else if (!is_vf) { 3717 m_new->m_pkthdr.flowid = rxr->hn_rx_idx; 3718 hash_type = M_HASHTYPE_OPAQUE; 3719 } 3720 M_HASHTYPE_SET(m_new, hash_type); 3721 3722 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); 3723 if (hn_ifp != ifp) { 3724 const struct ether_header *eh; 3725 3726 /* 3727 * Non-transparent mode VF is activated. 3728 */ 3729 3730 /* 3731 * Allow tapping on hn(4). 3732 */ 3733 ETHER_BPF_MTAP(hn_ifp, m_new); 3734 3735 /* 3736 * Update hn(4)'s stats. 3737 */ 3738 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); 3739 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); 3740 /* Checked at the beginning of this function. */ 3741 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); 3742 eh = mtod(m_new, struct ether_header *); 3743 if (ETHER_IS_MULTICAST(eh->ether_dhost)) 3744 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); 3745 } 3746 rxr->hn_pkts++; 3747 3748 if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { 3749 #if defined(INET) || defined(INET6) 3750 struct lro_ctrl *lro = &rxr->hn_lro; 3751 3752 if (lro->lro_cnt) { 3753 rxr->hn_lro_tried++; 3754 if (hn_lro_rx(lro, m_new) == 0) { 3755 /* DONE! 
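				 * LRO has consumed the mbuf (it was queued
				 * or aggregated), so it must not also be
				 * passed to if_input() below.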
*/ 3756 return 0; 3757 } 3758 } 3759 #endif 3760 } 3761 if_input(ifp, m_new); 3762 3763 return (0); 3764 } 3765 3766 static int 3767 hn_ioctl(if_t ifp, u_long cmd, caddr_t data) 3768 { 3769 struct hn_softc *sc = if_getsoftc(ifp); 3770 struct ifreq *ifr = (struct ifreq *)data, ifr_vf; 3771 if_t vf_ifp; 3772 int mask, error = 0; 3773 struct ifrsskey *ifrk; 3774 struct ifrsshash *ifrh; 3775 uint32_t mtu; 3776 3777 switch (cmd) { 3778 case SIOCSIFMTU: 3779 if (ifr->ifr_mtu > HN_MTU_MAX) { 3780 error = EINVAL; 3781 break; 3782 } 3783 3784 HN_LOCK(sc); 3785 3786 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3787 HN_UNLOCK(sc); 3788 break; 3789 } 3790 3791 if ((sc->hn_caps & HN_CAP_MTU) == 0) { 3792 /* Can't change MTU */ 3793 HN_UNLOCK(sc); 3794 error = EOPNOTSUPP; 3795 break; 3796 } 3797 3798 if (if_getmtu(ifp) == ifr->ifr_mtu) { 3799 HN_UNLOCK(sc); 3800 break; 3801 } 3802 3803 if (hn_xpnt_vf_isready(sc)) { 3804 vf_ifp = sc->hn_vf_ifp; 3805 ifr_vf = *ifr; 3806 strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), 3807 sizeof(ifr_vf.ifr_name)); 3808 error = ifhwioctl(SIOCSIFMTU,vf_ifp, 3809 (caddr_t)&ifr_vf, curthread); 3810 if (error) { 3811 HN_UNLOCK(sc); 3812 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", 3813 if_name(vf_ifp), ifr->ifr_mtu, error); 3814 break; 3815 } 3816 } 3817 3818 /* 3819 * Suspend this interface before the synthetic parts 3820 * are ripped. 3821 */ 3822 hn_suspend(sc); 3823 3824 /* 3825 * Detach the synthetics parts, i.e. NVS and RNDIS. 3826 */ 3827 hn_synth_detach(sc); 3828 3829 /* 3830 * Reattach the synthetic parts, i.e. NVS and RNDIS, 3831 * with the new MTU setting. 3832 */ 3833 error = hn_synth_attach(sc, ifr->ifr_mtu); 3834 if (error) { 3835 HN_UNLOCK(sc); 3836 break; 3837 } 3838 3839 error = hn_rndis_get_mtu(sc, &mtu); 3840 if (error) 3841 mtu = ifr->ifr_mtu; 3842 else if (bootverbose) 3843 if_printf(ifp, "RNDIS mtu %u\n", mtu); 3844 3845 /* 3846 * Commit the requested MTU, after the synthetic parts 3847 * have been successfully attached. 3848 */ 3849 if (mtu >= ifr->ifr_mtu) { 3850 mtu = ifr->ifr_mtu; 3851 } else { 3852 if_printf(ifp, "fixup mtu %d -> %u\n", 3853 ifr->ifr_mtu, mtu); 3854 } 3855 if_setmtu(ifp, mtu); 3856 3857 /* 3858 * Synthetic parts' reattach may change the chimney 3859 * sending size; update it. 3860 */ 3861 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) 3862 hn_set_chim_size(sc, sc->hn_chim_szmax); 3863 3864 /* 3865 * Make sure that various parameters based on MTU are 3866 * still valid, after the MTU change. 3867 */ 3868 hn_mtu_change_fixup(sc); 3869 3870 /* 3871 * All done! Resume the interface now. 3872 */ 3873 hn_resume(sc); 3874 3875 if ((sc->hn_flags & HN_FLAG_RXVF) || 3876 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 3877 /* 3878 * Since we have reattached the NVS part, 3879 * change the datapath to VF again; in case 3880 * that it is lost, after the NVS was detached. 3881 */ 3882 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); 3883 } 3884 3885 HN_UNLOCK(sc); 3886 break; 3887 3888 case SIOCSIFFLAGS: 3889 HN_LOCK(sc); 3890 3891 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3892 HN_UNLOCK(sc); 3893 break; 3894 } 3895 3896 if (hn_xpnt_vf_isready(sc)) 3897 hn_xpnt_vf_saveifflags(sc); 3898 3899 if (if_getflags(ifp) & IFF_UP) { 3900 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3901 /* 3902 * Caller meight hold mutex, e.g. 3903 * bpf; use busy-wait for the RNDIS 3904 * reply. 
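				 * (HN_NO_SLEEPING() below makes the RNDIS
				 * request poll for its completion instead
				 * of sleeping.)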
3905 */ 3906 HN_NO_SLEEPING(sc); 3907 hn_rxfilter_config(sc); 3908 HN_SLEEPING_OK(sc); 3909 3910 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 3911 error = hn_xpnt_vf_iocsetflags(sc); 3912 } else { 3913 hn_init_locked(sc); 3914 } 3915 } else { 3916 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 3917 hn_stop(sc, false); 3918 } 3919 sc->hn_if_flags = if_getflags(ifp); 3920 3921 HN_UNLOCK(sc); 3922 break; 3923 3924 case SIOCSIFCAP: 3925 HN_LOCK(sc); 3926 3927 if (hn_xpnt_vf_isready(sc)) { 3928 ifr_vf = *ifr; 3929 strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), 3930 sizeof(ifr_vf.ifr_name)); 3931 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); 3932 HN_UNLOCK(sc); 3933 break; 3934 } 3935 3936 /* 3937 * Fix up requested capabilities w/ supported capabilities, 3938 * since the supported capabilities could have been changed. 3939 */ 3940 mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ 3941 if_getcapenable(ifp); 3942 3943 if (mask & IFCAP_TXCSUM) { 3944 if_togglecapenable(ifp, IFCAP_TXCSUM); 3945 if (if_getcapenable(ifp) & IFCAP_TXCSUM) 3946 if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); 3947 else 3948 if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); 3949 } 3950 if (mask & IFCAP_TXCSUM_IPV6) { 3951 if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); 3952 if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) 3953 if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); 3954 else 3955 if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); 3956 } 3957 3958 /* TODO: flip RNDIS offload parameters for RXCSUM. */ 3959 if (mask & IFCAP_RXCSUM) 3960 if_togglecapenable(ifp, IFCAP_RXCSUM); 3961 #ifdef foo 3962 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 3963 if (mask & IFCAP_RXCSUM_IPV6) 3964 if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); 3965 #endif 3966 3967 if (mask & IFCAP_LRO) 3968 if_togglecapenable(ifp, IFCAP_LRO); 3969 3970 if (mask & IFCAP_TSO4) { 3971 if_togglecapenable(ifp, IFCAP_TSO4); 3972 if (if_getcapenable(ifp) & IFCAP_TSO4) 3973 if_sethwassistbits(ifp, CSUM_IP_TSO, 0); 3974 else 3975 if_sethwassistbits(ifp, 0, CSUM_IP_TSO); 3976 } 3977 if (mask & IFCAP_TSO6) { 3978 if_togglecapenable(ifp, IFCAP_TSO6); 3979 if (if_getcapenable(ifp) & IFCAP_TSO6) 3980 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); 3981 else 3982 if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); 3983 } 3984 3985 HN_UNLOCK(sc); 3986 break; 3987 3988 case SIOCADDMULTI: 3989 case SIOCDELMULTI: 3990 HN_LOCK(sc); 3991 3992 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3993 HN_UNLOCK(sc); 3994 break; 3995 } 3996 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { 3997 /* 3998 * Multicast uses mutex; use busy-wait for 3999 * the RNDIS reply. 4000 */ 4001 HN_NO_SLEEPING(sc); 4002 hn_rxfilter_config(sc); 4003 HN_SLEEPING_OK(sc); 4004 } 4005 4006 /* XXX vlan(4) style mcast addr maintenance */ 4007 if (hn_xpnt_vf_isready(sc)) { 4008 int old_if_flags; 4009 4010 old_if_flags = if_getflags(sc->hn_vf_ifp); 4011 hn_xpnt_vf_saveifflags(sc); 4012 4013 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 4014 ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & 4015 IFF_ALLMULTI)) 4016 error = hn_xpnt_vf_iocsetflags(sc); 4017 } 4018 4019 HN_UNLOCK(sc); 4020 break; 4021 4022 case SIOCSIFMEDIA: 4023 case SIOCGIFMEDIA: 4024 HN_LOCK(sc); 4025 if (hn_xpnt_vf_isready(sc)) { 4026 /* 4027 * SIOCGIFMEDIA expects ifmediareq, so don't 4028 * create and pass ifr_vf to the VF here; just 4029 * replace the ifr_name. 
4030 */ 4031 vf_ifp = sc->hn_vf_ifp; 4032 strlcpy(ifr->ifr_name, if_name(vf_ifp), 4033 sizeof(ifr->ifr_name)); 4034 error = ifhwioctl(cmd, vf_ifp, data, curthread); 4035 /* Restore the ifr_name. */ 4036 strlcpy(ifr->ifr_name, if_name(ifp), 4037 sizeof(ifr->ifr_name)); 4038 HN_UNLOCK(sc); 4039 break; 4040 } 4041 HN_UNLOCK(sc); 4042 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 4043 break; 4044 4045 case SIOCGIFRSSHASH: 4046 ifrh = (struct ifrsshash *)data; 4047 HN_LOCK(sc); 4048 if (sc->hn_rx_ring_inuse == 1) { 4049 HN_UNLOCK(sc); 4050 ifrh->ifrh_func = RSS_FUNC_NONE; 4051 ifrh->ifrh_types = 0; 4052 break; 4053 } 4054 4055 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4056 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 4057 else 4058 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 4059 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 4060 HN_UNLOCK(sc); 4061 break; 4062 4063 case SIOCGIFRSSKEY: 4064 ifrk = (struct ifrsskey *)data; 4065 HN_LOCK(sc); 4066 if (sc->hn_rx_ring_inuse == 1) { 4067 HN_UNLOCK(sc); 4068 ifrk->ifrk_func = RSS_FUNC_NONE; 4069 ifrk->ifrk_keylen = 0; 4070 break; 4071 } 4072 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 4073 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 4074 else 4075 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 4076 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 4077 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 4078 NDIS_HASH_KEYSIZE_TOEPLITZ); 4079 HN_UNLOCK(sc); 4080 break; 4081 4082 default: 4083 error = ether_ioctl(ifp, cmd, data); 4084 break; 4085 } 4086 return (error); 4087 } 4088 4089 static void 4090 hn_stop(struct hn_softc *sc, bool detaching) 4091 { 4092 if_t ifp = sc->hn_ifp; 4093 int i; 4094 4095 HN_LOCK_ASSERT(sc); 4096 4097 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4098 ("synthetic parts were not attached")); 4099 4100 /* Clear RUNNING bit ASAP. */ 4101 if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); 4102 4103 /* Disable polling. */ 4104 hn_polling(sc, 0); 4105 4106 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4107 KASSERT(sc->hn_vf_ifp != NULL, 4108 ("%s: VF is not attached", if_name(ifp))); 4109 4110 /* Mark transparent mode VF as disabled. */ 4111 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4112 4113 /* 4114 * NOTE: 4115 * Datapath setting must happen _before_ bringing 4116 * the VF down. 4117 */ 4118 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4119 4120 /* 4121 * Bring the VF down. 4122 */ 4123 hn_xpnt_vf_saveifflags(sc); 4124 if_setflagbits(ifp, 0, IFF_UP); 4125 hn_xpnt_vf_iocsetflags(sc); 4126 } 4127 4128 /* Suspend data transfers. */ 4129 hn_suspend_data(sc); 4130 4131 /* Clear OACTIVE bit. */ 4132 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4133 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4134 sc->hn_tx_ring[i].hn_oactive = 0; 4135 4136 /* 4137 * If the non-transparent mode VF is active, make sure 4138 * that the RX filter still allows packet reception. 4139 */ 4140 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4141 hn_rxfilter_config(sc); 4142 } 4143 4144 static void 4145 hn_init_locked(struct hn_softc *sc) 4146 { 4147 if_t ifp = sc->hn_ifp; 4148 int i; 4149 4150 HN_LOCK_ASSERT(sc); 4151 4152 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4153 return; 4154 4155 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) 4156 return; 4157 4158 /* Configure RX filter */ 4159 hn_rxfilter_config(sc); 4160 4161 /* Clear OACTIVE bit. */ 4162 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 4163 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4164 sc->hn_tx_ring[i].hn_oactive = 0; 4165 4166 /* Clear TX 'suspended' bit. 
*/ 4167 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4168 4169 if (hn_xpnt_vf_isready(sc)) { 4170 /* Initialize transparent VF. */ 4171 hn_xpnt_vf_init(sc); 4172 } 4173 4174 /* Everything is ready; unleash! */ 4175 if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); 4176 4177 /* Re-enable polling if requested. */ 4178 if (sc->hn_pollhz > 0) 4179 hn_polling(sc, sc->hn_pollhz); 4180 } 4181 4182 static void 4183 hn_init(void *xsc) 4184 { 4185 struct hn_softc *sc = xsc; 4186 4187 HN_LOCK(sc); 4188 hn_init_locked(sc); 4189 HN_UNLOCK(sc); 4190 } 4191 4192 static int 4193 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4194 { 4195 struct hn_softc *sc = arg1; 4196 unsigned int lenlim; 4197 int error; 4198 4199 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4200 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4201 if (error || req->newptr == NULL) 4202 return error; 4203 4204 HN_LOCK(sc); 4205 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4206 lenlim > TCP_LRO_LENGTH_MAX) { 4207 HN_UNLOCK(sc); 4208 return EINVAL; 4209 } 4210 hn_set_lro_lenlim(sc, lenlim); 4211 HN_UNLOCK(sc); 4212 4213 return 0; 4214 } 4215 4216 static int 4217 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4218 { 4219 struct hn_softc *sc = arg1; 4220 int ackcnt, error, i; 4221 4222 /* 4223 * lro_ackcnt_lim is append count limit, 4224 * +1 to turn it into aggregation limit. 4225 */ 4226 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4227 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4228 if (error || req->newptr == NULL) 4229 return error; 4230 4231 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4232 return EINVAL; 4233 4234 /* 4235 * Convert aggregation limit back to append 4236 * count limit. 4237 */ 4238 --ackcnt; 4239 HN_LOCK(sc); 4240 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4241 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4242 HN_UNLOCK(sc); 4243 return 0; 4244 } 4245 4246 static int 4247 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4248 { 4249 struct hn_softc *sc = arg1; 4250 int hcsum = arg2; 4251 int on, error, i; 4252 4253 on = 0; 4254 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4255 on = 1; 4256 4257 error = sysctl_handle_int(oidp, &on, 0, req); 4258 if (error || req->newptr == NULL) 4259 return error; 4260 4261 HN_LOCK(sc); 4262 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4263 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4264 4265 if (on) 4266 rxr->hn_trust_hcsum |= hcsum; 4267 else 4268 rxr->hn_trust_hcsum &= ~hcsum; 4269 } 4270 HN_UNLOCK(sc); 4271 return 0; 4272 } 4273 4274 static int 4275 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4276 { 4277 struct hn_softc *sc = arg1; 4278 int chim_size, error; 4279 4280 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4281 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4282 if (error || req->newptr == NULL) 4283 return error; 4284 4285 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4286 return EINVAL; 4287 4288 HN_LOCK(sc); 4289 hn_set_chim_size(sc, chim_size); 4290 HN_UNLOCK(sc); 4291 return 0; 4292 } 4293 4294 static int 4295 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4296 { 4297 struct hn_softc *sc = arg1; 4298 int ofs = arg2, i, error; 4299 struct hn_rx_ring *rxr; 4300 uint64_t stat; 4301 4302 stat = 0; 4303 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4304 rxr = &sc->hn_rx_ring[i]; 4305 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4306 } 4307 4308 error = sysctl_handle_64(oidp, &stat, 0, req); 4309 if (error || req->newptr == NULL) 4310 return error; 4311 4312 /* Zero out this stat. 
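 *
 * (This handler returns the sum of the field across all RX rings;
 * writing any value through the sysctl clears it on every ring.
 * For example, assuming unit 0 and the "lro_queued" node registered
 * later in this file:
 *
 *	# sysctl dev.hn.0.lro_queued
 *	# sysctl dev.hn.0.lro_queued=0
 *
 * The second command zeroes the counter on all rings.)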
*/ 4313 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4314 rxr = &sc->hn_rx_ring[i]; 4315 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4316 } 4317 return 0; 4318 } 4319 4320 static int 4321 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4322 { 4323 struct hn_softc *sc = arg1; 4324 int ofs = arg2, i, error; 4325 struct hn_rx_ring *rxr; 4326 u_long stat; 4327 4328 stat = 0; 4329 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4330 rxr = &sc->hn_rx_ring[i]; 4331 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4332 } 4333 4334 error = sysctl_handle_long(oidp, &stat, 0, req); 4335 if (error || req->newptr == NULL) 4336 return error; 4337 4338 /* Zero out this stat. */ 4339 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4340 rxr = &sc->hn_rx_ring[i]; 4341 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4342 } 4343 return 0; 4344 } 4345 4346 static int 4347 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4348 { 4349 struct hn_softc *sc = arg1; 4350 int ofs = arg2, i, error; 4351 struct hn_tx_ring *txr; 4352 u_long stat; 4353 4354 stat = 0; 4355 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4356 txr = &sc->hn_tx_ring[i]; 4357 stat += *((u_long *)((uint8_t *)txr + ofs)); 4358 } 4359 4360 error = sysctl_handle_long(oidp, &stat, 0, req); 4361 if (error || req->newptr == NULL) 4362 return error; 4363 4364 /* Zero out this stat. */ 4365 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4366 txr = &sc->hn_tx_ring[i]; 4367 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4368 } 4369 return 0; 4370 } 4371 4372 static int 4373 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4374 { 4375 struct hn_softc *sc = arg1; 4376 int ofs = arg2, i, error, conf; 4377 struct hn_tx_ring *txr; 4378 4379 txr = &sc->hn_tx_ring[0]; 4380 conf = *((int *)((uint8_t *)txr + ofs)); 4381 4382 error = sysctl_handle_int(oidp, &conf, 0, req); 4383 if (error || req->newptr == NULL) 4384 return error; 4385 4386 HN_LOCK(sc); 4387 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4388 txr = &sc->hn_tx_ring[i]; 4389 *((int *)((uint8_t *)txr + ofs)) = conf; 4390 } 4391 HN_UNLOCK(sc); 4392 4393 return 0; 4394 } 4395 4396 static int 4397 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4398 { 4399 struct hn_softc *sc = arg1; 4400 int error, size; 4401 4402 size = sc->hn_agg_size; 4403 error = sysctl_handle_int(oidp, &size, 0, req); 4404 if (error || req->newptr == NULL) 4405 return (error); 4406 4407 HN_LOCK(sc); 4408 sc->hn_agg_size = size; 4409 hn_set_txagg(sc); 4410 HN_UNLOCK(sc); 4411 4412 return (0); 4413 } 4414 4415 static int 4416 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4417 { 4418 struct hn_softc *sc = arg1; 4419 int error, pkts; 4420 4421 pkts = sc->hn_agg_pkts; 4422 error = sysctl_handle_int(oidp, &pkts, 0, req); 4423 if (error || req->newptr == NULL) 4424 return (error); 4425 4426 HN_LOCK(sc); 4427 sc->hn_agg_pkts = pkts; 4428 hn_set_txagg(sc); 4429 HN_UNLOCK(sc); 4430 4431 return (0); 4432 } 4433 4434 static int 4435 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4436 { 4437 struct hn_softc *sc = arg1; 4438 int pkts; 4439 4440 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4441 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4442 } 4443 4444 static int 4445 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4446 { 4447 struct hn_softc *sc = arg1; 4448 int align; 4449 4450 align = sc->hn_tx_ring[0].hn_agg_align; 4451 return (sysctl_handle_int(oidp, &align, 0, req)); 4452 } 4453 4454 static void 4455 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4456 { 4457 if (pollhz == 0) 4458 vmbus_chan_poll_disable(chan); 4459 else 4460 vmbus_chan_poll_enable(chan, pollhz); 4461 } 4462 4463 static void 4464 
hn_polling(struct hn_softc *sc, u_int pollhz) 4465 { 4466 int nsubch = sc->hn_rx_ring_inuse - 1; 4467 4468 HN_LOCK_ASSERT(sc); 4469 4470 if (nsubch > 0) { 4471 struct vmbus_channel **subch; 4472 int i; 4473 4474 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4475 for (i = 0; i < nsubch; ++i) 4476 hn_chan_polling(subch[i], pollhz); 4477 vmbus_subchan_rel(subch, nsubch); 4478 } 4479 hn_chan_polling(sc->hn_prichan, pollhz); 4480 } 4481 4482 static int 4483 hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4484 { 4485 struct hn_softc *sc = arg1; 4486 int pollhz, error; 4487 4488 pollhz = sc->hn_pollhz; 4489 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4490 if (error || req->newptr == NULL) 4491 return (error); 4492 4493 if (pollhz != 0 && 4494 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4495 return (EINVAL); 4496 4497 HN_LOCK(sc); 4498 if (sc->hn_pollhz != pollhz) { 4499 sc->hn_pollhz = pollhz; 4500 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && 4501 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4502 hn_polling(sc, sc->hn_pollhz); 4503 } 4504 HN_UNLOCK(sc); 4505 4506 return (0); 4507 } 4508 4509 static int 4510 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4511 { 4512 struct hn_softc *sc = arg1; 4513 char verstr[16]; 4514 4515 snprintf(verstr, sizeof(verstr), "%u.%u", 4516 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4517 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4518 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4519 } 4520 4521 static int 4522 hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4523 { 4524 struct hn_softc *sc = arg1; 4525 char caps_str[128]; 4526 uint32_t caps; 4527 4528 HN_LOCK(sc); 4529 caps = sc->hn_caps; 4530 HN_UNLOCK(sc); 4531 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4532 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4533 } 4534 4535 static int 4536 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4537 { 4538 struct hn_softc *sc = arg1; 4539 char assist_str[128]; 4540 uint32_t hwassist; 4541 4542 HN_LOCK(sc); 4543 hwassist = if_gethwassist(sc->hn_ifp); 4544 HN_UNLOCK(sc); 4545 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4546 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4547 } 4548 4549 static int 4550 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4551 { 4552 struct hn_softc *sc = arg1; 4553 char filter_str[128]; 4554 uint32_t filter; 4555 4556 HN_LOCK(sc); 4557 filter = sc->hn_rx_filter; 4558 HN_UNLOCK(sc); 4559 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4560 NDIS_PACKET_TYPES); 4561 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4562 } 4563 4564 static int 4565 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) 4566 { 4567 struct hn_softc *sc = arg1; 4568 uint32_t mtu; 4569 int error; 4570 HN_LOCK(sc); 4571 error = hn_rndis_get_mtu(sc, &mtu); 4572 if (error) { 4573 if_printf(sc->hn_ifp, "failed to get mtu\n"); 4574 goto back; 4575 } 4576 error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4577 if (error || req->newptr == NULL) 4578 goto back; 4579 4580 error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); 4581 if (error) 4582 goto back; 4583 error = hn_rndis_reconf_offload(sc, mtu); 4584 back: 4585 HN_UNLOCK(sc); 4586 return (error); 4587 } 4588 #ifndef RSS 4589 4590 static int 4591 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4592 { 4593 struct hn_softc *sc = arg1; 4594 int error; 4595 4596 HN_LOCK(sc); 4597 4598 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4599 if (error || req->newptr == NULL) 
4600 goto back; 4601 4602 if ((sc->hn_flags & HN_FLAG_RXVF) || 4603 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4604 /* 4605 * RSS key is synchronized w/ VF's, don't allow users 4606 * to change it. 4607 */ 4608 error = EBUSY; 4609 goto back; 4610 } 4611 4612 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4613 if (error) 4614 goto back; 4615 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4616 4617 if (sc->hn_rx_ring_inuse > 1) { 4618 error = hn_rss_reconfig(sc); 4619 } else { 4620 /* Not RSS capable, at least for now; just save the RSS key. */ 4621 error = 0; 4622 } 4623 back: 4624 HN_UNLOCK(sc); 4625 return (error); 4626 } 4627 4628 static int 4629 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4630 { 4631 struct hn_softc *sc = arg1; 4632 int error; 4633 4634 HN_LOCK(sc); 4635 4636 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4637 if (error || req->newptr == NULL) 4638 goto back; 4639 4640 /* 4641 * Don't allow RSS indirect table change, if this interface is not 4642 * RSS capable currently. 4643 */ 4644 if (sc->hn_rx_ring_inuse == 1) { 4645 error = EOPNOTSUPP; 4646 goto back; 4647 } 4648 4649 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4650 if (error) 4651 goto back; 4652 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4653 4654 hn_rss_ind_fixup(sc); 4655 error = hn_rss_reconfig(sc); 4656 back: 4657 HN_UNLOCK(sc); 4658 return (error); 4659 } 4660 4661 #endif /* !RSS */ 4662 4663 static int 4664 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4665 { 4666 struct hn_softc *sc = arg1; 4667 char hash_str[128]; 4668 uint32_t hash; 4669 4670 HN_LOCK(sc); 4671 hash = sc->hn_rss_hash; 4672 HN_UNLOCK(sc); 4673 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4674 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4675 } 4676 4677 static int 4678 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4679 { 4680 struct hn_softc *sc = arg1; 4681 char hash_str[128]; 4682 uint32_t hash; 4683 4684 HN_LOCK(sc); 4685 hash = sc->hn_rss_hcap; 4686 HN_UNLOCK(sc); 4687 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4688 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4689 } 4690 4691 static int 4692 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4693 { 4694 struct hn_softc *sc = arg1; 4695 char hash_str[128]; 4696 uint32_t hash; 4697 4698 HN_LOCK(sc); 4699 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4700 HN_UNLOCK(sc); 4701 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4702 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4703 } 4704 4705 static int 4706 hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4707 { 4708 struct hn_softc *sc = arg1; 4709 char vf_name[IFNAMSIZ + 1]; 4710 if_t vf_ifp; 4711 4712 HN_LOCK(sc); 4713 vf_name[0] = '\0'; 4714 vf_ifp = sc->hn_vf_ifp; 4715 if (vf_ifp != NULL) 4716 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4717 HN_UNLOCK(sc); 4718 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4719 } 4720 4721 static int 4722 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4723 { 4724 struct hn_softc *sc = arg1; 4725 char vf_name[IFNAMSIZ + 1]; 4726 if_t vf_ifp; 4727 4728 HN_LOCK(sc); 4729 vf_name[0] = '\0'; 4730 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4731 if (vf_ifp != NULL) 4732 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); 4733 HN_UNLOCK(sc); 4734 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4735 } 4736 4737 static int 4738 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4739 { 4740 struct rm_priotracker pt; 4741 struct sbuf *sb; 4742 
int error, i; 4743 bool first; 4744 4745 error = sysctl_wire_old_buffer(req, 0); 4746 if (error != 0) 4747 return (error); 4748 4749 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4750 if (sb == NULL) 4751 return (ENOMEM); 4752 4753 rm_rlock(&hn_vfmap_lock, &pt); 4754 4755 first = true; 4756 for (i = 0; i < hn_vfmap_size; ++i) { 4757 struct epoch_tracker et; 4758 if_t ifp; 4759 4760 if (hn_vfmap[i] == NULL) 4761 continue; 4762 4763 NET_EPOCH_ENTER(et); 4764 ifp = ifnet_byindex(i); 4765 if (ifp != NULL) { 4766 if (first) 4767 sbuf_printf(sb, "%s", if_name(ifp)); 4768 else 4769 sbuf_printf(sb, " %s", if_name(ifp)); 4770 first = false; 4771 } 4772 NET_EPOCH_EXIT(et); 4773 } 4774 4775 rm_runlock(&hn_vfmap_lock, &pt); 4776 4777 error = sbuf_finish(sb); 4778 sbuf_delete(sb); 4779 return (error); 4780 } 4781 4782 static int 4783 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4784 { 4785 struct rm_priotracker pt; 4786 struct sbuf *sb; 4787 int error, i; 4788 bool first; 4789 4790 error = sysctl_wire_old_buffer(req, 0); 4791 if (error != 0) 4792 return (error); 4793 4794 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4795 if (sb == NULL) 4796 return (ENOMEM); 4797 4798 rm_rlock(&hn_vfmap_lock, &pt); 4799 4800 first = true; 4801 for (i = 0; i < hn_vfmap_size; ++i) { 4802 struct epoch_tracker et; 4803 if_t ifp, hn_ifp; 4804 4805 hn_ifp = hn_vfmap[i]; 4806 if (hn_ifp == NULL) 4807 continue; 4808 4809 NET_EPOCH_ENTER(et); 4810 ifp = ifnet_byindex(i); 4811 if (ifp != NULL) { 4812 if (first) { 4813 sbuf_printf(sb, "%s:%s", if_name(ifp), 4814 if_name(hn_ifp)); 4815 } else { 4816 sbuf_printf(sb, " %s:%s", if_name(ifp), 4817 if_name(hn_ifp)); 4818 } 4819 first = false; 4820 } 4821 NET_EPOCH_EXIT(et); 4822 } 4823 4824 rm_runlock(&hn_vfmap_lock, &pt); 4825 4826 error = sbuf_finish(sb); 4827 sbuf_delete(sb); 4828 return (error); 4829 } 4830 4831 static int 4832 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4833 { 4834 struct hn_softc *sc = arg1; 4835 int error, onoff = 0; 4836 4837 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4838 onoff = 1; 4839 error = sysctl_handle_int(oidp, &onoff, 0, req); 4840 if (error || req->newptr == NULL) 4841 return (error); 4842 4843 HN_LOCK(sc); 4844 /* NOTE: hn_vf_lock for hn_transmit() */ 4845 rm_wlock(&sc->hn_vf_lock); 4846 if (onoff) 4847 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4848 else 4849 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4850 rm_wunlock(&sc->hn_vf_lock); 4851 HN_UNLOCK(sc); 4852 4853 return (0); 4854 } 4855 4856 static int 4857 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4858 { 4859 struct hn_softc *sc = arg1; 4860 int enabled = 0; 4861 4862 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4863 enabled = 1; 4864 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4865 } 4866 4867 static int 4868 hn_check_iplen(const struct mbuf *m, int hoff) 4869 { 4870 const struct ip *ip; 4871 int len, iphlen, iplen; 4872 const struct tcphdr *th; 4873 int thoff; /* TCP data offset */ 4874 4875 len = hoff + sizeof(struct ip); 4876 4877 /* The packet must be at least the size of an IP header. */ 4878 if (m->m_pkthdr.len < len) 4879 return IPPROTO_DONE; 4880 4881 /* The fixed IP header must reside completely in the first mbuf. */ 4882 if (m->m_len < len) 4883 return IPPROTO_DONE; 4884 4885 ip = mtodo(m, hoff); 4886 4887 /* Bound check the packet's stated IP header length. */ 4888 iphlen = ip->ip_hl << 2; 4889 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4890 return IPPROTO_DONE; 4891 4892 /* The full IP header must reside completely in the one mbuf. 
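 *
 * (A worked example of the length checks in this function: with an
 * untagged Ethernet header, hoff is 14; a minimal IPv4 header is 20
 * bytes and a minimal TCP header another 20, so the first mbuf must
 * hold at least 14 + 20 = 34 bytes here, and 14 + 20 + 20 = 54 bytes
 * once the TCP data offset has been validated below.)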
*/ 4893 if (m->m_len < hoff + iphlen)
4894 return IPPROTO_DONE;
4895
4896 iplen = ntohs(ip->ip_len);
4897
4898 /*
4899 * Check that the amount of data in the buffers is at
4900 * least as much as the IP header would have us expect.
4901 */
4902 if (m->m_pkthdr.len < hoff + iplen)
4903 return IPPROTO_DONE;
4904
4905 /*
4906 * Ignore IP fragments.
4907 */
4908 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4909 return IPPROTO_DONE;
4910
4911 /*
4912 * The TCP/IP or UDP/IP header must be entirely contained within
4913 * the first mbuf of the packet.
4914 */
4915 switch (ip->ip_p) {
4916 case IPPROTO_TCP:
4917 if (iplen < iphlen + sizeof(struct tcphdr))
4918 return IPPROTO_DONE;
4919 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4920 return IPPROTO_DONE;
4921 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4922 thoff = th->th_off << 2;
4923 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4924 return IPPROTO_DONE;
4925 if (m->m_len < hoff + iphlen + thoff)
4926 return IPPROTO_DONE;
4927 break;
4928 case IPPROTO_UDP:
4929 if (iplen < iphlen + sizeof(struct udphdr))
4930 return IPPROTO_DONE;
4931 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4932 return IPPROTO_DONE;
4933 break;
4934 default:
4935 if (iplen < iphlen)
4936 return IPPROTO_DONE;
4937 break;
4938 }
4939 return ip->ip_p;
4940 }
4941
4942 static void
4943 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4944 {
4945 const struct ether_header *eh;
4946 uint16_t etype;
4947 int hoff;
4948
4949 hoff = sizeof(*eh);
4950 /* Checked at the beginning of this function. */
4951 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4952
4953 eh = mtod(m_new, const struct ether_header *);
4954 etype = ntohs(eh->ether_type);
4955 if (etype == ETHERTYPE_VLAN) {
4956 const struct ether_vlan_header *evl;
4957
4958 hoff = sizeof(*evl);
4959 if (m_new->m_len < hoff)
4960 return;
4961 evl = mtod(m_new, const struct ether_vlan_header *);
4962 etype = ntohs(evl->evl_proto);
4963 }
4964 *l3proto = etype;
4965
4966 if (etype == ETHERTYPE_IP)
4967 *l4proto = hn_check_iplen(m_new, hoff);
4968 else
4969 *l4proto = IPPROTO_DONE;
4970 }
4971
4972 static int
4973 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4974 {
4975 struct sysctl_oid_list *child;
4976 struct sysctl_ctx_list *ctx;
4977 device_t dev = sc->hn_dev;
4978 #if defined(INET) || defined(INET6)
4979 int lroent_cnt;
4980 #endif
4981 int i;
4982
4983 /*
4984 * Create RXBUF for reception.
4985 *
4986 * NOTE:
4987 * - It is shared by all channels.
4988 * - A large enough buffer is allocated; certain versions of the NVS
4989 * may further limit the usable space.
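 * - For reference, contigmalloc(9) takes (size, type, flags, low,
 *   high, alignment, boundary), so the call below asks for a
 *   physically contiguous, page-aligned buffer with no address or
 *   boundary restriction.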
4990 */ 4991 sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 4992 0ul, ~0ul, PAGE_SIZE, 0); 4993 if (sc->hn_rxbuf == NULL) { 4994 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4995 return (ENOMEM); 4996 } 4997 4998 sc->hn_rx_ring_cnt = ring_cnt; 4999 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 5000 5001 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 5002 M_DEVBUF, M_WAITOK | M_ZERO); 5003 5004 #if defined(INET) || defined(INET6) 5005 lroent_cnt = hn_lro_entry_count; 5006 if (lroent_cnt < TCP_LRO_ENTRIES) 5007 lroent_cnt = TCP_LRO_ENTRIES; 5008 if (bootverbose) 5009 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 5010 #endif /* INET || INET6 */ 5011 5012 ctx = device_get_sysctl_ctx(dev); 5013 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 5014 5015 /* Create dev.hn.UNIT.rx sysctl tree */ 5016 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 5017 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5018 5019 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5020 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5021 5022 rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, 5023 M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); 5024 if (rxr->hn_br == NULL) { 5025 device_printf(dev, "allocate bufring failed\n"); 5026 return (ENOMEM); 5027 } 5028 5029 if (hn_trust_hosttcp) 5030 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 5031 if (hn_trust_hostudp) 5032 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 5033 if (hn_trust_hostip) 5034 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 5035 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 5036 rxr->hn_ifp = sc->hn_ifp; 5037 if (i < sc->hn_tx_ring_cnt) 5038 rxr->hn_txr = &sc->hn_tx_ring[i]; 5039 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 5040 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 5041 rxr->hn_rx_idx = i; 5042 rxr->hn_rxbuf = sc->hn_rxbuf; 5043 5044 /* 5045 * Initialize LRO. 
5046 */ 5047 #if defined(INET) || defined(INET6) 5048 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 5049 hn_lro_mbufq_depth); 5050 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 5051 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 5052 #endif /* INET || INET6 */ 5053 5054 if (sc->hn_rx_sysctl_tree != NULL) { 5055 char name[16]; 5056 5057 /* 5058 * Create per RX ring sysctl tree: 5059 * dev.hn.UNIT.rx.RINGID 5060 */ 5061 snprintf(name, sizeof(name), "%d", i); 5062 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 5063 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 5064 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5065 5066 if (rxr->hn_rx_sysctl_tree != NULL) { 5067 SYSCTL_ADD_ULONG(ctx, 5068 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5069 OID_AUTO, "packets", 5070 CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, 5071 "# of packets received"); 5072 SYSCTL_ADD_ULONG(ctx, 5073 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5074 OID_AUTO, "rss_pkts", 5075 CTLFLAG_RW | CTLFLAG_STATS, 5076 &rxr->hn_rss_pkts, 5077 "# of packets w/ RSS info received"); 5078 SYSCTL_ADD_ULONG(ctx, 5079 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5080 OID_AUTO, "rsc_pkts", 5081 CTLFLAG_RW | CTLFLAG_STATS, 5082 &rxr->hn_rsc_pkts, 5083 "# of RSC packets received"); 5084 SYSCTL_ADD_ULONG(ctx, 5085 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5086 OID_AUTO, "rsc_drop", 5087 CTLFLAG_RW | CTLFLAG_STATS, 5088 &rxr->hn_rsc_drop, 5089 "# of RSC fragments dropped"); 5090 SYSCTL_ADD_INT(ctx, 5091 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5092 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5093 &rxr->hn_pktbuf_len, 0, 5094 "Temporary channel packet buffer length"); 5095 } 5096 } 5097 } 5098 5099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5100 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5101 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5102 hn_rx_stat_u64_sysctl, 5103 "LU", "LRO queued"); 5104 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5105 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5106 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5107 hn_rx_stat_u64_sysctl, 5108 "LU", "LRO flushed"); 5109 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5110 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5111 __offsetof(struct hn_rx_ring, hn_lro_tried), 5112 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5113 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5114 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5115 hn_lro_lenlim_sysctl, "IU", 5116 "Max # of data bytes to be aggregated by LRO"); 5117 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5118 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5119 hn_lro_ackcnt_sysctl, "I", 5120 "Max # of ACKs to be aggregated by LRO"); 5121 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5122 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5123 hn_trust_hcsum_sysctl, "I", 5124 "Trust tcp segment verification on host side, " 5125 "when csum info is missing"); 5126 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5127 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5128 hn_trust_hcsum_sysctl, "I", 5129 "Trust udp datagram verification on host side, " 5130 "when csum info is missing"); 5131 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5133 hn_trust_hcsum_sysctl, "I", 5134 "Trust ip packet verification on host side, " 5135 "when csum info is missing"); 5136 SYSCTL_ADD_PROC(ctx, child, 
OID_AUTO, "csum_ip", 5137 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5138 __offsetof(struct hn_rx_ring, hn_csum_ip), 5139 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5140 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5141 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5142 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5143 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5144 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5145 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5146 __offsetof(struct hn_rx_ring, hn_csum_udp), 5147 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5148 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5149 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5150 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5151 hn_rx_stat_ulong_sysctl, "LU", 5152 "# of packets that we trust host's csum verification"); 5153 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5154 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5155 __offsetof(struct hn_rx_ring, hn_small_pkts), 5156 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5157 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5158 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, 5159 __offsetof(struct hn_rx_ring, hn_ack_failed), 5160 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5161 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5162 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5163 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5164 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5165 5166 return (0); 5167 } 5168 5169 static void 5170 hn_destroy_rx_data(struct hn_softc *sc) 5171 { 5172 int i; 5173 5174 if (sc->hn_rxbuf != NULL) { 5175 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5176 contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); 5177 else 5178 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5179 sc->hn_rxbuf = NULL; 5180 } 5181 5182 if (sc->hn_rx_ring_cnt == 0) 5183 return; 5184 5185 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5186 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5187 5188 if (rxr->hn_br == NULL) 5189 continue; 5190 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5191 contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, 5192 M_DEVBUF); 5193 } else { 5194 device_printf(sc->hn_dev, 5195 "%dth channel bufring is referenced", i); 5196 } 5197 rxr->hn_br = NULL; 5198 5199 #if defined(INET) || defined(INET6) 5200 tcp_lro_free(&rxr->hn_lro); 5201 #endif 5202 free(rxr->hn_pktbuf, M_DEVBUF); 5203 } 5204 free(sc->hn_rx_ring, M_DEVBUF); 5205 sc->hn_rx_ring = NULL; 5206 5207 sc->hn_rx_ring_cnt = 0; 5208 sc->hn_rx_ring_inuse = 0; 5209 } 5210 5211 static int 5212 hn_tx_ring_create(struct hn_softc *sc, int id) 5213 { 5214 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5215 device_t dev = sc->hn_dev; 5216 bus_dma_tag_t parent_dtag; 5217 int error, i; 5218 5219 txr->hn_sc = sc; 5220 txr->hn_tx_idx = id; 5221 5222 #ifndef HN_USE_TXDESC_BUFRING 5223 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5224 #endif 5225 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5226 5227 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5228 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5229 M_DEVBUF, M_WAITOK | M_ZERO); 5230 #ifndef HN_USE_TXDESC_BUFRING 5231 SLIST_INIT(&txr->hn_txlist); 5232 #else 5233 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5234 M_WAITOK, &txr->hn_tx_lock); 5235 #endif 5236 5237 if (hn_tx_taskq_mode 
== HN_TX_TASKQ_M_EVTTQ) { 5238 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5239 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5240 } else { 5241 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; 5242 } 5243 5244 #ifdef HN_IFSTART_SUPPORT 5245 if (hn_use_if_start) { 5246 txr->hn_txeof = hn_start_txeof; 5247 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5248 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5249 } else 5250 #endif 5251 { 5252 int br_depth; 5253 5254 txr->hn_txeof = hn_xmit_txeof; 5255 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5256 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5257 5258 br_depth = hn_get_txswq_depth(txr); 5259 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5260 M_WAITOK, &txr->hn_tx_lock); 5261 } 5262 5263 txr->hn_direct_tx_size = hn_direct_tx_size; 5264 5265 /* 5266 * Always schedule transmission instead of trying to do direct 5267 * transmission. This one gives the best performance so far. 5268 */ 5269 txr->hn_sched_tx = 1; 5270 5271 parent_dtag = bus_get_dma_tag(dev); 5272 5273 /* DMA tag for RNDIS packet messages. */ 5274 error = bus_dma_tag_create(parent_dtag, /* parent */ 5275 HN_RNDIS_PKT_ALIGN, /* alignment */ 5276 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5277 BUS_SPACE_MAXADDR, /* lowaddr */ 5278 BUS_SPACE_MAXADDR, /* highaddr */ 5279 NULL, NULL, /* filter, filterarg */ 5280 HN_RNDIS_PKT_LEN, /* maxsize */ 5281 1, /* nsegments */ 5282 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5283 0, /* flags */ 5284 NULL, /* lockfunc */ 5285 NULL, /* lockfuncarg */ 5286 &txr->hn_tx_rndis_dtag); 5287 if (error) { 5288 device_printf(dev, "failed to create rndis dmatag\n"); 5289 return error; 5290 } 5291 5292 /* DMA tag for data. */ 5293 error = bus_dma_tag_create(parent_dtag, /* parent */ 5294 1, /* alignment */ 5295 HN_TX_DATA_BOUNDARY, /* boundary */ 5296 BUS_SPACE_MAXADDR, /* lowaddr */ 5297 BUS_SPACE_MAXADDR, /* highaddr */ 5298 NULL, NULL, /* filter, filterarg */ 5299 HN_TX_DATA_MAXSIZE, /* maxsize */ 5300 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5301 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5302 0, /* flags */ 5303 NULL, /* lockfunc */ 5304 NULL, /* lockfuncarg */ 5305 &txr->hn_tx_data_dtag); 5306 if (error) { 5307 device_printf(dev, "failed to create data dmatag\n"); 5308 return error; 5309 } 5310 5311 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5312 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5313 5314 txd->txr = txr; 5315 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5316 STAILQ_INIT(&txd->agg_list); 5317 5318 /* 5319 * Allocate and load RNDIS packet message. 5320 */ 5321 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5322 (void **)&txd->rndis_pkt, 5323 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5324 &txd->rndis_pkt_dmap); 5325 if (error) { 5326 device_printf(dev, 5327 "failed to allocate rndis_packet_msg, %d\n", i); 5328 return error; 5329 } 5330 5331 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5332 txd->rndis_pkt_dmap, 5333 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5334 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5335 BUS_DMA_NOWAIT); 5336 if (error) { 5337 device_printf(dev, 5338 "failed to load rndis_packet_msg, %d\n", i); 5339 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5340 txd->rndis_pkt, txd->rndis_pkt_dmap); 5341 return error; 5342 } 5343 5344 /* DMA map for TX data. 
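 *
 * (Unlike the RNDIS message above, which lives in driver-owned DMA
 * memory from bus_dmamem_alloc(), this only creates an empty map;
 * the outgoing mbuf chain is loaded into it when a packet is
 * actually transmitted.)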
*/ 5345 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5346 &txd->data_dmap); 5347 if (error) { 5348 device_printf(dev, 5349 "failed to allocate tx data dmamap\n"); 5350 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5351 txd->rndis_pkt_dmap); 5352 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5353 txd->rndis_pkt, txd->rndis_pkt_dmap); 5354 return error; 5355 } 5356 5357 /* All set, put it to list */ 5358 txd->flags |= HN_TXD_FLAG_ONLIST; 5359 #ifndef HN_USE_TXDESC_BUFRING 5360 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5361 #else 5362 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5363 #endif 5364 } 5365 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5366 5367 if (sc->hn_tx_sysctl_tree != NULL) { 5368 struct sysctl_oid_list *child; 5369 struct sysctl_ctx_list *ctx; 5370 char name[16]; 5371 5372 /* 5373 * Create per TX ring sysctl tree: 5374 * dev.hn.UNIT.tx.RINGID 5375 */ 5376 ctx = device_get_sysctl_ctx(dev); 5377 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5378 5379 snprintf(name, sizeof(name), "%d", id); 5380 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5381 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5382 5383 if (txr->hn_tx_sysctl_tree != NULL) { 5384 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5385 5386 #ifdef HN_DEBUG 5387 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5388 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5389 "# of available TX descs"); 5390 #endif 5391 #ifdef HN_IFSTART_SUPPORT 5392 if (!hn_use_if_start) 5393 #endif 5394 { 5395 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5396 CTLFLAG_RD, &txr->hn_oactive, 0, 5397 "over active"); 5398 } 5399 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5400 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, 5401 "# of packets transmitted"); 5402 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5403 CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, 5404 "# of sends"); 5405 } 5406 } 5407 5408 return 0; 5409 } 5410 5411 static void 5412 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5413 { 5414 struct hn_tx_ring *txr = txd->txr; 5415 5416 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5417 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5418 5419 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5420 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5421 txd->rndis_pkt_dmap); 5422 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5423 } 5424 5425 static void 5426 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5427 { 5428 5429 KASSERT(txd->refs == 0 || txd->refs == 1, 5430 ("invalid txd refs %d", txd->refs)); 5431 5432 /* Aggregated txds will be freed by their aggregating txd. */ 5433 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5434 int freed __diagused; 5435 5436 freed = hn_txdesc_put(txr, txd); 5437 KASSERT(freed, ("can't free txdesc")); 5438 } 5439 } 5440 5441 static void 5442 hn_tx_ring_destroy(struct hn_tx_ring *txr) 5443 { 5444 int i; 5445 5446 if (txr->hn_txdesc == NULL) 5447 return; 5448 5449 /* 5450 * NOTE: 5451 * Because the freeing of aggregated txds will be deferred 5452 * to the aggregating txd, two passes are used here: 5453 * - The first pass GCes any pending txds. This GC is necessary, 5454 * since if the channels are revoked, hypervisor will not 5455 * deliver send-done for all pending txds. 5456 * - The second pass frees the busdma stuffs, i.e. after all txds 5457 * were freed. 
5458 */ 5459 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5460 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5461 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5462 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5463 5464 if (txr->hn_tx_data_dtag != NULL) 5465 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5466 if (txr->hn_tx_rndis_dtag != NULL) 5467 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5468 5469 #ifdef HN_USE_TXDESC_BUFRING 5470 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5471 #endif 5472 5473 free(txr->hn_txdesc, M_DEVBUF); 5474 txr->hn_txdesc = NULL; 5475 5476 if (txr->hn_mbuf_br != NULL) 5477 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5478 5479 #ifndef HN_USE_TXDESC_BUFRING 5480 mtx_destroy(&txr->hn_txlist_spin); 5481 #endif 5482 mtx_destroy(&txr->hn_tx_lock); 5483 } 5484 5485 static int 5486 hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5487 { 5488 struct sysctl_oid_list *child; 5489 struct sysctl_ctx_list *ctx; 5490 int i; 5491 5492 /* 5493 * Create TXBUF for chimney sending. 5494 * 5495 * NOTE: It is shared by all channels. 5496 */ 5497 sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, 5498 0ul, ~0ul, PAGE_SIZE, 0); 5499 if (sc->hn_chim == NULL) { 5500 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5501 return (ENOMEM); 5502 } 5503 5504 sc->hn_tx_ring_cnt = ring_cnt; 5505 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5506 5507 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5508 M_DEVBUF, M_WAITOK | M_ZERO); 5509 5510 ctx = device_get_sysctl_ctx(sc->hn_dev); 5511 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5512 5513 /* Create dev.hn.UNIT.tx sysctl tree */ 5514 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5515 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5516 5517 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5518 int error; 5519 5520 error = hn_tx_ring_create(sc, i); 5521 if (error) 5522 return error; 5523 } 5524 5525 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5526 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5527 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5528 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5529 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5530 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5531 __offsetof(struct hn_tx_ring, hn_send_failed), 5532 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5533 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5534 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5535 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5536 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5537 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5538 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5539 __offsetof(struct hn_tx_ring, hn_flush_failed), 5540 hn_tx_stat_ulong_sysctl, "LU", 5541 "# of packet transmission aggregation flush failure"); 5542 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5543 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5544 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5545 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5546 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5547 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 5548 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5549 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5550 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5551 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, 
5552 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5553 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5554 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5555 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5556 "# of total TX descs"); 5557 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5558 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5559 "Chimney send packet size upper boundary"); 5560 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5561 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5562 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5563 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5564 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5565 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5566 hn_tx_conf_int_sysctl, "I", 5567 "Size of the packet for direct transmission"); 5568 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5569 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5570 __offsetof(struct hn_tx_ring, hn_sched_tx), 5571 hn_tx_conf_int_sysctl, "I", 5572 "Always schedule transmission " 5573 "instead of doing direct transmission"); 5574 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5575 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5576 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5577 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5578 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5579 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5580 "Applied packet transmission aggregation size"); 5581 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5582 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5583 hn_txagg_pktmax_sysctl, "I", 5584 "Applied packet transmission aggregation packets"); 5585 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5586 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5587 hn_txagg_align_sysctl, "I", 5588 "Applied packet transmission aggregation alignment"); 5589 5590 return 0; 5591 } 5592 5593 static void 5594 hn_set_chim_size(struct hn_softc *sc, int chim_size) 5595 { 5596 int i; 5597 5598 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5599 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5600 } 5601 5602 static void 5603 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5604 { 5605 if_t ifp = sc->hn_ifp; 5606 u_int hw_tsomax; 5607 int tso_minlen; 5608 5609 HN_LOCK_ASSERT(sc); 5610 5611 if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5612 return; 5613 5614 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5615 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5616 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5617 5618 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5619 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5620 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5621 5622 if (tso_maxlen < tso_minlen) 5623 tso_maxlen = tso_minlen; 5624 else if (tso_maxlen > IP_MAXPACKET) 5625 tso_maxlen = IP_MAXPACKET; 5626 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5627 tso_maxlen = sc->hn_ndis_tso_szmax; 5628 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5629 5630 if (hn_xpnt_vf_isready(sc)) { 5631 if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) 5632 hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); 5633 } 5634 if_sethwtsomax(ifp, hw_tsomax); 5635 if (bootverbose) 5636 if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); 5637 } 5638 5639 static void 5640 hn_fixup_tx_data(struct hn_softc *sc) 5641 { 5642 uint64_t csum_assist; 5643 int i; 5644 5645 hn_set_chim_size(sc, sc->hn_chim_szmax); 5646 if (hn_tx_chimney_size > 0 && 5647 
hn_tx_chimney_size < sc->hn_chim_szmax) 5648 hn_set_chim_size(sc, hn_tx_chimney_size); 5649 5650 csum_assist = 0; 5651 if (sc->hn_caps & HN_CAP_IPCS) 5652 csum_assist |= CSUM_IP; 5653 if (sc->hn_caps & HN_CAP_TCP4CS) 5654 csum_assist |= CSUM_IP_TCP; 5655 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5656 csum_assist |= CSUM_IP_UDP; 5657 if (sc->hn_caps & HN_CAP_TCP6CS) 5658 csum_assist |= CSUM_IP6_TCP; 5659 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5660 csum_assist |= CSUM_IP6_UDP; 5661 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5662 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5663 5664 if (sc->hn_caps & HN_CAP_HASHVAL) { 5665 /* 5666 * Support HASHVAL pktinfo on TX path. 5667 */ 5668 if (bootverbose) 5669 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5670 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5671 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5672 } 5673 } 5674 5675 static void 5676 hn_fixup_rx_data(struct hn_softc *sc) 5677 { 5678 5679 if (sc->hn_caps & HN_CAP_UDPHASH) { 5680 int i; 5681 5682 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5683 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5684 } 5685 } 5686 5687 static void 5688 hn_destroy_tx_data(struct hn_softc *sc) 5689 { 5690 int i; 5691 5692 if (sc->hn_chim != NULL) { 5693 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5694 contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); 5695 } else { 5696 device_printf(sc->hn_dev, 5697 "chimney sending buffer is referenced"); 5698 } 5699 sc->hn_chim = NULL; 5700 } 5701 5702 if (sc->hn_tx_ring_cnt == 0) 5703 return; 5704 5705 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5706 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5707 5708 free(sc->hn_tx_ring, M_DEVBUF); 5709 sc->hn_tx_ring = NULL; 5710 5711 sc->hn_tx_ring_cnt = 0; 5712 sc->hn_tx_ring_inuse = 0; 5713 } 5714 5715 #ifdef HN_IFSTART_SUPPORT 5716 5717 static void 5718 hn_start_taskfunc(void *xtxr, int pending __unused) 5719 { 5720 struct hn_tx_ring *txr = xtxr; 5721 5722 mtx_lock(&txr->hn_tx_lock); 5723 hn_start_locked(txr, 0); 5724 mtx_unlock(&txr->hn_tx_lock); 5725 } 5726 5727 static int 5728 hn_start_locked(struct hn_tx_ring *txr, int len) 5729 { 5730 struct hn_softc *sc = txr->hn_sc; 5731 if_t ifp = sc->hn_ifp; 5732 int sched = 0; 5733 5734 KASSERT(hn_use_if_start, 5735 ("hn_start_locked is called, when if_start is disabled")); 5736 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5737 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5738 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5739 5740 if (__predict_false(txr->hn_suspended)) 5741 return (0); 5742 5743 if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5744 IFF_DRV_RUNNING) 5745 return (0); 5746 5747 while (!if_sendq_empty(ifp)) { 5748 struct hn_txdesc *txd; 5749 struct mbuf *m_head; 5750 int error; 5751 5752 m_head = if_dequeue(ifp); 5753 if (m_head == NULL) 5754 break; 5755 5756 if (len > 0 && m_head->m_pkthdr.len > len) { 5757 /* 5758 * This sending could be time consuming; let callers 5759 * dispatch this packet sending (and sending of any 5760 * following up packets) to tx taskqueue. 
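 *
 * (Callers pass txr->hn_direct_tx_size as "len" on the direct
 * transmission paths and 0 from the taskqueue handlers; a non-zero
 * return value tells the caller to requeue the remaining work as
 * hn_tx_task on the ring's taskqueue.)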
5761 */ 5762 if_sendq_prepend(ifp, m_head); 5763 sched = 1; 5764 break; 5765 } 5766 5767 #if defined(INET6) || defined(INET) 5768 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5769 m_head = hn_tso_fixup(m_head); 5770 if (__predict_false(m_head == NULL)) { 5771 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5772 continue; 5773 } 5774 } else if (m_head->m_pkthdr.csum_flags & 5775 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5776 m_head = hn_set_hlen(m_head); 5777 if (__predict_false(m_head == NULL)) { 5778 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5779 continue; 5780 } 5781 } 5782 #endif 5783 5784 txd = hn_txdesc_get(txr); 5785 if (txd == NULL) { 5786 txr->hn_no_txdescs++; 5787 if_sendq_prepend(ifp, m_head); 5788 if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); 5789 break; 5790 } 5791 5792 error = hn_encap(ifp, txr, txd, &m_head); 5793 if (error) { 5794 /* Both txd and m_head are freed */ 5795 KASSERT(txr->hn_agg_txd == NULL, 5796 ("encap failed w/ pending aggregating txdesc")); 5797 continue; 5798 } 5799 5800 if (txr->hn_agg_pktleft == 0) { 5801 if (txr->hn_agg_txd != NULL) { 5802 KASSERT(m_head == NULL, 5803 ("pending mbuf for aggregating txdesc")); 5804 error = hn_flush_txagg(ifp, txr); 5805 if (__predict_false(error)) { 5806 if_setdrvflagbits(ifp, 5807 IFF_DRV_OACTIVE, 0); 5808 break; 5809 } 5810 } else { 5811 KASSERT(m_head != NULL, ("mbuf was freed")); 5812 error = hn_txpkt(ifp, txr, txd); 5813 if (__predict_false(error)) { 5814 /* txd is freed, but m_head is not */ 5815 if_sendq_prepend(ifp, m_head); 5816 if_setdrvflagbits(ifp, 5817 IFF_DRV_OACTIVE, 0); 5818 break; 5819 } 5820 } 5821 } 5822 #ifdef INVARIANTS 5823 else { 5824 KASSERT(txr->hn_agg_txd != NULL, 5825 ("no aggregating txdesc")); 5826 KASSERT(m_head == NULL, 5827 ("pending mbuf for aggregating txdesc")); 5828 } 5829 #endif 5830 } 5831 5832 /* Flush pending aggerated transmission. */ 5833 if (txr->hn_agg_txd != NULL) 5834 hn_flush_txagg(ifp, txr); 5835 return (sched); 5836 } 5837 5838 static void 5839 hn_start(if_t ifp) 5840 { 5841 struct hn_softc *sc = if_getsoftc(ifp); 5842 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5843 5844 if (txr->hn_sched_tx) 5845 goto do_sched; 5846 5847 if (mtx_trylock(&txr->hn_tx_lock)) { 5848 int sched; 5849 5850 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5851 mtx_unlock(&txr->hn_tx_lock); 5852 if (!sched) 5853 return; 5854 } 5855 do_sched: 5856 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5857 } 5858 5859 static void 5860 hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5861 { 5862 struct hn_tx_ring *txr = xtxr; 5863 5864 mtx_lock(&txr->hn_tx_lock); 5865 if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); 5866 hn_start_locked(txr, 0); 5867 mtx_unlock(&txr->hn_tx_lock); 5868 } 5869 5870 static void 5871 hn_start_txeof(struct hn_tx_ring *txr) 5872 { 5873 struct hn_softc *sc = txr->hn_sc; 5874 if_t ifp = sc->hn_ifp; 5875 5876 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5877 5878 if (txr->hn_sched_tx) 5879 goto do_sched; 5880 5881 if (mtx_trylock(&txr->hn_tx_lock)) { 5882 int sched; 5883 5884 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5885 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5886 mtx_unlock(&txr->hn_tx_lock); 5887 if (sched) { 5888 taskqueue_enqueue(txr->hn_tx_taskq, 5889 &txr->hn_tx_task); 5890 } 5891 } else { 5892 do_sched: 5893 /* 5894 * Release the OACTIVE earlier, with the hope, that 5895 * others could catch up. The task will clear the 5896 * flag again with the hn_tx_lock to avoid possible 5897 * races. 
5898 */ 5899 if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); 5900 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5901 } 5902 } 5903 5904 #endif /* HN_IFSTART_SUPPORT */ 5905 5906 static int 5907 hn_xmit(struct hn_tx_ring *txr, int len) 5908 { 5909 struct hn_softc *sc = txr->hn_sc; 5910 if_t ifp = sc->hn_ifp; 5911 struct mbuf *m_head; 5912 int sched = 0; 5913 5914 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5915 #ifdef HN_IFSTART_SUPPORT 5916 KASSERT(hn_use_if_start == 0, 5917 ("hn_xmit is called, when if_start is enabled")); 5918 #endif 5919 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5920 5921 if (__predict_false(txr->hn_suspended)) 5922 return (0); 5923 5924 if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5925 return (0); 5926 5927 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5928 struct hn_txdesc *txd; 5929 int error; 5930 5931 if (len > 0 && m_head->m_pkthdr.len > len) { 5932 /* 5933 * This sending could be time consuming; let callers 5934 * dispatch this packet sending (and sending of any 5935 * following up packets) to tx taskqueue. 5936 */ 5937 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5938 sched = 1; 5939 break; 5940 } 5941 5942 txd = hn_txdesc_get(txr); 5943 if (txd == NULL) { 5944 txr->hn_no_txdescs++; 5945 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5946 txr->hn_oactive = 1; 5947 break; 5948 } 5949 5950 error = hn_encap(ifp, txr, txd, &m_head); 5951 if (error) { 5952 /* Both txd and m_head are freed; discard */ 5953 KASSERT(txr->hn_agg_txd == NULL, 5954 ("encap failed w/ pending aggregating txdesc")); 5955 drbr_advance(ifp, txr->hn_mbuf_br); 5956 continue; 5957 } 5958 5959 if (txr->hn_agg_pktleft == 0) { 5960 if (txr->hn_agg_txd != NULL) { 5961 KASSERT(m_head == NULL, 5962 ("pending mbuf for aggregating txdesc")); 5963 error = hn_flush_txagg(ifp, txr); 5964 if (__predict_false(error)) { 5965 txr->hn_oactive = 1; 5966 break; 5967 } 5968 } else { 5969 KASSERT(m_head != NULL, ("mbuf was freed")); 5970 error = hn_txpkt(ifp, txr, txd); 5971 if (__predict_false(error)) { 5972 /* txd is freed, but m_head is not */ 5973 drbr_putback(ifp, txr->hn_mbuf_br, 5974 m_head); 5975 txr->hn_oactive = 1; 5976 break; 5977 } 5978 } 5979 } 5980 #ifdef INVARIANTS 5981 else { 5982 KASSERT(txr->hn_agg_txd != NULL, 5983 ("no aggregating txdesc")); 5984 KASSERT(m_head == NULL, 5985 ("pending mbuf for aggregating txdesc")); 5986 } 5987 #endif 5988 5989 /* Sent */ 5990 drbr_advance(ifp, txr->hn_mbuf_br); 5991 } 5992 5993 /* Flush pending aggerated transmission. */ 5994 if (txr->hn_agg_txd != NULL) 5995 hn_flush_txagg(ifp, txr); 5996 return (sched); 5997 } 5998 5999 static int 6000 hn_transmit(if_t ifp, struct mbuf *m) 6001 { 6002 struct hn_softc *sc = if_getsoftc(ifp); 6003 struct hn_tx_ring *txr; 6004 int error, idx = 0; 6005 6006 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 6007 struct rm_priotracker pt; 6008 6009 rm_rlock(&sc->hn_vf_lock, &pt); 6010 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6011 struct mbuf *m_bpf = NULL; 6012 int obytes, omcast; 6013 6014 obytes = m->m_pkthdr.len; 6015 omcast = (m->m_flags & M_MCAST) != 0; 6016 6017 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 6018 if (bpf_peers_present(if_getbpf(ifp))) { 6019 m_bpf = m_copypacket(m, M_NOWAIT); 6020 if (m_bpf == NULL) { 6021 /* 6022 * Failed to grab a shallow 6023 * copy; tap now. 
6024 */ 6025 ETHER_BPF_MTAP(ifp, m); 6026 } 6027 } 6028 } else { 6029 ETHER_BPF_MTAP(ifp, m); 6030 } 6031 6032 error = if_transmit(sc->hn_vf_ifp, m); 6033 rm_runlock(&sc->hn_vf_lock, &pt); 6034 6035 if (m_bpf != NULL) { 6036 if (!error) 6037 ETHER_BPF_MTAP(ifp, m_bpf); 6038 m_freem(m_bpf); 6039 } 6040 6041 if (error == ENOBUFS) { 6042 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6043 } else if (error) { 6044 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6045 } else { 6046 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 6047 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 6048 if (omcast) { 6049 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 6050 omcast); 6051 } 6052 } 6053 return (error); 6054 } 6055 rm_runlock(&sc->hn_vf_lock, &pt); 6056 } 6057 6058 #if defined(INET6) || defined(INET) 6059 /* 6060 * Perform TSO packet header fixup or get l2/l3 header length now, 6061 * since packet headers should be cache-hot. 6062 */ 6063 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 6064 m = hn_tso_fixup(m); 6065 if (__predict_false(m == NULL)) { 6066 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6067 return EIO; 6068 } 6069 } else if (m->m_pkthdr.csum_flags & 6070 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6071 m = hn_set_hlen(m); 6072 if (__predict_false(m == NULL)) { 6073 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6074 return EIO; 6075 } 6076 } 6077 #endif 6078 6079 /* 6080 * Select the TX ring based on flowid 6081 */ 6082 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6083 #ifdef RSS 6084 uint32_t bid; 6085 6086 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 6087 &bid) == 0) 6088 idx = bid % sc->hn_tx_ring_inuse; 6089 else 6090 #endif 6091 { 6092 #if defined(INET6) || defined(INET) 6093 int tcpsyn = 0; 6094 6095 if (m->m_pkthdr.len < 128 && 6096 (m->m_pkthdr.csum_flags & 6097 (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6098 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6099 m = hn_check_tcpsyn(m, &tcpsyn); 6100 if (__predict_false(m == NULL)) { 6101 if_inc_counter(ifp, 6102 IFCOUNTER_OERRORS, 1); 6103 return (EIO); 6104 } 6105 } 6106 #else 6107 const int tcpsyn = 0; 6108 #endif 6109 if (tcpsyn) 6110 idx = 0; 6111 else 6112 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6113 } 6114 } 6115 txr = &sc->hn_tx_ring[idx]; 6116 6117 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6118 if (error) { 6119 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6120 return error; 6121 } 6122 6123 if (txr->hn_oactive) 6124 return 0; 6125 6126 if (txr->hn_sched_tx) 6127 goto do_sched; 6128 6129 if (mtx_trylock(&txr->hn_tx_lock)) { 6130 int sched; 6131 6132 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6133 mtx_unlock(&txr->hn_tx_lock); 6134 if (!sched) 6135 return 0; 6136 } 6137 do_sched: 6138 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6139 return 0; 6140 } 6141 6142 static void 6143 hn_tx_ring_qflush(struct hn_tx_ring *txr) 6144 { 6145 struct mbuf *m; 6146 6147 mtx_lock(&txr->hn_tx_lock); 6148 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6149 m_freem(m); 6150 mtx_unlock(&txr->hn_tx_lock); 6151 } 6152 6153 static void 6154 hn_xmit_qflush(if_t ifp) 6155 { 6156 struct hn_softc *sc = if_getsoftc(ifp); 6157 struct rm_priotracker pt; 6158 int i; 6159 6160 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6161 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6162 if_qflush(ifp); 6163 6164 rm_rlock(&sc->hn_vf_lock, &pt); 6165 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6166 if_qflush(sc->hn_vf_ifp); 6167 rm_runlock(&sc->hn_vf_lock, &pt); 6168 } 6169 6170 static void 6171 hn_xmit_txeof(struct hn_tx_ring *txr) 6172 { 6173 6174 if 
(txr->hn_sched_tx) 6175 goto do_sched; 6176 6177 if (mtx_trylock(&txr->hn_tx_lock)) { 6178 int sched; 6179 6180 txr->hn_oactive = 0; 6181 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6182 mtx_unlock(&txr->hn_tx_lock); 6183 if (sched) { 6184 taskqueue_enqueue(txr->hn_tx_taskq, 6185 &txr->hn_tx_task); 6186 } 6187 } else { 6188 do_sched: 6189 /* 6190 * Release the oactive earlier, with the hope, that 6191 * others could catch up. The task will clear the 6192 * oactive again with the hn_tx_lock to avoid possible 6193 * races. 6194 */ 6195 txr->hn_oactive = 0; 6196 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6197 } 6198 } 6199 6200 static void 6201 hn_xmit_taskfunc(void *xtxr, int pending __unused) 6202 { 6203 struct hn_tx_ring *txr = xtxr; 6204 6205 mtx_lock(&txr->hn_tx_lock); 6206 hn_xmit(txr, 0); 6207 mtx_unlock(&txr->hn_tx_lock); 6208 } 6209 6210 static void 6211 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) 6212 { 6213 struct hn_tx_ring *txr = xtxr; 6214 6215 mtx_lock(&txr->hn_tx_lock); 6216 txr->hn_oactive = 0; 6217 hn_xmit(txr, 0); 6218 mtx_unlock(&txr->hn_tx_lock); 6219 } 6220 6221 static int 6222 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) 6223 { 6224 struct vmbus_chan_br cbr; 6225 struct hn_rx_ring *rxr; 6226 struct hn_tx_ring *txr = NULL; 6227 int idx, error; 6228 6229 idx = vmbus_chan_subidx(chan); 6230 6231 /* 6232 * Link this channel to RX/TX ring. 6233 */ 6234 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6235 ("invalid channel index %d, should > 0 && < %d", 6236 idx, sc->hn_rx_ring_inuse)); 6237 rxr = &sc->hn_rx_ring[idx]; 6238 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, 6239 ("RX ring %d already attached", idx)); 6240 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; 6241 rxr->hn_chan = chan; 6242 6243 if (bootverbose) { 6244 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", 6245 idx, vmbus_chan_id(chan)); 6246 } 6247 6248 if (idx < sc->hn_tx_ring_inuse) { 6249 txr = &sc->hn_tx_ring[idx]; 6250 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, 6251 ("TX ring %d already attached", idx)); 6252 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; 6253 6254 txr->hn_chan = chan; 6255 if (bootverbose) { 6256 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", 6257 idx, vmbus_chan_id(chan)); 6258 } 6259 } 6260 6261 /* Bind this channel to a proper CPU. */ 6262 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); 6263 6264 /* 6265 * Open this channel 6266 */ 6267 cbr.cbr = rxr->hn_br; 6268 cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); 6269 cbr.cbr_txsz = HN_TXBR_SIZE; 6270 cbr.cbr_rxsz = HN_RXBR_SIZE; 6271 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); 6272 if (error) { 6273 if (error == EISCONN) { 6274 if_printf(sc->hn_ifp, "bufring is connected after " 6275 "chan%u open failure\n", vmbus_chan_id(chan)); 6276 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6277 } else { 6278 if_printf(sc->hn_ifp, "open chan%u failed: %d\n", 6279 vmbus_chan_id(chan), error); 6280 } 6281 } 6282 return (error); 6283 } 6284 6285 static void 6286 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) 6287 { 6288 struct hn_rx_ring *rxr; 6289 int idx, error; 6290 6291 idx = vmbus_chan_subidx(chan); 6292 6293 /* 6294 * Link this channel to RX/TX ring. 
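 * (This is the detach path: the HN_RX_FLAG_ATTACHED/HN_TX_FLAG_ATTACHED
 * flags set by hn_chan_attach() are cleared below, unlinking the channel
 * from its RX/TX ring before the channel itself is closed.)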
6295 */ 6296 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, 6297 ("invalid channel index %d, should > 0 && < %d", 6298 idx, sc->hn_rx_ring_inuse)); 6299 rxr = &sc->hn_rx_ring[idx]; 6300 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), 6301 ("RX ring %d is not attached", idx)); 6302 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; 6303 6304 if (idx < sc->hn_tx_ring_inuse) { 6305 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; 6306 6307 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), 6308 ("TX ring %d is not attached", idx)); 6309 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; 6310 } 6311 6312 /* 6313 * Close this channel. 6314 * 6315 * NOTE: 6316 * Channel closing does _not_ destroy the target channel. 6317 */ 6318 error = vmbus_chan_close_direct(chan); 6319 if (error == EISCONN) { 6320 if_printf(sc->hn_ifp, "chan%u bufring is connected " 6321 "after being closed\n", vmbus_chan_id(chan)); 6322 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; 6323 } else if (error) { 6324 if_printf(sc->hn_ifp, "chan%u close failed: %d\n", 6325 vmbus_chan_id(chan), error); 6326 } 6327 } 6328 6329 static int 6330 hn_attach_subchans(struct hn_softc *sc) 6331 { 6332 struct vmbus_channel **subchans; 6333 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6334 int i, error = 0; 6335 6336 KASSERT(subchan_cnt > 0, ("no sub-channels")); 6337 6338 /* Attach the sub-channels. */ 6339 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6340 for (i = 0; i < subchan_cnt; ++i) { 6341 int error1; 6342 6343 error1 = hn_chan_attach(sc, subchans[i]); 6344 if (error1) { 6345 error = error1; 6346 /* Move on; all channels will be detached later. */ 6347 } 6348 } 6349 vmbus_subchan_rel(subchans, subchan_cnt); 6350 6351 if (error) { 6352 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); 6353 } else { 6354 if (bootverbose) { 6355 if_printf(sc->hn_ifp, "%d sub-channels attached\n", 6356 subchan_cnt); 6357 } 6358 } 6359 return (error); 6360 } 6361 6362 static void 6363 hn_detach_allchans(struct hn_softc *sc) 6364 { 6365 struct vmbus_channel **subchans; 6366 int subchan_cnt = sc->hn_rx_ring_inuse - 1; 6367 int i; 6368 6369 if (subchan_cnt == 0) 6370 goto back; 6371 6372 /* Detach the sub-channels. */ 6373 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); 6374 for (i = 0; i < subchan_cnt; ++i) 6375 hn_chan_detach(sc, subchans[i]); 6376 vmbus_subchan_rel(subchans, subchan_cnt); 6377 6378 back: 6379 /* 6380 * Detach the primary channel, _after_ all sub-channels 6381 * are detached. 6382 */ 6383 hn_chan_detach(sc, sc->hn_prichan); 6384 6385 /* Wait for sub-channels to be destroyed, if any. */ 6386 vmbus_subchan_drain(sc->hn_prichan); 6387 6388 #ifdef INVARIANTS 6389 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6390 KASSERT((sc->hn_rx_ring[i].hn_rx_flags & 6391 HN_RX_FLAG_ATTACHED) == 0, 6392 ("%dth RX ring is still attached", i)); 6393 } 6394 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 6395 KASSERT((sc->hn_tx_ring[i].hn_tx_flags & 6396 HN_TX_FLAG_ATTACHED) == 0, 6397 ("%dth TX ring is still attached", i)); 6398 } 6399 #endif 6400 } 6401 6402 static int 6403 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) 6404 { 6405 struct vmbus_channel **subchans; 6406 int nchan, rxr_cnt, error; 6407 6408 nchan = *nsubch + 1; 6409 if (nchan == 1) { 6410 /* 6411 * Multiple RX/TX rings are not requested. 6412 */ 6413 *nsubch = 0; 6414 return (0); 6415 } 6416 6417 /* 6418 * Query RSS capabilities, e.g. # of RX rings, and # of indirect 6419 * table entries.
6420 */ 6421 error = hn_rndis_query_rsscaps(sc, &rxr_cnt); 6422 if (error) { 6423 /* No RSS; this is benign. */ 6424 *nsubch = 0; 6425 return (0); 6426 } 6427 if (bootverbose) { 6428 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", 6429 rxr_cnt, nchan); 6430 } 6431 6432 if (nchan > rxr_cnt) 6433 nchan = rxr_cnt; 6434 if (nchan == 1) { 6435 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); 6436 *nsubch = 0; 6437 return (0); 6438 } 6439 6440 /* 6441 * Allocate sub-channels from NVS. 6442 */ 6443 *nsubch = nchan - 1; 6444 error = hn_nvs_alloc_subchans(sc, nsubch); 6445 if (error || *nsubch == 0) { 6446 /* Failed to allocate sub-channels. */ 6447 *nsubch = 0; 6448 return (0); 6449 } 6450 6451 /* 6452 * Wait for all sub-channels to become ready before moving on. 6453 */ 6454 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); 6455 vmbus_subchan_rel(subchans, *nsubch); 6456 return (0); 6457 } 6458 6459 static bool 6460 hn_synth_attachable(const struct hn_softc *sc) 6461 { 6462 int i; 6463 6464 if (sc->hn_flags & HN_FLAG_ERRORS) 6465 return (false); 6466 6467 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 6468 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 6469 6470 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) 6471 return (false); 6472 } 6473 return (true); 6474 } 6475 6476 /* 6477 * Make sure that the RX filter is zero after the successful 6478 * RNDIS initialization. 6479 * 6480 * NOTE: 6481 * Under certain conditions on certain versions of Hyper-V, 6482 * the RNDIS rxfilter is _not_ zero on the hypervisor side 6483 * after the successful RNDIS initialization, which breaks 6484 * the assumption of any following code (well, it breaks the 6485 * RNDIS API contract actually). Clear the RNDIS rxfilter 6486 * explicitly, drain packets sneaking through, and drain the 6487 * interrupt taskqueues scheduled due to the stealth packets. 6488 */ 6489 static void 6490 hn_rndis_init_fixat(struct hn_softc *sc, int nchan) 6491 { 6492 6493 hn_disable_rx(sc); 6494 hn_drain_rxtx(sc, nchan); 6495 } 6496 6497 static int 6498 hn_synth_attach(struct hn_softc *sc, int mtu) 6499 { 6500 #define ATTACHED_NVS 0x0002 6501 #define ATTACHED_RNDIS 0x0004 6502 6503 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; 6504 int error, nsubch, nchan = 1, i, rndis_inited; 6505 uint32_t old_caps, attached = 0; 6506 6507 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, 6508 ("synthetic parts were attached")); 6509 6510 if (!hn_synth_attachable(sc)) 6511 return (ENXIO); 6512 6513 /* Save capabilities for later verification. */ 6514 old_caps = sc->hn_caps; 6515 sc->hn_caps = 0; 6516 6517 /* Clear RSS stuffs. */ 6518 sc->hn_rss_ind_size = 0; 6519 sc->hn_rss_hash = 0; 6520 sc->hn_rss_hcap = 0; 6521 6522 /* 6523 * Attach the primary channel _before_ attaching NVS and RNDIS. 6524 */ 6525 error = hn_chan_attach(sc, sc->hn_prichan); 6526 if (error) 6527 goto failed; 6528 6529 /* 6530 * Attach NVS. 6531 */ 6532 error = hn_nvs_attach(sc, mtu); 6533 if (error) 6534 goto failed; 6535 attached |= ATTACHED_NVS; 6536 6537 /* 6538 * Attach RNDIS _after_ NVS is attached. 6539 */ 6540 error = hn_rndis_attach(sc, mtu, &rndis_inited); 6541 if (rndis_inited) 6542 attached |= ATTACHED_RNDIS; 6543 if (error) 6544 goto failed; 6545 6546 /* 6547 * Make sure capabilities are not changed. 
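 * (sc->hn_caps was cleared above and is re-derived by hn_rndis_attach();
 * on a re-attach of the synthetic parts the re-derived capabilities must
 * match the previously saved old_caps, otherwise the attach is aborted.)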
6548 */ 6549 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { 6550 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", 6551 old_caps, sc->hn_caps); 6552 error = ENXIO; 6553 goto failed; 6554 } 6555 6556 /* 6557 * Allocate sub-channels for multi-TX/RX rings. 6558 * 6559 * NOTE: 6560 * The # of RX rings that can be used is equivalent to the # of 6561 * channels to be requested. 6562 */ 6563 nsubch = sc->hn_rx_ring_cnt - 1; 6564 error = hn_synth_alloc_subchans(sc, &nsubch); 6565 if (error) 6566 goto failed; 6567 /* NOTE: _Full_ synthetic parts detach is required now. */ 6568 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; 6569 6570 /* 6571 * Set the # of TX/RX rings that could be used according to 6572 * the # of channels that NVS offered. 6573 */ 6574 nchan = nsubch + 1; 6575 hn_set_ring_inuse(sc, nchan); 6576 if (nchan == 1) { 6577 /* Only the primary channel can be used; done */ 6578 goto back; 6579 } 6580 6581 /* 6582 * Attach the sub-channels. 6583 * 6584 * NOTE: hn_set_ring_inuse() _must_ have been called. 6585 */ 6586 error = hn_attach_subchans(sc); 6587 if (error) 6588 goto failed; 6589 6590 /* 6591 * Configure RSS key and indirect table _after_ all sub-channels 6592 * are attached. 6593 */ 6594 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { 6595 /* 6596 * RSS key is not set yet; set it to the default RSS key. 6597 */ 6598 if (bootverbose) 6599 if_printf(sc->hn_ifp, "setup default RSS key\n"); 6600 #ifdef RSS 6601 rss_getkey(rss->rss_key); 6602 #else 6603 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); 6604 #endif 6605 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 6606 } 6607 6608 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { 6609 /* 6610 * RSS indirect table is not set yet; set it up in round- 6611 * robin fashion. 6612 */ 6613 if (bootverbose) { 6614 if_printf(sc->hn_ifp, "setup default RSS indirect " 6615 "table\n"); 6616 } 6617 for (i = 0; i < NDIS_HASH_INDCNT; ++i) { 6618 uint32_t subidx; 6619 6620 #ifdef RSS 6621 subidx = rss_get_indirection_to_bucket(i); 6622 #else 6623 subidx = i; 6624 #endif 6625 rss->rss_ind[i] = subidx % nchan; 6626 } 6627 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 6628 } else { 6629 /* 6630 * # of usable channels may be changed, so we have to 6631 * make sure that all entries in RSS indirect table 6632 * are valid. 6633 * 6634 * NOTE: hn_set_ring_inuse() _must_ have been called. 6635 */ 6636 hn_rss_ind_fixup(sc); 6637 } 6638 6639 sc->hn_rss_hash = sc->hn_rss_hcap; 6640 if ((sc->hn_flags & HN_FLAG_RXVF) || 6641 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 6642 /* NOTE: Don't reconfigure RSS; will do immediately. */ 6643 hn_vf_rss_fixup(sc, false); 6644 } 6645 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); 6646 if (error) 6647 goto failed; 6648 back: 6649 /* 6650 * Fixup transmission aggregation setup. 6651 */ 6652 hn_set_txagg(sc); 6653 hn_rndis_init_fixat(sc, nchan); 6654 return (0); 6655 6656 failed: 6657 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { 6658 hn_rndis_init_fixat(sc, nchan); 6659 hn_synth_detach(sc); 6660 } else { 6661 if (attached & ATTACHED_RNDIS) { 6662 hn_rndis_init_fixat(sc, nchan); 6663 hn_rndis_detach(sc); 6664 } 6665 if (attached & ATTACHED_NVS) 6666 hn_nvs_detach(sc); 6667 hn_chan_detach(sc, sc->hn_prichan); 6668 /* Restore old capabilities. */ 6669 sc->hn_caps = old_caps; 6670 } 6671 return (error); 6672 6673 #undef ATTACHED_RNDIS 6674 #undef ATTACHED_NVS 6675 } 6676 6677 /* 6678 * NOTE: 6679 * The interface must have been suspended through hn_suspend(), before 6680 * this function gets called.
6681 */ 6682 static void 6683 hn_synth_detach(struct hn_softc *sc) 6684 { 6685 6686 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 6687 ("synthetic parts were not attached")); 6688 6689 /* Detach the RNDIS first. */ 6690 hn_rndis_detach(sc); 6691 6692 /* Detach NVS. */ 6693 hn_nvs_detach(sc); 6694 6695 /* Detach all of the channels. */ 6696 hn_detach_allchans(sc); 6697 6698 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { 6699 /* 6700 * Host is post-Win2016, disconnect RXBUF from primary channel here. 6701 */ 6702 int error; 6703 6704 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6705 sc->hn_rxbuf_gpadl); 6706 if (error) { 6707 if_printf(sc->hn_ifp, 6708 "rxbuf gpadl disconn failed: %d\n", error); 6709 sc->hn_flags |= HN_FLAG_RXBUF_REF; 6710 } 6711 sc->hn_rxbuf_gpadl = 0; 6712 } 6713 6714 if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { 6715 /* 6716 * Host is post-Win2016, disconnect chimney sending buffer from 6717 * primary channel here. 6718 */ 6719 int error; 6720 6721 error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, 6722 sc->hn_chim_gpadl); 6723 if (error) { 6724 if_printf(sc->hn_ifp, 6725 "chim gpadl disconn failed: %d\n", error); 6726 sc->hn_flags |= HN_FLAG_CHIM_REF; 6727 } 6728 sc->hn_chim_gpadl = 0; 6729 } 6730 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; 6731 } 6732 6733 static void 6734 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) 6735 { 6736 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, 6737 ("invalid ring count %d", ring_cnt)); 6738 6739 if (sc->hn_tx_ring_cnt > ring_cnt) 6740 sc->hn_tx_ring_inuse = ring_cnt; 6741 else 6742 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 6743 sc->hn_rx_ring_inuse = ring_cnt; 6744 6745 #ifdef RSS 6746 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { 6747 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " 6748 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, 6749 rss_getnumbuckets()); 6750 } 6751 #endif 6752 6753 if (bootverbose) { 6754 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", 6755 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); 6756 } 6757 } 6758 6759 static void 6760 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) 6761 { 6762 6763 /* 6764 * NOTE: 6765 * The TX bufring will not be drained by the hypervisor, 6766 * if the primary channel is revoked. 6767 */ 6768 while (!vmbus_chan_rx_empty(chan) || 6769 (!vmbus_chan_is_revoked(sc->hn_prichan) && 6770 !vmbus_chan_tx_empty(chan))) 6771 pause("waitch", 1); 6772 vmbus_chan_intr_drain(chan); 6773 } 6774 6775 static void 6776 hn_disable_rx(struct hn_softc *sc) 6777 { 6778 6779 /* 6780 * Disable RX by clearing RX filter forcefully. 6781 */ 6782 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; 6783 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ 6784 6785 /* 6786 * Give RNDIS enough time to flush all pending data packets. 6787 */ 6788 pause("waitrx", (200 * hz) / 1000); 6789 } 6790 6791 /* 6792 * NOTE: 6793 * RX/TX _must_ have been suspended/disabled, before this function 6794 * is called. 6795 */ 6796 static void 6797 hn_drain_rxtx(struct hn_softc *sc, int nchan) 6798 { 6799 struct vmbus_channel **subch = NULL; 6800 int nsubch; 6801 6802 /* 6803 * Drain RX/TX bufrings and interrupts. 
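 * (Sub-channels, if any, are drained first; the primary channel is
 * always drained last.)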
6804 */ 6805 nsubch = nchan - 1; 6806 if (nsubch > 0) 6807 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 6808 6809 if (subch != NULL) { 6810 int i; 6811 6812 for (i = 0; i < nsubch; ++i) 6813 hn_chan_drain(sc, subch[i]); 6814 } 6815 hn_chan_drain(sc, sc->hn_prichan); 6816 6817 if (subch != NULL) 6818 vmbus_subchan_rel(subch, nsubch); 6819 } 6820 6821 static void 6822 hn_suspend_data(struct hn_softc *sc) 6823 { 6824 struct hn_tx_ring *txr; 6825 int i; 6826 6827 HN_LOCK_ASSERT(sc); 6828 6829 /* 6830 * Suspend TX. 6831 */ 6832 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6833 txr = &sc->hn_tx_ring[i]; 6834 6835 mtx_lock(&txr->hn_tx_lock); 6836 txr->hn_suspended = 1; 6837 mtx_unlock(&txr->hn_tx_lock); 6838 /* No one is able to send more packets now. */ 6839 6840 /* 6841 * Wait for all pending sends to finish. 6842 * 6843 * NOTE: 6844 * We will _not_ receive all pending send-done, if the 6845 * primary channel is revoked. 6846 */ 6847 while (hn_tx_ring_pending(txr) && 6848 !vmbus_chan_is_revoked(sc->hn_prichan)) 6849 pause("hnwtx", 1 /* 1 tick */); 6850 } 6851 6852 /* 6853 * Disable RX. 6854 */ 6855 hn_disable_rx(sc); 6856 6857 /* 6858 * Drain RX/TX. 6859 */ 6860 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); 6861 6862 /* 6863 * Drain any pending TX tasks. 6864 * 6865 * NOTE: 6866 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX 6867 * tasks will have to be drained _after_ the above hn_drain_rxtx(). 6868 */ 6869 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6870 txr = &sc->hn_tx_ring[i]; 6871 6872 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); 6873 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); 6874 } 6875 } 6876 6877 static void 6878 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) 6879 { 6880 6881 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; 6882 } 6883 6884 static void 6885 hn_suspend_mgmt(struct hn_softc *sc) 6886 { 6887 struct task task; 6888 6889 HN_LOCK_ASSERT(sc); 6890 6891 /* 6892 * Make sure that hn_mgmt_taskq0 can no longer be accessed 6893 * through hn_mgmt_taskq. 6894 */ 6895 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); 6896 vmbus_chan_run_task(sc->hn_prichan, &task); 6897 6898 /* 6899 * Make sure that all pending management tasks are completed. 6900 */ 6901 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); 6902 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); 6903 taskqueue_drain_all(sc->hn_mgmt_taskq0); 6904 } 6905 6906 static void 6907 hn_suspend(struct hn_softc *sc) 6908 { 6909 6910 /* Disable polling. */ 6911 hn_polling(sc, 0); 6912 6913 /* 6914 * If the non-transparent mode VF is activated, the synthetic 6915 * device is receiving packets, so the data path of the 6916 * synthetic device must be suspended. 6917 */ 6918 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 6919 (sc->hn_flags & HN_FLAG_RXVF)) 6920 hn_suspend_data(sc); 6921 hn_suspend_mgmt(sc); 6922 } 6923 6924 static void 6925 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) 6926 { 6927 int i; 6928 6929 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, 6930 ("invalid TX ring count %d", tx_ring_cnt)); 6931 6932 for (i = 0; i < tx_ring_cnt; ++i) { 6933 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6934 6935 mtx_lock(&txr->hn_tx_lock); 6936 txr->hn_suspended = 0; 6937 mtx_unlock(&txr->hn_tx_lock); 6938 } 6939 } 6940 6941 static void 6942 hn_resume_data(struct hn_softc *sc) 6943 { 6944 int i; 6945 6946 HN_LOCK_ASSERT(sc); 6947 6948 /* 6949 * Re-enable RX.
6950 */ 6951 hn_rxfilter_config(sc); 6952 6953 /* 6954 * Make sure to clear suspend status on "all" TX rings, 6955 * since hn_tx_ring_inuse can be changed after 6956 * hn_suspend_data(). 6957 */ 6958 hn_resume_tx(sc, sc->hn_tx_ring_cnt); 6959 6960 #ifdef HN_IFSTART_SUPPORT 6961 if (!hn_use_if_start) 6962 #endif 6963 { 6964 /* 6965 * Flush unused drbrs, since hn_tx_ring_inuse may be 6966 * reduced. 6967 */ 6968 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) 6969 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6970 } 6971 6972 /* 6973 * Kick start TX. 6974 */ 6975 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { 6976 struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; 6977 6978 /* 6979 * Use txeof task, so that any pending oactive can be 6980 * cleared properly. 6981 */ 6982 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 6983 } 6984 } 6985 6986 static void 6987 hn_resume_mgmt(struct hn_softc *sc) 6988 { 6989 6990 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 6991 6992 /* 6993 * Kick off network change detection, if it was pending. 6994 * If no network change was pending, start link status 6995 * checks, which are more lightweight than network change 6996 * detection. 6997 */ 6998 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) 6999 hn_change_network(sc); 7000 else 7001 hn_update_link_status(sc); 7002 } 7003 7004 static void 7005 hn_resume(struct hn_softc *sc) 7006 { 7007 7008 /* 7009 * If the non-transparent mode VF is activated, the synthetic 7010 * device has to receive packets, so the data path of the 7011 * synthetic device must be resumed. 7012 */ 7013 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || 7014 (sc->hn_flags & HN_FLAG_RXVF)) 7015 hn_resume_data(sc); 7016 7017 /* 7018 * Don't resume link status change if VF is attached/activated. 7019 * - In the non-transparent VF mode, the synthetic device marks 7020 * link down until the VF is deactivated; i.e. VF is down. 7021 * - In transparent VF mode, VF's media status is used until 7022 * the VF is detached. 7023 */ 7024 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && 7025 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) 7026 hn_resume_mgmt(sc); 7027 7028 /* 7029 * Re-enable polling if this interface is running and 7030 * the polling is requested. 7031 */ 7032 if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) 7033 hn_polling(sc, sc->hn_pollhz); 7034 } 7035 7036 static void 7037 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) 7038 { 7039 const struct rndis_status_msg *msg; 7040 int ofs; 7041 7042 if (dlen < sizeof(*msg)) { 7043 if_printf(sc->hn_ifp, "invalid RNDIS status\n"); 7044 return; 7045 } 7046 msg = data; 7047 7048 switch (msg->rm_status) { 7049 case RNDIS_STATUS_MEDIA_CONNECT: 7050 case RNDIS_STATUS_MEDIA_DISCONNECT: 7051 hn_update_link_status(sc); 7052 break; 7053 7054 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: 7055 case RNDIS_STATUS_LINK_SPEED_CHANGE: 7056 /* Not really useful; ignore.
*/ 7057 break; 7058 7059 case RNDIS_STATUS_NETWORK_CHANGE: 7060 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); 7061 if (dlen < ofs + msg->rm_stbuflen || 7062 msg->rm_stbuflen < sizeof(uint32_t)) { 7063 if_printf(sc->hn_ifp, "network changed\n"); 7064 } else { 7065 uint32_t change; 7066 7067 memcpy(&change, ((const uint8_t *)msg) + ofs, 7068 sizeof(change)); 7069 if_printf(sc->hn_ifp, "network changed, change %u\n", 7070 change); 7071 } 7072 hn_change_network(sc); 7073 break; 7074 7075 default: 7076 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", 7077 msg->rm_status); 7078 break; 7079 } 7080 } 7081 7082 static int 7083 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) 7084 { 7085 const struct rndis_pktinfo *pi = info_data; 7086 uint32_t mask = 0; 7087 7088 while (info_dlen != 0) { 7089 const void *data; 7090 uint32_t dlen; 7091 7092 if (__predict_false(info_dlen < sizeof(*pi))) 7093 return (EINVAL); 7094 if (__predict_false(info_dlen < pi->rm_size)) 7095 return (EINVAL); 7096 info_dlen -= pi->rm_size; 7097 7098 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) 7099 return (EINVAL); 7100 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) 7101 return (EINVAL); 7102 dlen = pi->rm_size - pi->rm_pktinfooffset; 7103 data = pi->rm_data; 7104 7105 if (pi->rm_internal == 1) { 7106 switch (pi->rm_type) { 7107 case NDIS_PKTINFO_IT_PKTINFO_ID: 7108 if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) 7109 return (EINVAL); 7110 info->pktinfo_id = 7111 (const struct packet_info_id *)data; 7112 mask |= HN_RXINFO_PKTINFO_ID; 7113 break; 7114 7115 default: 7116 goto next; 7117 } 7118 } else { 7119 switch (pi->rm_type) { 7120 case NDIS_PKTINFO_TYPE_VLAN: 7121 if (__predict_false(dlen 7122 < NDIS_VLAN_INFO_SIZE)) 7123 return (EINVAL); 7124 info->vlan_info = (const uint32_t *)data; 7125 mask |= HN_RXINFO_VLAN; 7126 break; 7127 7128 case NDIS_PKTINFO_TYPE_CSUM: 7129 if (__predict_false(dlen 7130 < NDIS_RXCSUM_INFO_SIZE)) 7131 return (EINVAL); 7132 info->csum_info = (const uint32_t *)data; 7133 mask |= HN_RXINFO_CSUM; 7134 break; 7135 7136 case HN_NDIS_PKTINFO_TYPE_HASHVAL: 7137 if (__predict_false(dlen 7138 < HN_NDIS_HASH_VALUE_SIZE)) 7139 return (EINVAL); 7140 info->hash_value = (const uint32_t *)data; 7141 mask |= HN_RXINFO_HASHVAL; 7142 break; 7143 7144 case HN_NDIS_PKTINFO_TYPE_HASHINF: 7145 if (__predict_false(dlen 7146 < HN_NDIS_HASH_INFO_SIZE)) 7147 return (EINVAL); 7148 info->hash_info = (const uint32_t *)data; 7149 mask |= HN_RXINFO_HASHINF; 7150 break; 7151 7152 default: 7153 goto next; 7154 } 7155 } 7156 7157 if (mask == HN_RXINFO_ALL) { 7158 /* All found; done */ 7159 break; 7160 } 7161 next: 7162 pi = (const struct rndis_pktinfo *) 7163 ((const uint8_t *)pi + pi->rm_size); 7164 } 7165 7166 /* 7167 * Final fixup. 7168 * - If there is no hash value, invalidate the hash info. 
7169 */ 7170 if ((mask & HN_RXINFO_HASHVAL) == 0) 7171 info->hash_info = NULL; 7172 return (0); 7173 } 7174 7175 static __inline bool 7176 hn_rndis_check_overlap(int off, int len, int check_off, int check_len) 7177 { 7178 7179 if (off < check_off) { 7180 if (__predict_true(off + len <= check_off)) 7181 return (false); 7182 } else if (off > check_off) { 7183 if (__predict_true(check_off + check_len <= off)) 7184 return (false); 7185 } 7186 return (true); 7187 } 7188 7189 static __inline void 7190 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, 7191 uint32_t len, struct hn_rxinfo *info) 7192 { 7193 uint32_t cnt = rxr->rsc.cnt; 7194 7195 if (cnt) { 7196 rxr->rsc.pktlen += len; 7197 } else { 7198 rxr->rsc.vlan_info = info->vlan_info; 7199 rxr->rsc.csum_info = info->csum_info; 7200 rxr->rsc.hash_info = info->hash_info; 7201 rxr->rsc.hash_value = info->hash_value; 7202 rxr->rsc.pktlen = len; 7203 } 7204 7205 rxr->rsc.frag_data[cnt] = data; 7206 rxr->rsc.frag_len[cnt] = len; 7207 rxr->rsc.cnt++; 7208 } 7209 7210 static void 7211 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) 7212 { 7213 const struct rndis_packet_msg *pkt; 7214 struct hn_rxinfo info; 7215 int data_off, pktinfo_off, data_len, pktinfo_len; 7216 bool rsc_more = false; 7217 7218 /* 7219 * Check length. 7220 */ 7221 if (__predict_false(dlen < sizeof(*pkt))) { 7222 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); 7223 return; 7224 } 7225 pkt = data; 7226 7227 if (__predict_false(dlen < pkt->rm_len)) { 7228 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " 7229 "dlen %d, msglen %u\n", dlen, pkt->rm_len); 7230 return; 7231 } 7232 if (__predict_false(pkt->rm_len < 7233 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { 7234 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " 7235 "msglen %u, data %u, oob %u, pktinfo %u\n", 7236 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, 7237 pkt->rm_pktinfolen); 7238 return; 7239 } 7240 if (__predict_false(pkt->rm_datalen == 0)) { 7241 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); 7242 return; 7243 } 7244 7245 /* 7246 * Check offsets. 7247 */ 7248 #define IS_OFFSET_INVALID(ofs) \ 7249 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ 7250 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) 7251 7252 /* XXX Hyper-V does not meet data offset alignment requirement */ 7253 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { 7254 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7255 "data offset %u\n", pkt->rm_dataoffset); 7256 return; 7257 } 7258 if (__predict_false(pkt->rm_oobdataoffset > 0 && 7259 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { 7260 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7261 "oob offset %u\n", pkt->rm_oobdataoffset); 7262 return; 7263 } 7264 if (__predict_true(pkt->rm_pktinfooffset > 0) && 7265 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { 7266 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7267 "pktinfo offset %u\n", pkt->rm_pktinfooffset); 7268 return; 7269 } 7270 7271 #undef IS_OFFSET_INVALID 7272 7273 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); 7274 data_len = pkt->rm_datalen; 7275 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); 7276 pktinfo_len = pkt->rm_pktinfolen; 7277 7278 /* 7279 * Check OOB coverage.
7280 */ 7281 if (__predict_false(pkt->rm_oobdatalen != 0)) { 7282 int oob_off, oob_len; 7283 7284 if_printf(rxr->hn_ifp, "got oobdata\n"); 7285 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); 7286 oob_len = pkt->rm_oobdatalen; 7287 7288 if (__predict_false(oob_off + oob_len > pkt->rm_len)) { 7289 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7290 "oob overflow, msglen %u, oob abs %d len %d\n", 7291 pkt->rm_len, oob_off, oob_len); 7292 return; 7293 } 7294 7295 /* 7296 * Check against data. 7297 */ 7298 if (hn_rndis_check_overlap(oob_off, oob_len, 7299 data_off, data_len)) { 7300 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7301 "oob overlaps data, oob abs %d len %d, " 7302 "data abs %d len %d\n", 7303 oob_off, oob_len, data_off, data_len); 7304 return; 7305 } 7306 7307 /* 7308 * Check against pktinfo. 7309 */ 7310 if (pktinfo_len != 0 && 7311 hn_rndis_check_overlap(oob_off, oob_len, 7312 pktinfo_off, pktinfo_len)) { 7313 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7314 "oob overlaps pktinfo, oob abs %d len %d, " 7315 "pktinfo abs %d len %d\n", 7316 oob_off, oob_len, pktinfo_off, pktinfo_len); 7317 return; 7318 } 7319 } 7320 7321 /* 7322 * Check per-packet-info coverage and find useful per-packet-info. 7323 */ 7324 info.vlan_info = NULL; 7325 info.csum_info = NULL; 7326 info.hash_info = NULL; 7327 info.pktinfo_id = NULL; 7328 7329 if (__predict_true(pktinfo_len != 0)) { 7330 bool overlap; 7331 int error; 7332 7333 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { 7334 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7335 "pktinfo overflow, msglen %u, " 7336 "pktinfo abs %d len %d\n", 7337 pkt->rm_len, pktinfo_off, pktinfo_len); 7338 return; 7339 } 7340 7341 /* 7342 * Check packet info coverage. 7343 */ 7344 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, 7345 data_off, data_len); 7346 if (__predict_false(overlap)) { 7347 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7348 "pktinfo overlap data, pktinfo abs %d len %d, " 7349 "data abs %d len %d\n", 7350 pktinfo_off, pktinfo_len, data_off, data_len); 7351 return; 7352 } 7353 7354 /* 7355 * Find useful per-packet-info. 
7356 */ 7357 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, 7358 pktinfo_len, &info); 7359 if (__predict_false(error)) { 7360 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " 7361 "pktinfo\n"); 7362 return; 7363 } 7364 } 7365 7366 if (__predict_false(data_off + data_len > pkt->rm_len)) { 7367 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " 7368 "data overflow, msglen %u, data abs %d len %d\n", 7369 pkt->rm_len, data_off, data_len); 7370 return; 7371 } 7372 7373 /* Identify RSC fragments, drop invalid packets */ 7374 if ((info.pktinfo_id != NULL) && 7375 (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { 7376 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { 7377 rxr->rsc.cnt = 0; 7378 rxr->hn_rsc_pkts++; 7379 } else if (rxr->rsc.cnt == 0) 7380 goto drop; 7381 7382 rsc_more = true; 7383 7384 if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) 7385 rsc_more = false; 7386 7387 if (rsc_more && rxr->rsc.is_last) 7388 goto drop; 7389 } else { 7390 rxr->rsc.cnt = 0; 7391 } 7392 7393 if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) 7394 goto drop; 7395 7396 /* Store data in per rx ring structure */ 7397 hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, 7398 data_len, &info); 7399 7400 if (rsc_more) 7401 return; 7402 7403 hn_rxpkt(rxr); 7404 rxr->rsc.cnt = 0; 7405 return; 7406 drop: 7407 rxr->hn_rsc_drop++; 7408 return; 7409 } 7410 7411 static __inline void 7412 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) 7413 { 7414 const struct rndis_msghdr *hdr; 7415 7416 if (__predict_false(dlen < sizeof(*hdr))) { 7417 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); 7418 return; 7419 } 7420 hdr = data; 7421 7422 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { 7423 /* Hot data path. */ 7424 hn_rndis_rx_data(rxr, data, dlen); 7425 /* Done! */ 7426 return; 7427 } 7428 7429 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) 7430 hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); 7431 else 7432 hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); 7433 } 7434 7435 static void 7436 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) 7437 { 7438 const struct hn_nvs_hdr *hdr; 7439 7440 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { 7441 if_printf(sc->hn_ifp, "invalid nvs notify\n"); 7442 return; 7443 } 7444 hdr = VMBUS_CHANPKT_CONST_DATA(pkt); 7445 7446 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { 7447 /* Useless; ignore */ 7448 return; 7449 } 7450 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); 7451 } 7452 7453 static void 7454 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, 7455 const struct vmbus_chanpkt_hdr *pkt) 7456 { 7457 struct hn_nvs_sendctx *sndc; 7458 7459 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; 7460 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), 7461 VMBUS_CHANPKT_DATALEN(pkt)); 7462 /* 7463 * NOTE: 7464 * 'sndc' CAN NOT be accessed anymore, since it can be freed by 7465 * its callback. 
7466 */ 7467 } 7468 7469 static void 7470 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7471 const struct vmbus_chanpkt_hdr *pkthdr) 7472 { 7473 struct epoch_tracker et; 7474 const struct vmbus_chanpkt_rxbuf *pkt; 7475 const struct hn_nvs_hdr *nvs_hdr; 7476 int count, i, hlen; 7477 7478 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { 7479 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); 7480 return; 7481 } 7482 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); 7483 7484 /* Make sure that this is a RNDIS message. */ 7485 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { 7486 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", 7487 nvs_hdr->nvs_type); 7488 return; 7489 } 7490 7491 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); 7492 if (__predict_false(hlen < sizeof(*pkt))) { 7493 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); 7494 return; 7495 } 7496 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; 7497 7498 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { 7499 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", 7500 pkt->cp_rxbuf_id); 7501 return; 7502 } 7503 7504 count = pkt->cp_rxbuf_cnt; 7505 if (__predict_false(hlen < 7506 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { 7507 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); 7508 return; 7509 } 7510 7511 NET_EPOCH_ENTER(et); 7512 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ 7513 for (i = 0; i < count; ++i) { 7514 int ofs, len; 7515 7516 ofs = pkt->cp_rxbuf[i].rb_ofs; 7517 len = pkt->cp_rxbuf[i].rb_len; 7518 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { 7519 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " 7520 "ofs %d, len %d\n", i, ofs, len); 7521 continue; 7522 } 7523 7524 rxr->rsc.is_last = (i == (count - 1)); 7525 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); 7526 } 7527 NET_EPOCH_EXIT(et); 7528 7529 /* 7530 * Ack the consumed RXBUF associated w/ this channel packet, 7531 * so that this RXBUF can be recycled by the hypervisor. 7532 */ 7533 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7534 } 7535 7536 static void 7537 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7538 uint64_t tid) 7539 { 7540 struct hn_nvs_rndis_ack ack; 7541 int retries, error; 7542 7543 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7544 ack.nvs_status = HN_NVS_STATUS_OK; 7545 7546 retries = 0; 7547 again: 7548 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7549 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7550 if (__predict_false(error == EAGAIN)) { 7551 /* 7552 * NOTE: 7553 * This should _not_ happen in real world, since the 7554 * consumption of the TX bufring from the TX path is 7555 * controlled. 7556 */ 7557 if (rxr->hn_ack_failed == 0) 7558 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7559 rxr->hn_ack_failed++; 7560 retries++; 7561 if (retries < 10) { 7562 DELAY(100); 7563 goto again; 7564 } 7565 /* RXBUF leaks! */ 7566 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7567 } 7568 } 7569 7570 static void 7571 hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7572 { 7573 struct hn_rx_ring *rxr = xrxr; 7574 struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); 7575 7576 for (;;) { 7577 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7578 int error, pktlen; 7579 7580 pktlen = rxr->hn_pktbuf_len; 7581 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7582 if (__predict_false(error == ENOBUFS)) { 7583 void *nbuf; 7584 int nlen; 7585 7586 /* 7587 * Expand channel packet buffer. 
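 * (The buffer is grown to at least twice its current size, doubling
 * further as needed until it can hold the packet that did not fit.)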
7588 * 7589 * XXX 7590 * Use M_WAITOK here, since allocation failure 7591 * is fatal. 7592 */ 7593 nlen = rxr->hn_pktbuf_len * 2; 7594 while (nlen < pktlen) 7595 nlen *= 2; 7596 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7597 7598 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7599 rxr->hn_pktbuf_len, nlen); 7600 7601 free(rxr->hn_pktbuf, M_DEVBUF); 7602 rxr->hn_pktbuf = nbuf; 7603 rxr->hn_pktbuf_len = nlen; 7604 /* Retry! */ 7605 continue; 7606 } else if (__predict_false(error == EAGAIN)) { 7607 /* No more channel packets; done! */ 7608 break; 7609 } 7610 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7611 7612 switch (pkt->cph_type) { 7613 case VMBUS_CHANPKT_TYPE_COMP: 7614 hn_nvs_handle_comp(sc, chan, pkt); 7615 break; 7616 7617 case VMBUS_CHANPKT_TYPE_RXBUF: 7618 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7619 break; 7620 7621 case VMBUS_CHANPKT_TYPE_INBAND: 7622 hn_nvs_handle_notify(sc, pkt); 7623 break; 7624 7625 default: 7626 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7627 pkt->cph_type); 7628 break; 7629 } 7630 } 7631 hn_chan_rollup(rxr, rxr->hn_txr); 7632 } 7633 7634 static void 7635 hn_sysinit(void *arg __unused) 7636 { 7637 int i; 7638 7639 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7640 7641 #ifdef HN_IFSTART_SUPPORT 7642 /* 7643 * Don't use ifnet.if_start if transparent VF mode is requested; 7644 * mainly due to the IFF_DRV_OACTIVE flag. 7645 */ 7646 if (hn_xpnt_vf && hn_use_if_start) { 7647 hn_use_if_start = 0; 7648 printf("hn: transparent VF mode, if_transmit will be used, " 7649 "instead of if_start\n"); 7650 } 7651 #endif 7652 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7653 printf("hn: invalid transparent VF attach routine " 7654 "wait timeout %d, reset to %d\n", 7655 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7656 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7657 } 7658 7659 /* 7660 * Initialize VF map. 7661 */ 7662 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7663 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7664 hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF, 7665 M_WAITOK | M_ZERO); 7666 7667 /* 7668 * Fix the # of TX taskqueues. 7669 */ 7670 if (hn_tx_taskq_cnt <= 0) 7671 hn_tx_taskq_cnt = 1; 7672 else if (hn_tx_taskq_cnt > mp_ncpus) 7673 hn_tx_taskq_cnt = mp_ncpus; 7674 7675 /* 7676 * Fix the TX taskqueue mode.
7677 */ 7678 switch (hn_tx_taskq_mode) { 7679 case HN_TX_TASKQ_M_INDEP: 7680 case HN_TX_TASKQ_M_GLOBAL: 7681 case HN_TX_TASKQ_M_EVTTQ: 7682 break; 7683 default: 7684 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7685 break; 7686 } 7687 7688 if (vm_guest != VM_GUEST_HV) 7689 return; 7690 7691 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7692 return; 7693 7694 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7695 M_DEVBUF, M_WAITOK); 7696 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7697 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7698 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7699 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7700 "hn tx%d", i); 7701 } 7702 } 7703 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7704 7705 static void 7706 hn_sysuninit(void *arg __unused) 7707 { 7708 7709 if (hn_tx_taskque != NULL) { 7710 int i; 7711 7712 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7713 taskqueue_free(hn_tx_taskque[i]); 7714 free(hn_tx_taskque, M_DEVBUF); 7715 } 7716 7717 if (hn_vfmap != NULL) 7718 free(hn_vfmap, M_DEVBUF); 7719 rm_destroy(&hn_vfmap_lock); 7720 7721 counter_u64_free(hn_udpcs_fixup); 7722 } 7723 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7724