/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include <sys/pattr.h>		/* for HCK_PARTIALCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include			/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include			/* for IP6_DL_SAP */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */

#include
#include
#include
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */

#include
#include
#include
#include			/* for ibd_get_portspeed */

/*
 * Modes of hardware/driver/software checksum, useful for debugging
 * and performance studies.
 *
 * none: h/w (Tavor) and driver do not do the checksum; IP software must.
 * partial: driver does the data checksum, IP must provide the pseudo
 *	header checksum.
 * perf_partial: driver uses the IP provided pseudo cksum as the data
 *	checksum (thus, real checksumming is not done).
 */
typedef enum {
	IBD_CSUM_NONE,
	IBD_CSUM_PARTIAL,
	IBD_CSUM_PERF_PARTIAL
} ibd_csum_type_t;

typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t;

/*
 * Per interface tunable parameters.
 */
uint_t ibd_rx_threshold = 16;
uint_t ibd_tx_current_copy_threshold = 0x10000000;
/* should be less than the max Tavor CQ size and be 2^n - 1 */
uint_t ibd_num_rwqe = 511;
uint_t ibd_num_swqe = 511;
uint_t ibd_num_ah = 16;
uint_t ibd_hash_size = 16;
uint_t ibd_srv_fifos = 0x0;
uint_t ibd_fifo_depth = 0;
ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE;
ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE;

/*
 * The driver can use separate CQs for send and receive queues.
 * While using separate CQs, it is possible to put the send CQ
 * in polling mode, i.e. not to enable notifications on that CQ.
 * If both CQs are interrupt driven, currently it is not possible
 * for their handlers to be invoked concurrently (since Tavor ties
 * both interrupts to the same PCI intr line); but the handlers
 * are not coded with a single interrupt cpu assumption (e.g.
 * id_num_intrs is incremented atomically).
 *
 * The driver private struct uses id_scq_hdl to track the separate
 * CQ being used for send; the id_rcq_hdl tracks the receive CQ
 * if using separate CQs, or it tracks the single CQ when using
 * a combined CQ. The id_wcs completion array is used in the combined
 * CQ case, and for fetching Rx completions in the separate CQs case;
 * the id_txwcs is used to fetch Tx completions in the separate CQs
 * case.
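 *
 * For example, with the defaults below (ibd_separate_cqs = 1,
 * ibd_txcomp_poll = 0) both CQs are interrupt driven; setting
 * ibd_txcomp_poll = 1 puts the send CQ in polling mode so that Tx
 * completions are reaped without CQ notifications.  (The sanity check
 * in _init() below forces ibd_txcomp_poll back to 0 when a combined
 * CQ is configured.)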
 */
uint_t ibd_separate_cqs = 1;
uint_t ibd_txcomp_poll = 0;

/*
 * The softintr is introduced to avoid Event Queue overflow; the
 * CQ event handler function should not carry a heavy load.
 * If service fifos are enabled, this is not required, because
 * mac_rx() will be called by service threads.
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

/*
 * Initial number of IBA resources allocated.
 */
#define	IBD_NUM_RWQE		ibd_num_rwqe
#define	IBD_NUM_SWQE		ibd_num_swqe
#define	IBD_NUM_AH		ibd_num_ah

/* when <= threshold, it's faster to copy to a premapped buffer */
#define	IBD_TX_COPY_THRESHOLD	ibd_tx_current_copy_threshold

/*
 * When the number of WQEs on the rxlist < IBD_RX_THRESHOLD, ibd will
 * allocate a new WQE to put on the rxlist. This value must be <=
 * IBD_NUM_RWQE/id_num_rwqe.
 */
#define	IBD_RX_THRESHOLD	ibd_rx_threshold

/*
 * Hash table size for the active AH list.
 */
#define	IBD_HASH_SIZE		ibd_hash_size

#define	IBD_TXPOLL_THRESHOLD	64

/*
 * PAD routine called during send/recv context
 */
#define	IBD_SEND	0
#define	IBD_RECV	1

/*
 * Fill / clear the scope and pkey bits in a multicast/broadcast address.
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
{							\
	*(uint32_t *)((char *)(maddr) + 4) |=		\
	    htonl((uint32_t)(scope) << 16);		\
	*(uint32_t *)((char *)(maddr) + 8) |=		\
	    htonl((uint32_t)(pkey) << 16);		\
}

#define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
{							\
	*(uint32_t *)((char *)(maddr) + 4) &=		\
	    htonl(~((uint32_t)0xF << 16));		\
	*(uint32_t *)((char *)(maddr) + 8) &=		\
	    htonl(~((uint32_t)0xFFFF << 16));		\
}

/*
 * When free tx wqes >= threshold and the reschedule flag is set,
 * ibd will call mac_tx_update to re-enable Tx.
 */
#define	IBD_TX_UPDATE_THRESHOLD	1

/* Driver State Pointer */
void *ibd_list;

/* Required system entry points */
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/* Required driver entry points for GLDv3 */
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_unicst(void *, const uint8_t *);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_stat(void *, uint_t, uint64_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
static mblk_t *ibd_m_tx(void *, mblk_t *);

/* Private driver entry points for GLDv3 */
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static uint_t ibd_intr(char *);
static uint_t ibd_tx_recycle(char *);
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static void ibd_state_fini(ibd_state_t *);
static int ibd_drv_init(ibd_state_t *);
static void ibd_drv_fini(ibd_state_t *);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);
static int ibd_init_txlist(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_freemsg_cb(char *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **);
static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static int ibd_acache_init(ibd_state_t *);
static void
ibd_acache_fini(ibd_state_t *); static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); static void ibd_async_unsetprom(ibd_state_t *); static void ibd_async_setprom(ibd_state_t *); static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); static void ibd_async_txsched(ibd_state_t *); static void ibd_async_trap(ibd_state_t *, ibd_req_t *); static void ibd_async_work(ibd_state_t *); static void ibd_async_link(ibd_state_t *, ibd_req_t *); static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); static boolean_t ibd_get_allroutergroup(ibd_state_t *, ipoib_mac_t *, ipoib_mac_t *); static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_deregister_mr(ibd_state_t *, ibd_swqe_t *); static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); static uint64_t ibd_get_portspeed(ibd_state_t *); #ifdef RUN_PERFORMANCE static void ibd_perf(ibd_state_t *); #endif DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); /* Module Driver Info */ static struct modldrv ibd_modldrv = { &mod_driverops, /* This one is a driver */ "InfiniBand GLDv3 Driver", /* short description */ &ibd_dev_ops /* driver specific ops */ }; /* Module Linkage */ static struct modlinkage ibd_modlinkage = { MODREV_1, (void *)&ibd_modldrv, NULL }; /* * Module Info passed to IBTL during IBT_ATTACH. * NOTE: This data must be static (i.e. IBTL just keeps a pointer to this * data). */ static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { IBTI_V_CURR, IBT_NETWORK, ibd_async_handler, NULL, "IPIB" }; /* * Async operation types. */ #define ASYNC_GETAH 1 #define ASYNC_JOIN 2 #define ASYNC_LEAVE 3 #define ASYNC_PROMON 4 #define ASYNC_PROMOFF 5 #define ASYNC_REAP 6 #define ASYNC_TRAP 8 #define ASYNC_SCHED 9 #define ASYNC_LINK 10 #define ASYNC_EXIT 11 /* * Async operation states */ #define NOTSTARTED 0 #define ONGOING 1 #define COMPLETED 2 #define ERRORED 3 #define ROUTERED 4 #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) static mac_callbacks_t ib_m_callbacks = { IBD_M_CALLBACK_FLAGS, ibd_m_stat, ibd_m_start, ibd_m_stop, ibd_m_promisc, ibd_m_multicst, ibd_m_unicst, ibd_m_tx, NULL, ibd_m_getcapab }; #ifdef DEBUG static int rxpack = 1, txpack = 1; int ibd_debuglevel = 100; static void debug_print(int l, char *fmt, ...) { va_list ap; if (l < ibd_debuglevel) return; va_start(ap, fmt); vcmn_err(CE_CONT, fmt, ap); va_end(ap); } #define INCRXPACK (rxpack++) #define INCTXPACK (txpack++) #define DPRINT debug_print #else /* DEBUG */ #define INCRXPACK 0 #define INCTXPACK 0 #define DPRINT #endif /* DEBUG */ /* * Common routine to print warning messages; adds in hca guid, port number * and pkey to be able to identify the IBA interface. */ static void ibd_print_warn(ibd_state_t *state, char *fmt, ...) 
{ ib_guid_t hca_guid; char ibd_print_buf[256]; int len; va_list ap; hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 0, "hca-guid", 0); len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), "%s%d: HCA GUID %016llx port %d PKEY %02x ", ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), (u_longlong_t)hca_guid, state->id_port, state->id_pkey); va_start(ap, fmt); (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, fmt, ap); cmn_err(CE_NOTE, "!%s", ibd_print_buf); va_end(ap); } /* warlock directives */ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_active)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_free)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_req_list)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_acache_req_cv)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_full)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_non)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, ibd_state_t::id_link_state)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_s::id_tx_list)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, ibd_state_s::id_rx_list)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_prom_op)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_short)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_rx_list)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_tx_list)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_op)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_gid)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_rq::rq_ptr)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_mce)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_acache_s::ac_ref)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_wqe_s)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_rwqe_s)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_swqe_s)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_mac)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ipoib_pgrh)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ib_gid_s)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_req)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_fullreap)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) #ifdef DEBUG _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", rxpack)) _NOTE(SCHEME_PROTECTS_DATA("Protected_by_Scheme", txpack)) #endif int _init() { int status; /* * Sanity check some parameter settings. Tx completion polling * only makes sense with separate CQs for Tx and Rx. 
*/ if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { cmn_err(CE_NOTE, "!ibd: %s", "Setting ibd_txcomp_poll = 0 for combined CQ"); ibd_txcomp_poll = 0; } status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); if (status != 0) { DPRINT(10, "_init:failed in ddi_soft_state_init()"); return (status); } mac_init_ops(&ibd_dev_ops, "ibd"); status = mod_install(&ibd_modlinkage); if (status != 0) { DPRINT(10, "_init:failed in mod_install()"); ddi_soft_state_fini(&ibd_list); mac_fini_ops(&ibd_dev_ops); return (status); } return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&ibd_modlinkage, modinfop)); } int _fini() { int status; status = mod_remove(&ibd_modlinkage); if (status != 0) return (status); mac_fini_ops(&ibd_dev_ops); ddi_soft_state_fini(&ibd_list); return (0); } /* * Convert the GID part of the mac address from network byte order * to host order. */ static void ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) { ib_sn_prefix_t nbopref; ib_guid_t nboguid; bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); dgid->gid_prefix = b2h64(nbopref); dgid->gid_guid = b2h64(nboguid); } /* * Create the IPoIB address in network byte order from host order inputs. */ static void ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, ib_guid_t guid) { ib_sn_prefix_t nbopref; ib_guid_t nboguid; mac->ipoib_qpn = htonl(qpn); nbopref = h2b64(prefix); nboguid = h2b64(guid); bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); } /* * Send to the appropriate all-routers group when the IBA multicast group * does not exist, based on whether the target group is v4 or v6. */ static boolean_t ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, ipoib_mac_t *rmac) { boolean_t retval = B_TRUE; uint32_t adjscope = state->id_scope << 16; uint32_t topword; /* * Copy the first 4 bytes in without assuming any alignment of * input mac address; this will have IPoIB signature, flags and * scope bits. */ bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); topword = ntohl(topword); /* * Generate proper address for IPv4/v6, adding in the Pkey properly. */ if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | ((uint32_t)(state->id_pkey << 16))), (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); else /* * Does not have proper bits in the mgid address. */ retval = B_FALSE; return (retval); } /* * Implementation of various (software) flavors of send and receive side * checksumming. */ #define IBD_CKSUM_SEND(mp) { \ uint32_t start, stuff, end, value, flags; \ uint32_t cksum, sum; \ uchar_t *dp, *buf; \ uint16_t *up; \ \ if (ibd_csum_send == IBD_CSUM_NONE) \ goto punt_send; \ \ /* \ * Query IP whether Tx cksum needs to be done. \ */ \ hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, \ &value, &flags); \ \ if (flags == HCK_PARTIALCKSUM) { \ dp = ((uchar_t *)mp->b_rptr + IPOIB_HDRSIZE); \ up = (uint16_t *)(dp + stuff); \ if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ end = ((uchar_t *)mp->b_wptr - dp - start); \ cksum = *up; \ *up = 0; \ /* \ * Does NOT handle chained mblks/more than one \ * SGL. Applicable only for a single SGL \ * entry/mblk, where the stuff offset is \ * within the range of buf. 
\ */ \ buf = (dp + start); \ sum = IP_BCSUM_PARTIAL(buf, end, cksum); \ } else { \ sum = *up; \ } \ DPRINT(10, "strt %d stff %d end %d sum: %x csm %x \n", \ start, stuff, end, sum, cksum); \ sum = ~(sum); \ *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ } \ punt_send: \ ; \ } #define IBD_CKSUM_RECV(mp) { \ uchar_t *dp, *buf; \ uint32_t start, end, value, stuff, flags; \ uint16_t *up, frag; \ ipha_t *iphp; \ ipoib_hdr_t *ipibh; \ \ if (ibd_csum_recv == IBD_CSUM_NONE) \ goto punt_recv; \ \ ipibh = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);\ if (ntohs(ipibh->ipoib_type) != ETHERTYPE_IP) \ goto punt_recv; \ \ dp = ((uchar_t *)ipibh + IPOIB_HDRSIZE); \ iphp = (ipha_t *)dp; \ frag = ntohs(iphp->ipha_fragment_offset_and_flags); \ if ((frag) & (~IPH_DF)) \ goto punt_recv; \ start = IPH_HDR_LENGTH(iphp); \ if (iphp->ipha_protocol == IPPROTO_TCP) \ stuff = start + 16; \ else if (iphp->ipha_protocol == IPPROTO_UDP) \ stuff = start + 6; \ else \ goto punt_recv; \ \ flags = HCK_PARTIALCKSUM; \ end = ntohs(iphp->ipha_length); \ up = (uint16_t *)(dp + stuff); \ \ if (ibd_csum_recv == IBD_CSUM_PARTIAL) { \ buf = (dp + start); \ value = IP_BCSUM_PARTIAL(buf, end - start, 0); \ } else { \ value = (*up); \ } \ if (hcksum_assoc(mp, NULL, NULL, start, stuff, end, \ value, flags, 0) != 0) \ DPRINT(10, "cksum_recv: value: %x\n", value); \ punt_recv: \ ; \ } /* * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at * front of optional src/tgt link layer address. Right now Solaris inserts * padding by default at the end. The routine which is doing is nce_xmit() * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when * the packet comes down from IP layer to the IBD driver, it is in the * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result * machdr is not 4 byte aligned and had 2 bytes of padding at the end. * * The send routine at IBD driver changes this packet as follows: * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte * aligned. * * At the receiving side again ibd_process_rx takes the above packet and * removes the two bytes of front padding and inserts it at the end. This * is since the IP layer does not understand padding at the front. */ #define IBD_PAD_NSNA(ip6h, len, type) { \ uchar_t *nd_lla_ptr; \ icmp6_t *icmp6; \ nd_opt_hdr_t *opt; \ int i; \ \ icmp6 = (icmp6_t *)&ip6h[1]; \ len -= sizeof (nd_neighbor_advert_t); \ if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ (len != 0)) { \ opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ ASSERT(opt != NULL); \ nd_lla_ptr = (uchar_t *)&opt[1]; \ if (type == 0) { \ for (i = IPOIB_ADDRL; i > 0; i--) \ *(nd_lla_ptr + i + 1) = \ *(nd_lla_ptr + i - 1); \ } else { \ for (i = 0; i < IPOIB_ADDRL; i++) \ *(nd_lla_ptr + i) = \ *(nd_lla_ptr + i + 2); \ } \ *(nd_lla_ptr + i) = 0; \ *(nd_lla_ptr + i + 1) = 0; \ } \ } /* * The service fifo code is copied verbatim from Cassini. This can be * enhanced by doing a cpu_bind_thread() to bind each fifo to a cpu. 
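/*
 * In outline: each srv_fifo_t defined below is a fixed-size ring of
 * pointers (fifo_objs) indexed by rd_index/wr_index and guarded by
 * fifo_lock/fifo_cv.  A dedicated kernel thread runs drain_func
 * (drain_fifo in this driver), pulling entries off the ring with
 * _ddi_get_fifo() and handing each mblk to mac_rx().
 */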
*/ typedef caddr_t fifo_obj_t, *p_fifo_obj_t; typedef struct _srv_fifo_t { kmutex_t fifo_lock; kcondvar_t fifo_cv; size_t size; uint_t max_index; uint_t rd_index; uint_t wr_index; uint_t objs_pending; p_fifo_obj_t fifo_objs; kthread_t *fifo_thread; void (*drain_func)(caddr_t drain_func_arg); caddr_t drain_func_arg; boolean_t running; callb_cpr_t cprinfo; } srv_fifo_t, *p_srv_fifo_t; _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::fifo_cv)) _NOTE(MUTEX_PROTECTS_DATA(_srv_fifo_t::fifo_lock, _srv_fifo_t::cprinfo)) static int _ddi_srv_fifo_create(p_srv_fifo_t *handle, size_t size, void (*drain_func)(), caddr_t drain_func_arg) { int status; p_srv_fifo_t srv_fifo; status = DDI_SUCCESS; srv_fifo = (p_srv_fifo_t)kmem_zalloc(sizeof (srv_fifo_t), KM_SLEEP); srv_fifo->size = size; srv_fifo->max_index = size - 1; srv_fifo->fifo_objs = (p_fifo_obj_t)kmem_zalloc( size * sizeof (fifo_obj_t), KM_SLEEP); mutex_init(&srv_fifo->fifo_lock, "srv_fifo", MUTEX_DRIVER, NULL); cv_init(&srv_fifo->fifo_cv, "srv_fifo", CV_DRIVER, NULL); srv_fifo->drain_func = drain_func; srv_fifo->drain_func_arg = drain_func_arg; srv_fifo->running = DDI_SUCCESS; srv_fifo->fifo_thread = thread_create(NULL, 0, drain_func, (caddr_t)srv_fifo, 0, &p0, TS_RUN, 60); if (srv_fifo->fifo_thread == NULL) { cv_destroy(&srv_fifo->fifo_cv); mutex_destroy(&srv_fifo->fifo_lock); kmem_free(srv_fifo->fifo_objs, size * sizeof (fifo_obj_t)); kmem_free(srv_fifo, sizeof (srv_fifo_t)); srv_fifo = NULL; status = DDI_FAILURE; } else *handle = srv_fifo; return (status); } static void _ddi_srv_fifo_destroy(p_srv_fifo_t handle) { kt_did_t tid = handle->fifo_thread->t_did; mutex_enter(&handle->fifo_lock); handle->running = DDI_FAILURE; cv_signal(&handle->fifo_cv); while (handle->running == DDI_FAILURE) cv_wait(&handle->fifo_cv, &handle->fifo_lock); mutex_exit(&handle->fifo_lock); if (handle->objs_pending != 0) cmn_err(CE_NOTE, "!Thread Exit with work undone."); cv_destroy(&handle->fifo_cv); mutex_destroy(&handle->fifo_lock); kmem_free(handle->fifo_objs, handle->size * sizeof (fifo_obj_t)); kmem_free(handle, sizeof (srv_fifo_t)); thread_join(tid); } static caddr_t _ddi_srv_fifo_begin(p_srv_fifo_t handle) { #ifndef __lock_lint CALLB_CPR_INIT(&handle->cprinfo, &handle->fifo_lock, callb_generic_cpr, "srv_fifo"); #endif /* ! _lock_lint */ return (handle->drain_func_arg); } static void _ddi_srv_fifo_end(p_srv_fifo_t handle) { callb_cpr_t cprinfo; mutex_enter(&handle->fifo_lock); cprinfo = handle->cprinfo; handle->running = DDI_SUCCESS; cv_signal(&handle->fifo_cv); #ifndef __lock_lint CALLB_CPR_EXIT(&cprinfo); #endif /* ! 
_lock_lint */ thread_exit(); _NOTE(NOT_REACHED) } static int _ddi_put_fifo(p_srv_fifo_t handle, fifo_obj_t ptr, boolean_t signal) { int status; mutex_enter(&handle->fifo_lock); status = handle->running; if (status == DDI_SUCCESS) { if (ptr) { if (handle->objs_pending < handle->size) { if (handle->wr_index == handle->max_index) handle->wr_index = 0; else handle->wr_index++; handle->fifo_objs[handle->wr_index] = ptr; handle->objs_pending++; } else status = DDI_FAILURE; if (signal) cv_signal(&handle->fifo_cv); } else { if (signal && (handle->objs_pending > 0)) cv_signal(&handle->fifo_cv); } } mutex_exit(&handle->fifo_lock); return (status); } static int _ddi_get_fifo(p_srv_fifo_t handle, p_fifo_obj_t ptr) { int status; mutex_enter(&handle->fifo_lock); status = handle->running; if (status == DDI_SUCCESS) { if (handle->objs_pending == 0) { #ifndef __lock_lint CALLB_CPR_SAFE_BEGIN(&handle->cprinfo); cv_wait(&handle->fifo_cv, &handle->fifo_lock); CALLB_CPR_SAFE_END(&handle->cprinfo, &handle->fifo_lock); #endif /* !_lock_lint */ *ptr = NULL; } if (handle->objs_pending > 0) { if (handle->rd_index == handle->max_index) handle->rd_index = 0; else handle->rd_index++; *ptr = handle->fifo_objs[handle->rd_index]; handle->objs_pending--; } status = handle->running; } else { if (handle->objs_pending) { if (handle->rd_index == handle->max_index) handle->rd_index = 0; else handle->rd_index++; *ptr = handle->fifo_objs[handle->rd_index]; handle->objs_pending--; status = DDI_SUCCESS; } else status = DDI_FAILURE; } mutex_exit(&handle->fifo_lock); return (status); } /* * [un]map_rx_srv_fifos has been modified from its CE version. */ static void drain_fifo(p_srv_fifo_t handle) { ibd_state_t *state; mblk_t *mp; state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { /* * Hand off to GLDv3. */ IBD_CKSUM_RECV(mp); mac_rx(state->id_mh, NULL, mp); } _ddi_srv_fifo_end(handle); } static p_srv_fifo_t * map_rx_srv_fifos(int *nfifos, void *private) { p_srv_fifo_t *srv_fifos; int i, inst_taskqs, depth; /* * Default behavior on both sparc and amd cpus in terms of * of worker thread is as follows: (N) indicates worker thread * not enabled , (Y) indicates worker thread enabled. Default of * ibd_srv_fifo is set to 0xffff. The default behavior can be * overridden by setting ibd_srv_fifos to 0 or 1 as shown below. * Worker thread model assigns lower priority to network * processing making system more usable at higher network * loads. * ________________________________________________________ * |Value of ibd_srv_fifo | 0 | 1 | 0xffff| 0 | 1 | 0xfffff | * |----------------------|---|---|-------|---|---|---------| * | | Sparc | x86 | * |----------------------|---|---|-------|---|---|---------| * | Single CPU |N | Y | N | N | Y | N | * |----------------------|---|---|-------|---|---|---------| * | Multi CPU |N | Y | Y | N | Y | Y | * |______________________|___|___|_______|___|___|_________| */ if ((((inst_taskqs = ncpus) == 1) && (ibd_srv_fifos != 1)) || (ibd_srv_fifos == 0)) { *nfifos = 0; return ((p_srv_fifo_t *)1); } *nfifos = inst_taskqs; srv_fifos = kmem_zalloc(inst_taskqs * sizeof (p_srv_fifo_t), KM_SLEEP); /* * If the administrator has specified a fifo depth, use * that, else just decide what should be the depth. 
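	 *
	 * For example, with the default IBD_NUM_RWQE of 511 on a 4-CPU
	 * system, the computed depth below is 511 / 4 + 16 = 143 entries
	 * per fifo.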
*/ if (ibd_fifo_depth == 0) depth = (IBD_NUM_RWQE / inst_taskqs) + 16; else depth = ibd_fifo_depth; for (i = 0; i < inst_taskqs; i++) if (_ddi_srv_fifo_create(&srv_fifos[i], depth, drain_fifo, (caddr_t)private) != DDI_SUCCESS) break; if (i < inst_taskqs) goto map_rx_srv_fifos_fail1; goto map_rx_srv_fifos_exit; map_rx_srv_fifos_fail1: i--; for (; i >= 0; i--) { _ddi_srv_fifo_destroy(srv_fifos[i]); } kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); srv_fifos = NULL; map_rx_srv_fifos_exit: return (srv_fifos); } static void unmap_rx_srv_fifos(int inst_taskqs, p_srv_fifo_t *srv_fifos) { int i; /* * If this interface was not using service fifos, quickly return. */ if (inst_taskqs == 0) return; for (i = 0; i < inst_taskqs; i++) { _ddi_srv_fifo_destroy(srv_fifos[i]); } kmem_free(srv_fifos, inst_taskqs * sizeof (p_srv_fifo_t)); } /* * Choose between sending up the packet directly and handing off * to a service thread. */ static void ibd_send_up(ibd_state_t *state, mblk_t *mp) { p_srv_fifo_t *srvfifo; ipoib_hdr_t *lhdr; struct ip *ip_hdr; struct udphdr *tran_hdr; uchar_t prot; int tnum = -1, nfifos = state->id_nfifos; /* * Quick path if the interface is not using service fifos. */ if (nfifos == 0) { hand_off: IBD_CKSUM_RECV(mp); mac_rx(state->id_mh, NULL, mp); return; } /* * Is the packet big enough to look at the IPoIB header * and basic IP header to determine whether it is an * IPv4 packet? */ if (MBLKL(mp) >= (IPOIB_GRH_SIZE + IPOIB_HDRSIZE + sizeof (struct ip))) { lhdr = (ipoib_hdr_t *)(mp->b_rptr + IPOIB_GRH_SIZE); /* * Is the packet an IP(v4) packet? */ if (ntohs(lhdr->ipoib_type) == ETHERTYPE_IP) { ip_hdr = (struct ip *)(mp->b_rptr + IPOIB_GRH_SIZE + IPOIB_HDRSIZE); prot = ip_hdr->ip_p; /* * TCP or UDP packet? We use the UDP header, since * the first few words of both headers are laid out * similarly (src/dest ports). */ if ((prot == IPPROTO_TCP) || (prot == IPPROTO_UDP)) { tran_hdr = (struct udphdr *)( (uint8_t *)ip_hdr + (ip_hdr->ip_hl << 2)); /* * Are we within limits of this packet? If * so, use the destination port to hash to * a service thread. */ if (mp->b_wptr >= ((uchar_t *)tran_hdr + sizeof (*tran_hdr))) tnum = (ntohs(tran_hdr->uh_dport) + ntohs(tran_hdr->uh_sport)) % nfifos; } } } /* * For non TCP/UDP traffic (eg SunCluster heartbeat), we hand the * packet up in interrupt context, reducing latency. */ if (tnum == -1) { goto hand_off; } srvfifo = (p_srv_fifo_t *)state->id_fifos; if (_ddi_put_fifo(srvfifo[tnum], (fifo_obj_t)mp, B_TRUE) != DDI_SUCCESS) freemsg(mp); } /* * Address handle entries maintained by the driver are kept in the * free and active lists. Each entry starts out in the free list; * it migrates to the active list when primed using ibt_get_paths() * and ibt_modify_ud_dest() for transmission to a specific destination. * In the active list, the entry has a reference count indicating the * number of ongoing/uncompleted transmits that reference it. The * entry is left in the active list even after the reference count * goes to 0, since successive transmits can find it there and do * not need to set up another entry (ie the path information is * cached using the active list). Entries on the active list are * also hashed using the destination link address as a key for faster * lookups during transmits. * * For any destination address (unicast or multicast, whatever the * join states), there will be at most one entry in the active list. 
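 *
 * For illustration, the common Tx fast path is: ibd_acache_lookup()
 * finds the destination in id_ah_active_hash, INC_REF() pins the entry
 * for the duration of the post, and the Tx completion handler later
 * drops that reference (see DEC_REF_DO_CYCLE()), leaving the cached
 * path in place for the next send to the same destination.
 *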
* Entries with a 0 reference count on the active list can be reused * for a transmit to a new destination, if the free list is empty. * * The AH free list insertion/deletion is protected with the id_ac_mutex, * since the async thread and Tx callback handlers insert/delete. The * active list does not need a lock (all operations are done by the * async thread) but updates to the reference count are atomically * done (increments done by Tx path, decrements by the Tx callback handler). */ #define IBD_ACACHE_INSERT_FREE(state, ce) \ list_insert_head(&state->id_ah_free, ce) #define IBD_ACACHE_GET_FREE(state) \ list_get_head(&state->id_ah_free) #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ int _ret_; \ list_insert_head(&state->id_ah_active, ce); \ _ret_ = mod_hash_insert(state->id_ah_active_hash, \ (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ ASSERT(_ret_ == 0); \ } #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ list_remove(&state->id_ah_active, ce); \ (void) mod_hash_remove(state->id_ah_active_hash, \ (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ } #define IBD_ACACHE_GET_ACTIVE(state) \ list_get_head(&state->id_ah_active) /* * Membership states for different mcg's are tracked by two lists: * the "non" list is used for promiscuous mode, when all mcg traffic * needs to be inspected. This type of membership is never used for * transmission, so there can not be an AH in the active list * corresponding to a member in this list. This list does not need * any protection, since all operations are performed by the async * thread. * * "Full" and "SendOnly" membership is tracked using a single list, * the "full" list. This is because this single list can then be * searched during transmit to a multicast group (if an AH for the * mcg is not found in the active list), since at least one type * of membership must be present before initiating the transmit. * This list is also emptied during driver detach, since sendonly * membership acquired during transmit is dropped at detach time * alongwith ipv4 broadcast full membership. Insert/deletes to * this list are done only by the async thread, but it is also * searched in program context (see multicast disable case), thus * the id_mc_mutex protects the list. The driver detach path also * deconstructs the "full" list, but it ensures that the async * thread will not be accessing the list (by blocking out mcg * trap handling and making sure no more Tx reaping will happen). * * Currently, an IBA attach is done in the SendOnly case too, * although this is not required. */ #define IBD_MCACHE_INSERT_FULL(state, mce) \ list_insert_head(&state->id_mc_full, mce) #define IBD_MCACHE_INSERT_NON(state, mce) \ list_insert_head(&state->id_mc_non, mce) #define IBD_MCACHE_FIND_FULL(state, mgid) \ ibd_mcache_find(mgid, &state->id_mc_full) #define IBD_MCACHE_FIND_NON(state, mgid) \ ibd_mcache_find(mgid, &state->id_mc_non) #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ list_remove(&state->id_mc_full, mce) #define IBD_MCACHE_PULLOUT_NON(state, mce) \ list_remove(&state->id_mc_non, mce) /* * AH and MCE active list manipulation: * * Multicast disable requests and MCG delete traps are two cases * where the active AH entry for the mcg (if any unreferenced one exists) * will be moved to the free list (to force the next Tx to the mcg to * join the MCG in SendOnly mode). Port up handling will also move AHs * from active to free list. 
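 * (The port up case is handled by ibd_async_link() below, which walks
 * id_ah_active and calls ibd_acache_recycle(..., force == B_TRUE) on
 * every entry.)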
* * In the case when some transmits are still pending on an entry * for an mcg, but a multicast disable has already been issued on the * mcg, there are some options to consider to preserve the join state * to ensure the emitted packet is properly routed on the IBA fabric. * For the AH, we can * 1. take out of active list at multicast disable time. * 2. take out of active list only when last pending Tx completes. * For the MCE, we can * 3. take out of active list at multicast disable time. * 4. take out of active list only when last pending Tx completes. * 5. move from active list to stale list at multicast disable time. * We choose to use 2,4. We use option 4 so that if a multicast enable * is tried before the pending Tx completes, the enable code finds the * mce in the active list and just has to make sure it will not be reaped * (ie the mcg leave done) when the pending Tx does complete. Alternatively, * a stale list (#5) that would be checked in the enable code would need * to be implemented. Option 2 is used, because otherwise, a Tx attempt * after the multicast disable would try to put an AH in the active list, * and associate the mce it finds in the active list to this new AH, * whereas the mce is already associated with the previous AH (taken off * the active list), and will be removed once the pending Tx's complete * (unless a reference count on mce's is implemented). One implication of * using 2,4 is that new Tx's posted before the pending Tx's complete will * grab new references on the AH, further delaying the leave. * * In the case of mcg delete (or create) trap when the port is sendonly * joined, the AH and MCE handling is different: the AH and MCE has to be * immediately taken off the active lists (forcing a join and path lookup * at the next Tx is the only guaranteed means of ensuring a proper Tx * to an mcg as it is repeatedly created and deleted and goes thru * reincarnations). * * When a port is already sendonly joined, and a multicast enable is * attempted, the same mce structure is promoted; this ensures only a * single mce on the active list tracks the most powerful join state. * * In the case of port up event handling, the MCE for sendonly membership * is freed up, and the ACE is put into the free list as soon as possible * (depending on whether posted Tx's have completed). For fullmembership * MCE's though, the ACE is similarly handled; but the MCE is kept around * (a re-JOIN is attempted) only if the DLPI leave has not already been * done; else the mce is deconstructed (mc_fullreap case). * * MCG creation and deletion trap handling: * * These traps are unreliable (meaning sometimes the trap might never * be delivered to the subscribed nodes) and may arrive out-of-order * since they use UD transport. An alternative to relying on these * unreliable traps is to poll for mcg presence every so often, but * instead of doing that, we try to be as conservative as possible * while handling the traps, and hope that the traps do arrive at * the subscribed nodes soon. Note that if a node is fullmember * joined to an mcg, it can not possibly receive a mcg create/delete * trap for that mcg (by fullmember definition); if it does, it is * an old trap from a previous incarnation of the mcg. * * Whenever a trap is received, the driver cleans up its sendonly * membership to the group; we choose to do a sendonly leave even * on a creation trap to handle the case of a prior deletion of the mcg * having gone unnoticed. 
Consider an example scenario: * T1: MCG M is deleted, and fires off deletion trap D1. * T2: MCG M is recreated, fires off creation trap C1, which is lost. * T3: Node N tries to transmit to M, joining in sendonly mode. * T4: MCG M is deleted, and fires off deletion trap D2. * T5: N receives a deletion trap, but can not distinguish D1 from D2. * If the trap is D2, then a LEAVE is not required, since the mcg * is already deleted; but if it is D1, a LEAVE is required. A safe * approach is to always LEAVE, but the SM may be confused if it * receives a LEAVE without a prior JOIN. * * Management of the non-membership to an mcg is similar to the above, * except that if the interface is in promiscuous mode, it is required * to attempt to re-join the mcg after receiving a trap. Unfortunately, * if the re-join attempt fails (in which case a warning message needs * to be printed), it is not clear whether it failed due to the mcg not * existing, or some fabric/hca issues, due to the delayed nature of * trap delivery. Querying the SA to establish presence/absence of the * mcg is also racy at best. Thus, the driver just prints a warning * message when it can not rejoin after receiving a create trap, although * this might be (on rare occassions) a mis-warning if the create trap is * received after the mcg was deleted. */ /* * Implementation of atomic "recycle" bits and reference count * on address handles. This utilizes the fact that max reference * count on any handle is limited by number of send wqes, thus * high bits in the ac_ref field can be used as the recycle bits, * and only the low bits hold the number of pending Tx requests. * This atomic AH reference counting allows the Tx completion * handler not to acquire the id_ac_mutex to process every completion, * thus reducing lock contention problems between completion and * the Tx path. */ #define CYCLEVAL 0x80000 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) #define GET_REF(ace) ((ace)->ac_ref) #define GET_REF_CYCLE(ace) ( \ /* \ * Make sure "cycle" bit is set. \ */ \ ASSERT(CYCLE_SET(ace)), \ ((ace)->ac_ref & ~(CYCLEVAL)) \ ) #define INC_REF(ace, num) { \ atomic_add_32(&(ace)->ac_ref, num); \ } #define SET_CYCLE_IF_REF(ace) ( \ CYCLE_SET(ace) ? B_TRUE : \ atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ CYCLEVAL ? \ /* \ * Clear the "cycle" bit we just set; \ * ref count known to be 0 from above. \ */ \ CLEAR_REFCYCLE(ace), B_FALSE : \ /* \ * We set "cycle" bit; let caller know. \ */ \ B_TRUE \ ) #define DEC_REF_DO_CYCLE(ace) ( \ atomic_add_32_nv(&ace->ac_ref, -1) == \ CYCLEVAL ? \ /* \ * Ref count known to be 0 from above. \ */ \ B_TRUE : \ B_FALSE \ ) static void * list_get_head(list_t *list) { list_node_t *lhead = list_head(list); if (lhead != NULL) list_remove(list, lhead); return (lhead); } /* * This is always guaranteed to be able to queue the work. */ static void ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) { /* Initialize request */ DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); ptr->rq_op = op; /* * Queue provided slot onto request pool. */ mutex_enter(&state->id_acache_req_lock); list_insert_tail(&state->id_req_list, ptr); /* Go, fetch, async thread */ cv_signal(&state->id_acache_req_cv); mutex_exit(&state->id_acache_req_lock); } /* * Main body of the per interface async thread. 
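 * It dequeues ibd_req_t entries from id_req_list (queued via
 * ibd_queue_work_slot() above) and dispatches on rq_op: ASYNC_GETAH,
 * ASYNC_JOIN/LEAVE, ASYNC_PROMON/PROMOFF, ASYNC_REAP, ASYNC_TRAP,
 * ASYNC_SCHED and ASYNC_LINK, with ASYNC_EXIT terminating the thread.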
*/ static void ibd_async_work(ibd_state_t *state) { ibd_req_t *ptr; callb_cpr_t cprinfo; mutex_enter(&state->id_acache_req_lock); CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, callb_generic_cpr, "ibd_async_work"); for (;;) { ptr = list_get_head(&state->id_req_list); if (ptr != NULL) { mutex_exit(&state->id_acache_req_lock); /* * Once we have done the operation, there is no * guarantee the request slot is going to be valid, * it might be freed up (as in ASYNC_LEAVE,REAP,TRAP). */ /* Perform the request */ switch (ptr->rq_op) { case ASYNC_GETAH: ibd_async_acache(state, &ptr->rq_mac); break; case ASYNC_REAP: ibd_async_reap_group(state, ptr->rq_ptr, ptr->rq_gid, IB_MC_JSTATE_FULL); /* * the req buf contains in mce * structure, so we do not need * to free it here. */ ptr = NULL; break; case ASYNC_LEAVE: case ASYNC_JOIN: ibd_async_multicast(state, ptr->rq_gid, ptr->rq_op); break; case ASYNC_PROMON: ibd_async_setprom(state); break; case ASYNC_PROMOFF: ibd_async_unsetprom(state); break; case ASYNC_TRAP: ibd_async_trap(state, ptr); break; case ASYNC_SCHED: ibd_async_txsched(state); break; case ASYNC_LINK: ibd_async_link(state, ptr); break; case ASYNC_EXIT: mutex_enter(&state->id_acache_req_lock); #ifndef __lock_lint CALLB_CPR_EXIT(&cprinfo); #endif /* !__lock_lint */ return; } if (ptr != NULL) kmem_cache_free(state->id_req_kmc, ptr); mutex_enter(&state->id_acache_req_lock); } else { /* * Nothing to do: wait till new request arrives. */ #ifndef __lock_lint CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&state->id_acache_req_cv, &state->id_acache_req_lock); CALLB_CPR_SAFE_END(&cprinfo, &state->id_acache_req_lock); #endif /* !_lock_lint */ } } /*NOTREACHED*/ _NOTE(NOT_REACHED) } /* * Return when it is safe to queue requests to the async daemon; primarily * for subnet trap and async event handling. Disallow requests before the * daemon is created, and when interface deinitilization starts. */ static boolean_t ibd_async_safe(ibd_state_t *state) { mutex_enter(&state->id_trap_lock); if (state->id_trap_stop) { mutex_exit(&state->id_trap_lock); return (B_FALSE); } state->id_trap_inprog++; mutex_exit(&state->id_trap_lock); return (B_TRUE); } /* * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet * trap or event handling to complete to kill the async thread and deconstruct * the mcg/ace list. */ static void ibd_async_done(ibd_state_t *state) { mutex_enter(&state->id_trap_lock); if (--state->id_trap_inprog == 0) cv_signal(&state->id_trap_cv); mutex_exit(&state->id_trap_lock); } /* * Hash functions: * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. * These operate on mac addresses input into ibd_send, but there is no * guarantee on the alignment of the ipoib_mac_t structure. */ /*ARGSUSED*/ static uint_t ibd_hash_by_id(void *hash_data, mod_hash_key_t key) { ulong_t ptraddr = (ulong_t)key; uint_t hval; /* * If the input address is 4 byte aligned, we can just dereference * it. This is most common, since IP will send in a 4 byte aligned * IP header, which implies the 24 byte IPoIB psuedo header will be * 4 byte aligned too. 
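	 * Otherwise the qpn is bcopy'd into a local variable to avoid an
	 * unaligned access; either way the returned value is then folded
	 * into one of IBD_HASH_SIZE buckets by the mod_hash framework.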
*/ if ((ptraddr & 3) == 0) return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); return (hval); } static int ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) { if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) return (0); else return (1); } /* * Initialize all the per interface caches and lists; AH cache, * MCG list etc. */ static int ibd_acache_init(ibd_state_t *state) { ibd_ace_t *ce; int i; mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); list_create(&state->id_ah_free, sizeof (ibd_ace_t), offsetof(ibd_ace_t, ac_list)); list_create(&state->id_ah_active, sizeof (ibd_ace_t), offsetof(ibd_ace_t, ac_list)); state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); list_create(&state->id_mc_full, sizeof (ibd_mce_t), offsetof(ibd_mce_t, mc_list)); list_create(&state->id_mc_non, sizeof (ibd_mce_t), offsetof(ibd_mce_t, mc_list)); list_create(&state->id_req_list, sizeof (ibd_req_t), offsetof(ibd_req_t, rq_list)); state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * IBD_NUM_AH, KM_SLEEP); for (i = 0; i < IBD_NUM_AH; i++, ce++) { if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { ibd_acache_fini(state); return (DDI_FAILURE); } else { CLEAR_REFCYCLE(ce); ce->ac_mce = NULL; IBD_ACACHE_INSERT_FREE(state, ce); } } return (DDI_SUCCESS); } static void ibd_acache_fini(ibd_state_t *state) { ibd_ace_t *ptr; mutex_enter(&state->id_ac_mutex); while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { ASSERT(GET_REF(ptr) == 0); (void) ibt_free_ud_dest(ptr->ac_dest); } while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { ASSERT(GET_REF(ptr) == 0); (void) ibt_free_ud_dest(ptr->ac_dest); } list_destroy(&state->id_ah_free); list_destroy(&state->id_ah_active); list_destroy(&state->id_mc_full); list_destroy(&state->id_mc_non); list_destroy(&state->id_req_list); kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); mutex_exit(&state->id_ac_mutex); mutex_destroy(&state->id_ac_mutex); mutex_destroy(&state->id_mc_mutex); mutex_destroy(&state->id_acache_req_lock); cv_destroy(&state->id_acache_req_cv); } /* * Search AH active hash list for a cached path to input destination. * If we are "just looking", hold == F. When we are in the Tx path, * we set hold == T to grab a reference on the AH so that it can not * be recycled to a new destination while the Tx request is posted. */ static ibd_ace_t * ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) { ibd_ace_t *ptr; ASSERT(mutex_owned(&state->id_ac_mutex)); /* * Do hash search. */ if (mod_hash_find(state->id_ah_active_hash, (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { if (hold) INC_REF(ptr, num); return (ptr); } return (NULL); } /* * This is called by the tx side; if an initialized AH is found in * the active list, it is locked down and can be used; if no entry * is found, an async request is queued to do path resolution. */ static ibd_ace_t * ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) { ibd_ace_t *ptr; ibd_req_t *req; /* * Only attempt to print when we can; in the mdt pattr case, the * address is not aligned properly. 
*/ if (((ulong_t)mac & 3) == 0) DPRINT(4, "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); mutex_enter(&state->id_ac_mutex); if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { mutex_exit(&state->id_ac_mutex); return (ptr); } /* * Implementation of a single outstanding async request; if * the operation is not started yet, queue a request and move * to ongoing state. Remember in id_ah_addr for which address * we are queueing the request, in case we need to flag an error; * Any further requests, for the same or different address, until * the operation completes, is sent back to GLDv3 to be retried. * The async thread will update id_ah_op with an error indication * or will set it to indicate the next look up can start; either * way, it will mac_tx_update() so that all blocked requests come * back here. */ *err = EAGAIN; if (state->id_ah_op == NOTSTARTED) { req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req != NULL) { /* * We did not even find the entry; queue a request * for it. */ bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); ibd_queue_work_slot(state, req, ASYNC_GETAH); state->id_ah_op = ONGOING; bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); } } else if ((state->id_ah_op != ONGOING) && (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { /* * Check the status of the pathrecord lookup request * we had queued before. */ if (state->id_ah_op == ERRORED) { *err = EFAULT; state->id_ah_error++; } else { /* * ROUTERED case: We need to send to the * all-router MCG. If we can find the AH for * the mcg, the Tx will be attempted. If we * do not find the AH, we return NORESOURCES * to retry. */ ipoib_mac_t routermac; (void) ibd_get_allroutergroup(state, mac, &routermac); ptr = ibd_acache_find(state, &routermac, B_TRUE, numwqe); } state->id_ah_op = NOTSTARTED; } else if ((state->id_ah_op != ONGOING) && (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { /* * This case can happen when we get a higher band * packet. The easiest way is to reset the state machine * to accommodate the higher priority packet. */ state->id_ah_op = NOTSTARTED; } mutex_exit(&state->id_ac_mutex); return (ptr); } /* * Grab a not-currently-in-use AH/PathRecord from the active * list to recycle to a new destination. Only the async thread * executes this code. */ static ibd_ace_t * ibd_acache_get_unref(ibd_state_t *state) { ibd_ace_t *ptr = list_head(&state->id_ah_active); ASSERT(mutex_owned(&state->id_ac_mutex)); /* * Do plain linear search. */ while (ptr != NULL) { /* * Note that it is possible that the "cycle" bit * is set on the AH w/o any reference count. The * mcg must have been deleted, and the tx cleanup * just decremented the reference count to 0, but * hasn't gotten around to grabbing the id_ac_mutex * to move the AH into the free list. */ if (GET_REF(ptr) == 0) { IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); break; } ptr = list_next(&state->id_ah_active, ptr); } return (ptr); } /* * Invoked to clean up AH from active list in case of multicast * disable and to handle sendonly memberships during mcg traps. * And for port up processing for multicast and unicast AHs. * Normally, the AH is taken off the active list, and put into * the free list to be recycled for a new destination. 
In case * Tx requests on the AH have not completed yet, the AH is marked * for reaping (which will put the AH on the free list) once the Tx's * complete; in this case, depending on the "force" input, we take * out the AH from the active list right now, or leave it also for * the reap operation. Returns TRUE if the AH is taken off the active * list (and either put into the free list right now, or arranged for * later), FALSE otherwise. */ static boolean_t ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) { ibd_ace_t *acactive; boolean_t ret = B_TRUE; ASSERT(mutex_owned(&state->id_ac_mutex)); if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { /* * Note that the AH might already have the cycle bit set * on it; this might happen if sequences of multicast * enables and disables are coming so fast, that posted * Tx's to the mcg have not completed yet, and the cycle * bit is set successively by each multicast disable. */ if (SET_CYCLE_IF_REF(acactive)) { if (!force) { /* * The ace is kept on the active list, further * Tx's can still grab a reference on it; the * ace is reaped when all pending Tx's * referencing the AH complete. */ ret = B_FALSE; } else { /* * In the mcg trap case, we always pull the * AH from the active list. And also the port * up multi/unicast case. */ IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); acactive->ac_mce = NULL; } } else { /* * Determined the ref count is 0, thus reclaim * immediately after pulling out the ace from * the active list. */ IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); acactive->ac_mce = NULL; IBD_ACACHE_INSERT_FREE(state, acactive); } } return (ret); } /* * Helper function for async path record lookup. If we are trying to * Tx to a MCG, check our membership, possibly trying to join the * group if required. If that fails, try to send the packet to the * all router group (indicated by the redirect output), pointing * the input mac address to the router mcg address. */ static ibd_mce_t * ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) { ib_gid_t mgid; ibd_mce_t *mce; ipoib_mac_t routermac; *redirect = B_FALSE; ibd_n2h_gid(mac, &mgid); /* * Check the FullMember+SendOnlyNonMember list. * Since we are the only one who manipulates the * id_mc_full list, no locks are needed. */ mce = IBD_MCACHE_FIND_FULL(state, mgid); if (mce != NULL) { DPRINT(4, "ibd_async_mcache : already joined to group"); return (mce); } /* * Not found; try to join(SendOnlyNonMember) and attach. */ DPRINT(4, "ibd_async_mcache : not joined to group"); if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != NULL) { DPRINT(4, "ibd_async_mcache : nonmem joined to group"); return (mce); } /* * MCGroup not present; try to join the all-router group. If * any of the following steps succeed, we will be redirecting * to the all router group. */ DPRINT(4, "ibd_async_mcache : nonmem join failed"); if (!ibd_get_allroutergroup(state, mac, &routermac)) return (NULL); *redirect = B_TRUE; ibd_n2h_gid(&routermac, &mgid); bcopy(&routermac, mac, IPOIB_ADDRL); DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); /* * Are we already joined to the router group? */ if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { DPRINT(4, "ibd_async_mcache : using already joined router" "group\n"); return (mce); } /* * Can we join(SendOnlyNonMember) the router group? 
*/ DPRINT(4, "ibd_async_mcache : attempting join to router grp"); if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != NULL) { DPRINT(4, "ibd_async_mcache : joined to router grp"); return (mce); } return (NULL); } /* * Async path record lookup code. */ static void ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) { ibd_ace_t *ce; ibd_mce_t *mce = NULL; ibt_path_attr_t path_attr; ibt_path_info_t path_info; ib_gid_t destgid; int ret = NOTSTARTED; DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); /* * Check whether we are trying to transmit to a MCG. * In that case, we need to make sure we are a member of * the MCG. */ if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { boolean_t redirected; /* * If we can not find or join the group or even * redirect, error out. */ if ((mce = ibd_async_mcache(state, mac, &redirected)) == NULL) { state->id_ah_op = ERRORED; return; } /* * If we got redirected, we need to determine whether * the AH for the new mcg is in the cache already, and * not pull it in then; otherwise proceed to get the * path for the new mcg. There is no guarantee that * if the AH is currently in the cache, it will still be * there when we look in ibd_acache_lookup(), but that's * okay, we will come back here. */ if (redirected) { ret = ROUTERED; DPRINT(4, "ibd_async_acache : redirected to " "%08X:%08X:%08X:%08X:%08X", htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), htonl(mac->ipoib_gidsuff[1])); mutex_enter(&state->id_ac_mutex); if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { mutex_exit(&state->id_ac_mutex); DPRINT(4, "ibd_async_acache : router AH found"); state->id_ah_op = ROUTERED; return; } mutex_exit(&state->id_ac_mutex); } } /* * Get an AH from the free list. */ mutex_enter(&state->id_ac_mutex); if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { /* * No free ones; try to grab an unreferenced active * one. Maybe we need to make the active list LRU, * but that will create more work for Tx callbacks. * Is there a way of not having to pull out the * entry from the active list, but just indicate it * is being recycled? Yes, but that creates one more * check in the fast lookup path. */ if ((ce = ibd_acache_get_unref(state)) == NULL) { /* * Pretty serious shortage now. */ state->id_ah_op = NOTSTARTED; mutex_exit(&state->id_ac_mutex); DPRINT(10, "ibd_async_acache : failed to find AH " "slot\n"); return; } /* * We could check whether ac_mce points to a SendOnly * member and drop that membership now. Or do it lazily * at detach time. */ ce->ac_mce = NULL; } mutex_exit(&state->id_ac_mutex); ASSERT(ce->ac_mce == NULL); /* * Update the entry. 
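	 * The destination GID is recovered from the mac address
	 * (ibd_n2h_gid), a path to it is resolved with ibt_get_paths(),
	 * and the result is programmed into the cached UD destination
	 * via ibt_modify_ud_dest(), using the qkey from state->id_mcinfo.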
*/ bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); bzero(&path_info, sizeof (path_info)); bzero(&path_attr, sizeof (ibt_path_attr_t)); path_attr.pa_sgid = state->id_sgid; path_attr.pa_num_dgids = 1; ibd_n2h_gid(&ce->ac_mac, &destgid); path_attr.pa_dgids = &destgid; path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); goto error; } if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, ntohl(ce->ac_mac.ipoib_qpn), &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); goto error; } /* * mce is set whenever an AH is being associated with a * MCG; this will come in handy when we leave the MCG. The * lock protects Tx fastpath from scanning the active list. */ if (mce != NULL) ce->ac_mce = mce; mutex_enter(&state->id_ac_mutex); IBD_ACACHE_INSERT_ACTIVE(state, ce); state->id_ah_op = ret; mutex_exit(&state->id_ac_mutex); return; error: /* * We might want to drop SendOnly membership here if we * joined above. The lock protects Tx callbacks inserting * into the free list. */ mutex_enter(&state->id_ac_mutex); state->id_ah_op = ERRORED; IBD_ACACHE_INSERT_FREE(state, ce); mutex_exit(&state->id_ac_mutex); } /* * While restoring port's presence on the subnet on a port up, it is possible * that the port goes down again. */ static void ibd_async_link(ibd_state_t *state, ibd_req_t *req) { ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : LINK_STATE_UP; ibd_mce_t *mce, *pmce; ibd_ace_t *ace, *pace; DPRINT(10, "ibd_async_link(): %d", opcode); /* * On a link up, revalidate the link speed/width. No point doing * this on a link down, since we will be unable to do SA operations, * defaulting to the lowest speed. Also notice that we update our * notion of speed before calling mac_link_update(), which will do * neccesary higher level notifications for speed changes. */ if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) state->id_link_speed = ibd_get_portspeed(state); _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) } /* * Do all the work required to establish our presence on * the subnet. */ if (opcode == IBD_LINK_UP_ABSENT) { /* * If in promiscuous mode ... */ if (state->id_prom_op == COMPLETED) { /* * Drop all nonmembership. */ ibd_async_unsetprom(state); /* * Then, try to regain nonmembership to all mcg's. */ ibd_async_setprom(state); } /* * Drop all sendonly membership (which also gets rid of the * AHs); try to reacquire all full membership. */ mce = list_head(&state->id_mc_full); while ((pmce = mce) != NULL) { mce = list_next(&state->id_mc_full, mce); if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) ibd_leave_group(state, pmce->mc_info.mc_adds_vect.av_dgid, IB_MC_JSTATE_SEND_ONLY_NON); else ibd_reacquire_group(state, pmce); } /* * Recycle all active AHs to free list (and if there are * pending posts, make sure they will go into the free list * once the Tx's complete). Grab the lock to prevent * concurrent Tx's as well as Tx cleanups. 
*/ mutex_enter(&state->id_ac_mutex); ace = list_head(&state->id_ah_active); while ((pace = ace) != NULL) { boolean_t cycled; ace = list_next(&state->id_ah_active, ace); mce = pace->ac_mce; cycled = ibd_acache_recycle(state, &pace->ac_mac, B_TRUE); /* * If this is for an mcg, it must be for a fullmember, * since we got rid of send-only members above when * processing the mce list. */ ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == IB_MC_JSTATE_FULL))); /* * Check if the fullmember mce needs to be torn down, * ie whether the DLPI disable has already been done. * If so, do some of the work of tx_cleanup, namely * causing leave (which will fail), detach and * mce-freeing. tx_cleanup will put the AH into free * list. The reason to duplicate some of this * tx_cleanup work is because we want to delete the * AH right now instead of waiting for tx_cleanup, to * force subsequent Tx's to reacquire an AH. */ if ((mce != NULL) && (mce->mc_fullreap)) ibd_async_reap_group(state, mce, mce->mc_info.mc_adds_vect.av_dgid, mce->mc_jstate); } mutex_exit(&state->id_ac_mutex); } /* * mac handle is guaranteed to exist since driver does ibt_close_hca() * (which stops further events from being delivered) before * mac_unreigster(). At this point, it is guaranteed that mac_register * has already been done. */ mutex_enter(&state->id_link_mutex); state->id_link_state = lstate; mac_link_update(state->id_mh, lstate); mutex_exit(&state->id_link_mutex); ibd_async_done(state); } /* * When the link is notified up, we need to do a few things, based * on the port's current p_init_type_reply claiming a reinit has been * done or not. The reinit steps are: * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify * the old Pkey and GID0 are correct. * 2. Register for mcg traps (already done by ibmf). * 3. If PreservePresenceReply indicates the SM has restored port's presence * in subnet, nothing more to do. Else go to next steps (on async daemon). * 4. Give up all sendonly memberships. * 5. Acquire all full memberships. * 6. In promiscuous mode, acquire all non memberships. * 7. Recycle all AHs to free list. */ static void ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) { ibt_hca_portinfo_t *port_infop; ibt_status_t ibt_status; uint_t psize, port_infosz; ibd_link_op_t opcode; ibd_req_t *req; /* * Do not send a request to the async daemon if it has not * yet been created or is being destroyed. If the async * daemon has not yet been created, we still need to track * last known state of the link. If this code races with the * detach path, then we are assured that the detach path has * not yet done the ibt_close_hca (which waits for all async * events to complete). If the code races with the attach path, * we need to validate the pkey/gid (in the link_up case) if * the initialization path has already set these up and created * IBTF resources based on the values. */ mutex_enter(&state->id_link_mutex); /* * If the init code in ibd_drv_init hasn't yet set up the * pkey/gid, nothing to do; that code will set the link state. 
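 *
 * As a condensed sketch, the InitTypeReply handling below reduces to
 * the following two decisions (the real code also confirms the port is
 * still ACTIVE before trusting the pkey/gid contents):
 *
 *	uint8_t itreply = port_infop->p_init_type_reply;
 *
 *	// re-verify Pkey/GID0 only when the SM did not preserve them
 *	boolean_t verify_pkey_gid =
 *	    ((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
 *	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0);
 *
 *	// if the SM restored our mcg/trap presence, a plain link up is
 *	// enough; otherwise rebuild presence on the async daemon
 *	opcode = (itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ?
 *	    IBD_LINK_UP : IBD_LINK_UP_ABSENT;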
*/ if (state->id_link_state == LINK_STATE_UNKNOWN) { mutex_exit(&state->id_link_mutex); return; } if (code == IBT_EVENT_PORT_UP) { uint8_t itreply; boolean_t badup = B_FALSE; ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, &port_infop, &psize, &port_infosz); if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { mutex_exit(&state->id_link_mutex); DPRINT(10, "ibd_link_up : failed in" " ibt_query_port()\n"); return; } /* * If the link already went down by the time the handler gets * here, give up; we can not even validate pkey/gid since those * are not valid. */ if (port_infop->p_linkstate != IBT_PORT_ACTIVE) badup = B_TRUE; itreply = port_infop->p_init_type_reply; /* * In InitTypeReply, check if NoLoadReply == * PreserveContentReply == 0, in which case, verify Pkey/GID0. */ if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && (!badup)) { /* * Check that the subnet part of GID0 has not changed. */ if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, sizeof (ib_gid_t)) != 0) badup = B_TRUE; /* * Check that Pkey/index mapping is still valid. */ if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || (port_infop->p_pkey_tbl[state->id_pkix] != state->id_pkey)) badup = B_TRUE; } /* * In InitTypeReply, if PreservePresenceReply indicates the SM * has ensured that the port's presence in mcg, traps etc is * intact, nothing more to do. */ opcode = IBD_LINK_UP_ABSENT; if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) opcode = IBD_LINK_UP; if (badup) code = IBT_ERROR_PORT_DOWN; ibt_free_portinfo(port_infop, port_infosz); } if (!ibd_async_safe(state)) { state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? LINK_STATE_UP : LINK_STATE_DOWN); mutex_exit(&state->id_link_mutex); return; } mutex_exit(&state->id_link_mutex); if (code == IBT_ERROR_PORT_DOWN) opcode = IBD_LINK_DOWN; req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_ptr = (void *)opcode; ibd_queue_work_slot(state, req, ASYNC_LINK); } /* * For the port up/down events, IBTL guarantees there will not be concurrent * invocations of the handler. IBTL might coalesce link transition events, * and not invoke the handler for _each_ up/down transition, but it will * invoke the handler with last known state */ static void ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, ibt_async_code_t code, ibt_async_event_t *event) { ibd_state_t *state = (ibd_state_t *)clnt_private; switch (code) { case IBT_ERROR_CATASTROPHIC_CHAN: ibd_print_warn(state, "catastrophic channel error"); break; case IBT_ERROR_CQ: ibd_print_warn(state, "completion queue error"); break; case IBT_ERROR_PORT_DOWN: case IBT_EVENT_PORT_UP: /* * Events will be delivered to all instances that have * done ibt_open_hca() but not yet done ibt_close_hca(). * Only need to do work for our port; IBTF will deliver * events for other ports on the hca we have ibt_open_hca'ed * too. Note that ibd_drv_init() initializes id_port before * doing ibt_open_hca(). */ ASSERT(state->id_hca_hdl == hca_hdl); if (state->id_port != event->ev_port) break; ibd_link_mod(state, code); break; case IBT_HCA_ATTACH_EVENT: case IBT_HCA_DETACH_EVENT: /* * When a new card is plugged to the system, attach_event is * invoked. Additionally, a cfgadm needs to be run to make the * card known to the system, and an ifconfig needs to be run to * plumb up any ibd interfaces on the card. 
In the case of card * unplug, a cfgadm is run that will trigger any RCM scripts to * unplumb the ibd interfaces on the card; when the card is * actually unplugged, the detach_event is invoked; * additionally, if any ibd instances are still active on the * card (eg there were no associated RCM scripts), driver's * detach routine is invoked. */ break; default: break; } } /* * Attach device to the IO framework. */ static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { mac_register_t *macp; ibd_state_t *state; int instance; int err; switch (cmd) { case DDI_ATTACH: break; case DDI_RESUME: /* This driver does not support resume */ default: return (DDI_FAILURE); } /* * Allocate soft device data structure */ instance = ddi_get_instance(dip); if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) return (DDI_FAILURE); state = ddi_get_soft_state(ibd_list, instance); /* pre ibt_attach() soft state initialization */ if (ibd_state_init(state, dip) != DDI_SUCCESS) { DPRINT(10, "ibd_attach : failed in ibd_state_init()"); goto attach_fail_state_init; } /* alloc rx soft intr */ if ((ibd_rx_softintr == 1) && ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); goto attach_fail_ddi_add_rx_softintr; } /* alloc tx soft intr */ if ((ibd_tx_softintr == 1) && ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); goto attach_fail_ddi_add_tx_softintr; } /* "attach" to IBTL */ if (ibt_attach(&ibd_clnt_modinfo, dip, state, &state->id_ibt_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_attach : failed in ibt_attach()"); goto attach_fail_ibt_attach; } /* Finish initializing this driver */ if (ibd_drv_init(state) != DDI_SUCCESS) { DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); goto attach_fail_drv_init; } /* * Initialize pointers to device specific functions which will be * used by the generic layer. */ if ((macp = mac_alloc(MAC_VERSION)) == NULL) { DPRINT(10, "ibd_attach : failed in mac_alloc()"); goto attach_fail_drv_init; } macp->m_type_ident = MAC_PLUGIN_IDENT_IB; macp->m_driver = state; macp->m_dip = state->id_dip; macp->m_src_addr = (uint8_t *)&state->id_macaddr; macp->m_callbacks = &ib_m_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; /* * Register ourselves with the GLDv3 interface */ err = mac_register(macp, &state->id_mh); mac_free(macp); if (err != 0) { DPRINT(10, "ibd_attach : failed in mac_register()"); goto attach_fail_mac_register; } /* * Setup the handler we will use for regular DLPI stuff. Its important * to setup the recv handler after registering with gldv3. */ ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); goto attach_fail_setup_handler; } /* * Setup the subnet notices handler after we initialize the a/mcaches * and start the async thread, both of which are required for the * trap handler to function properly. Enable the trap handler to * queue requests to the async thread after the mac_register, because * the async daemon invokes mac_tx_update(), which must be done after * mac_register(). 
*/ ibt_register_subnet_notices(state->id_ibt_hdl, ibd_snet_notices_handler, state); mutex_enter(&state->id_trap_lock); state->id_trap_stop = B_FALSE; mutex_exit(&state->id_trap_lock); /* * Indicate link status to GLDv3 and higher layers. By default, * we assume we are in up state (which must have been true at * least at the time the broadcast mcg's were probed); if there * were any up/down transitions till the time we come here, the * async handler will have updated last known state, which we * use to tell GLDv3. The async handler will not send any * notifications to GLDv3 till we reach here in the initialization * sequence. */ mac_link_update(state->id_mh, state->id_link_state); return (DDI_SUCCESS); /* Attach failure points, cleanup */ attach_fail_setup_handler: (void) mac_unregister(state->id_mh); attach_fail_mac_register: ibd_drv_fini(state); attach_fail_drv_init: if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) ibd_print_warn(state, "failed to free IB resources"); attach_fail_ibt_attach: if (ibd_tx_softintr == 1) ddi_remove_softintr(state->id_tx); attach_fail_ddi_add_tx_softintr: if (ibd_rx_softintr == 1) ddi_remove_softintr(state->id_rx); attach_fail_ddi_add_rx_softintr: ibd_state_fini(state); attach_fail_state_init: ddi_soft_state_free(ibd_list, instance); return (DDI_FAILURE); } /* * Detach device from the IO framework. */ static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { ibd_state_t *state; int status; int instance; switch (cmd) { case DDI_DETACH: break; case DDI_SUSPEND: default: return (DDI_FAILURE); } instance = ddi_get_instance(dip); state = ddi_get_soft_state(ibd_list, instance); /* * First, stop receive interrupts; this stops the * driver from handing up buffers to higher layers. * Wait for receive buffers to be returned; give up * after 5 seconds. */ ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); status = 50; while (state->id_rx_list.dl_bufs_outstanding > 0) { delay(drv_usectohz(100000)); if (--status == 0) { DPRINT(2, "ibd_detach : reclaiming failed"); goto failed; } } if (mac_unregister(state->id_mh) != DDI_SUCCESS) { DPRINT(10, "ibd_detach : failed in mac_unregister()"); goto failed; } if (ibd_rx_softintr == 1) ddi_remove_softintr(state->id_rx); if (ibd_tx_softintr == 1) ddi_remove_softintr(state->id_tx); ibd_drv_fini(state); if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) ibd_print_warn(state, "failed to free all IB resources at " "driver detach time"); ibd_state_fini(state); ddi_soft_state_free(ibd_list, instance); return (DDI_SUCCESS); failed: /* * Reap all the Tx/Rx completions that were posted since we * turned off the notification. Turn on notifications. There * is a race in that we do not reap completions that come in * after the poll and before notifications get turned on. That * is okay, the next rx/tx packet will trigger a completion * that will reap any missed completions. 
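 *
 * (For reference, the receive-buffer drain loop earlier in
 * ibd_detach() bounds the wait at roughly 5 seconds: 50 iterations of
 * delay(drv_usectohz(100000)), ie 50 x 100 ms, before giving up and
 * taking this failure path.)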
*/ ibd_poll_compq(state, state->id_rcq_hdl); ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); return (DDI_FAILURE); } /* * Pre ibt_attach() driver initialization */ static int ibd_state_init(ibd_state_t *state, dev_info_t *dip) { char buf[64]; mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); state->id_link_state = LINK_STATE_UNKNOWN; mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); state->id_trap_stop = B_TRUE; state->id_trap_inprog = 0; mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); state->id_dip = dip; mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); state->id_tx_list.dl_head = NULL; state->id_tx_list.dl_tail = NULL; state->id_tx_list.dl_pending_sends = B_FALSE; state->id_tx_list.dl_cnt = 0; mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); state->id_rx_list.dl_head = NULL; state->id_rx_list.dl_tail = NULL; state->id_rx_list.dl_bufs_outstanding = 0; state->id_rx_list.dl_cnt = 0; mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_rx_mutex, NULL, MUTEX_DRIVER, NULL); (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0); return (DDI_SUCCESS); } /* * Post ibt_detach() driver deconstruction */ static void ibd_state_fini(ibd_state_t *state) { mutex_destroy(&state->id_tx_list.dl_mutex); mutex_destroy(&state->id_rx_list.dl_mutex); mutex_destroy(&state->id_rx_mutex); mutex_destroy(&state->id_sched_lock); mutex_destroy(&state->id_txcomp_lock); cv_destroy(&state->id_trap_cv); mutex_destroy(&state->id_trap_lock); mutex_destroy(&state->id_link_mutex); kmem_cache_destroy(state->id_req_kmc); } /* * Fetch IBA parameters for the network device from IB nexus. */ static int ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) { /* * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. * Note that the default partition is also allowed. */ state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 0, "port-pkey", IB_PKEY_INVALID_LIMITED); if (state->id_pkey <= IB_PKEY_INVALID_FULL) { DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" "partition\n"); return (DDI_FAILURE); } /* * ... the IBA port ... */ state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 0, "port-number", 0); if (state->id_port == 0) { DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); return (DDI_FAILURE); } /* * ... and HCA GUID. */ *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 0, "hca-guid", 0); if (*hca_guid == 0) { DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " "guid\n"); return (DDI_FAILURE); } return (DDI_SUCCESS); } /* * Fetch link speed from SA for snmp ifspeed reporting. */ static uint64_t ibd_get_portspeed(ibd_state_t *state) { int ret; ibt_path_info_t path; ibt_path_attr_t path_attr; uint8_t num_paths; uint64_t ifspeed; /* * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire * translates to 2 Gbps data rate. Thus, 1X single data rate is * 2000000000. Start with that as default. */ ifspeed = 2000000000; bzero(&path_attr, sizeof (path_attr)); /* * Get the port speed from Loopback path information. 
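 *
 * The value reported is the 1X SDR data rate (2 Gbps after 8b/10b
 * encoding) scaled by a multiplier derived from the static rate of the
 * returned path; a sketch of the mapping applied below:
 *
 *	// av_srate                        multiplier   ifspeed
 *	// IBT_SRATE_2   (1X SDR)               1         2 Gbps
 *	// IBT_SRATE_5   (1X DDR)               2         4 Gbps
 *	// IBT_SRATE_10  (4X SDR)               4         8 Gbps
 *	// IBT_SRATE_20  (4X DDR or 8X SDR)     8        16 Gbps
 *	// IBT_SRATE_40  (4X QDR or 8X DDR)    16        32 Gbps
 *	// ... up to IBT_SRATE_120 (12X QDR)   48        96 Gbps
 *	// unrecognized rates fall back to the 1X multiplier
 *	ifspeed = 2000000000 * multiplier;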
*/ path_attr.pa_dgids = &state->id_sgid; path_attr.pa_num_dgids = 1; path_attr.pa_sgid = state->id_sgid; if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) goto earlydone; if (num_paths < 1) goto earlydone; /* * In case SA does not return an expected value, report the default * speed as 1X. */ ret = 1; switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ ret = 1; break; case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ ret = 4; break; case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ ret = 12; break; case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ ret = 2; break; case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ ret = 8; break; case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ ret = 16; break; case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ ret = 24; break; case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ ret = 32; break; case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ ret = 48; break; } ifspeed *= ret; earlydone: return (ifspeed); } /* * Search input mcg list (id_mc_full or id_mc_non) for an entry * representing the input mcg mgid. */ static ibd_mce_t * ibd_mcache_find(ib_gid_t mgid, struct list *mlist) { ibd_mce_t *ptr = list_head(mlist); /* * Do plain linear search. */ while (ptr != NULL) { if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, sizeof (ib_gid_t)) == 0) return (ptr); ptr = list_next(mlist, ptr); } return (NULL); } /* * Execute IBA JOIN. */ static ibt_status_t ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) { ibt_mcg_attr_t mcg_attr; bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; mcg_attr.mc_mgid = mgid; mcg_attr.mc_join_state = mce->mc_jstate; mcg_attr.mc_scope = state->id_scope; mcg_attr.mc_pkey = state->id_pkey; mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, NULL, NULL)); } /* * This code JOINs the port in the proper way (depending on the join * state) so that IBA fabric will forward mcg packets to/from the port. * It also attaches the QPN to the mcg so it can receive those mcg * packets. This code makes sure not to attach the mcg to the QP if * that has been previously done due to the mcg being joined with a * different join state, even though this is not required by SWG_0216, * refid 3610. */ static ibd_mce_t * ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) { ibt_status_t ibt_status; ibd_mce_t *mce, *tmce, *omce = NULL; boolean_t do_attach = B_TRUE; DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", jstate, mgid.gid_prefix, mgid.gid_guid); /* * For enable_multicast Full member joins, we need to do some * extra work. If there is already an mce on the list that * indicates full membership, that means the membership has * not yet been dropped (since the disable_multicast was issued) * because there are pending Tx's to the mcg; in that case, just * mark the mce not to be reaped when the Tx completion queues * an async reap operation. * * If there is already an mce on the list indicating sendonly * membership, try to promote to full membership. Be careful * not to deallocate the old mce, since there might be an AH * pointing to it; instead, update the old mce with new data * that tracks the full membership. 
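 *
 * Separately, whether the QP must be attached to the mcg is decided
 * below; the QP only ever needs one attach per mcg, regardless of how
 * many join states are held. A sketch of that decision, where
 * full_joined/non_joined stand in for the IBD_MCACHE_FIND_FULL()
 * (full-member entry) and IBD_MCACHE_FIND_NON() lookups:
 *
 *	switch (jstate) {
 *	case IB_MC_JSTATE_SEND_ONLY_NON:
 *		do_attach = B_FALSE;          // sendonly never attaches
 *		break;
 *	case IB_MC_JSTATE_NON:
 *		do_attach = !full_joined;     // already attached as full?
 *		break;
 *	case IB_MC_JSTATE_FULL:
 *		do_attach = !non_joined;      // already attached as non?
 *		break;
 *	}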
*/ if ((jstate == IB_MC_JSTATE_FULL) && ((omce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { if (omce->mc_jstate == IB_MC_JSTATE_FULL) { ASSERT(omce->mc_fullreap); omce->mc_fullreap = B_FALSE; return (omce); } else { ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); } } /* * Allocate the ibd_mce_t to track this JOIN. */ mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); mce->mc_fullreap = B_FALSE; mce->mc_jstate = jstate; if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", ibt_status); kmem_free(mce, sizeof (ibd_mce_t)); return (NULL); } /* * Is an IBA attach required? Not if the interface is already joined * to the mcg in a different appropriate join state. */ if (jstate == IB_MC_JSTATE_NON) { tmce = IBD_MCACHE_FIND_FULL(state, mgid); if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) do_attach = B_FALSE; } else if (jstate == IB_MC_JSTATE_FULL) { if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) do_attach = B_FALSE; } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ do_attach = B_FALSE; } if (do_attach) { /* * Do the IBA attach. */ DPRINT(10, "ibd_join_group : ibt_attach_mcg \n"); if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, &mce->mc_info)) != IBT_SUCCESS) { DPRINT(10, "ibd_join_group : failed qp attachment " "%d\n", ibt_status); /* * NOTE that we should probably preserve the join info * in the list and later try to leave again at detach * time. */ (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); kmem_free(mce, sizeof (ibd_mce_t)); return (NULL); } } /* * Insert the ibd_mce_t in the proper list. */ if (jstate == IB_MC_JSTATE_NON) { IBD_MCACHE_INSERT_NON(state, mce); } else { /* * Set up the mc_req fields used for reaping the * mcg in case of delayed tx completion (see * ibd_tx_cleanup()). Also done for sendonly join in * case we are promoted to fullmembership later and * keep using the same mce. */ mce->mc_req.rq_gid = mgid; mce->mc_req.rq_ptr = mce; /* * Check whether this is the case of trying to join * full member, and we were already joined send only. * We try to drop our SendOnly membership, but it is * possible that the mcg does not exist anymore (and * the subnet trap never reached us), so the leave * operation might fail. */ if (omce != NULL) { (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); omce->mc_jstate = IB_MC_JSTATE_FULL; bcopy(&mce->mc_info, &omce->mc_info, sizeof (ibt_mcg_info_t)); kmem_free(mce, sizeof (ibd_mce_t)); return (omce); } mutex_enter(&state->id_mc_mutex); IBD_MCACHE_INSERT_FULL(state, mce); mutex_exit(&state->id_mc_mutex); } return (mce); } /* * Called during port up event handling to attempt to reacquire full * membership to an mcg. Stripped down version of ibd_join_group(). * Note that it is possible that the mcg might have gone away, and * gets recreated at this point. */ static void ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) { ib_gid_t mgid; /* * If the mc_fullreap flag is set, or this join fails, a subsequent * reap/leave is going to try to leave the group. We could prevent * that by adding a boolean flag into ibd_mce_t, if required. 
*/ if (mce->mc_fullreap) return; mgid = mce->mc_info.mc_adds_vect.av_dgid; DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) ibd_print_warn(state, "Failure on port up to rejoin " "multicast gid %016llx:%016llx", (u_longlong_t)mgid.gid_prefix, (u_longlong_t)mgid.gid_guid); } /* * This code handles delayed Tx completion cleanups for mcg's to which * disable_multicast has been issued, regular mcg related cleanups during * disable_multicast, disable_promiscous and mcg traps, as well as * cleanups during driver detach time. Depending on the join state, * it deletes the mce from the appropriate list and issues the IBA * leave/detach; except in the disable_multicast case when the mce * is left on the active list for a subsequent Tx completion cleanup. */ static void ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, uint8_t jstate) { ibd_mce_t *tmce; boolean_t do_detach = B_TRUE; /* * Before detaching, we must check whether the other list * contains the mcg; if we detach blindly, the consumer * who set up the other list will also stop receiving * traffic. */ if (jstate == IB_MC_JSTATE_FULL) { /* * The following check is only relevant while coming * from the Tx completion path in the reap case. */ if (!mce->mc_fullreap) return; mutex_enter(&state->id_mc_mutex); IBD_MCACHE_PULLOUT_FULL(state, mce); mutex_exit(&state->id_mc_mutex); if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) do_detach = B_FALSE; } else if (jstate == IB_MC_JSTATE_NON) { IBD_MCACHE_PULLOUT_NON(state, mce); tmce = IBD_MCACHE_FIND_FULL(state, mgid); if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) do_detach = B_FALSE; } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ mutex_enter(&state->id_mc_mutex); IBD_MCACHE_PULLOUT_FULL(state, mce); mutex_exit(&state->id_mc_mutex); do_detach = B_FALSE; } /* * If we are reacting to a mcg trap and leaving our sendonly or * non membership, the mcg is possibly already gone, so attempting * to leave might fail. On the other hand, we must try to leave * anyway, since this might be a trap from long ago, and we could * have potentially sendonly joined to a recent incarnation of * the mcg and are about to loose track of this information. */ if (do_detach) { DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); } (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); kmem_free(mce, sizeof (ibd_mce_t)); } /* * Async code executed due to multicast and promiscuous disable requests * and mcg trap handling; also executed during driver detach. Mostly, a * leave and detach is done; except for the fullmember case when Tx * requests are pending, whence arrangements are made for subsequent * cleanup on Tx completion. */ static void ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) { ipoib_mac_t mcmac; boolean_t recycled; ibd_mce_t *mce; DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", jstate, mgid.gid_prefix, mgid.gid_guid); if (jstate == IB_MC_JSTATE_NON) { recycled = B_TRUE; mce = IBD_MCACHE_FIND_NON(state, mgid); /* * In case we are handling a mcg trap, we might not find * the mcg in the non list. 
*/ if (mce == NULL) return; } else { mce = IBD_MCACHE_FIND_FULL(state, mgid); /* * In case we are handling a mcg trap, make sure the trap * is not arriving late; if we have an mce that indicates * that we are already a fullmember, that would be a clear * indication that the trap arrived late (ie, is for a * previous incarnation of the mcg). */ if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { if ((mce == NULL) || (mce->mc_jstate == IB_MC_JSTATE_FULL)) return; ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); } else { ASSERT(jstate == IB_MC_JSTATE_FULL); /* * If join group failed, mce will be NULL here. * This is because in GLDv3 driver, set multicast * will always return success. */ if (mce == NULL) return; ASSERT(mce->mc_jstate == IB_MC_JSTATE_FULL); mce->mc_fullreap = B_TRUE; } /* * If no pending Tx's remain that reference the AH * for the mcg, recycle it from active to free list. * Else in the IB_MC_JSTATE_FULL case, just mark the AH, * so the last completing Tx will cause an async reap * operation to be invoked, at which time we will drop our * membership to the mcg so that the pending Tx's complete * successfully. Refer to comments on "AH and MCE active * list manipulation" at top of this file. The lock protects * against Tx fast path and Tx cleanup code. */ mutex_enter(&state->id_ac_mutex); ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); recycled = ibd_acache_recycle(state, &mcmac, (jstate == IB_MC_JSTATE_SEND_ONLY_NON)); mutex_exit(&state->id_ac_mutex); } if (recycled) { DPRINT(2, "ibd_leave_group : leave_group reaping : " "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); ibd_async_reap_group(state, mce, mgid, jstate); } } /* * Find the broadcast address as defined by IPoIB; implicitly * determines the IBA scope, mtu, tclass etc of the link the * interface is going to be a member of. */ static ibt_status_t ibd_find_bgroup(ibd_state_t *state) { ibt_mcg_attr_t mcg_attr; uint_t numg; uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, IB_MC_SCOPE_GLOBAL }; int i, mcgmtu; boolean_t found = B_FALSE; bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_pkey = state->id_pkey; state->id_mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK; for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { state->id_scope = mcg_attr.mc_scope = scopes[i]; /* * Look for the IPoIB broadcast group. */ state->id_mgid.gid_prefix = (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | ((uint64_t)state->id_scope << 48) | ((uint32_t)(state->id_pkey << 16))); mcg_attr.mc_mgid = state->id_mgid; if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, &state->id_mcinfo, &numg) == IBT_SUCCESS) { found = B_TRUE; break; } } if (!found) { ibd_print_warn(state, "IPoIB broadcast group absent"); return (IBT_FAILURE); } /* * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. */ mcgmtu = (128 << state->id_mcinfo->mc_mtu); if (state->id_mtu < mcgmtu) { ibd_print_warn(state, "IPoIB broadcast group MTU %d " "greater than port's maximum MTU %d", mcgmtu, state->id_mtu); return (IBT_FAILURE); } state->id_mtu = mcgmtu; return (IBT_SUCCESS); } /* * Post ibt_attach() initialization. 
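 *
 * For reference, a sketch of how ibd_find_bgroup() above composes the
 * IPoIB IPv4 broadcast MGID it probes for, and how the group's MTU
 * code is converted to bytes (the SDU later advertised to GLDv3 is
 * this MTU minus the IPoIB encapsulation header, eg 2044 on a
 * 2048-byte link assuming the usual 4-byte header):
 *
 *	// upper 64 bits become ff1<scope>:401b:<pkey>:0000
 *	mgid.gid_prefix = ((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
 *	    ((uint64_t)scope << 48) | ((uint32_t)(pkey << 16));
 *	// lower 64 bits select the all-ones IPv4 broadcast group
 *	mgid.gid_guid = IB_MCGID_IPV4_LOW_GROUP_MASK;
 *
 *	// IBA MTU code n means 128 << n bytes, so code 4 is 2048
 *	mcgmtu = 128 << mcinfo->mc_mtu;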
*/ static int ibd_drv_init(ibd_state_t *state) { kthread_t *kht; ibt_ud_chan_alloc_args_t ud_alloc_attr; ibt_ud_chan_query_attr_t ud_chan_attr; ibt_hca_portinfo_t *port_infop; ibt_hca_attr_t hca_attrs; ibt_status_t ibt_status; ibt_cq_attr_t cq_attr; ib_guid_t hca_guid; uint32_t real_size; uint32_t *ptr; char pathname[OBP_MAXPATHLEN]; uint_t psize, port_infosz; /* * Initialize id_port before ibt_open_hca because of * ordering requirements in port up/down handling. */ if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) return (DDI_FAILURE); if (ibt_open_hca(state->id_ibt_hdl, hca_guid, &state->id_hca_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); return (DDI_FAILURE); } mutex_enter(&state->id_link_mutex); ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, &port_infop, &psize, &port_infosz); if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { mutex_exit(&state->id_link_mutex); DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); (void) ibt_close_hca(state->id_hca_hdl); return (DDI_FAILURE); } /* * If the link already went down by the time we get here, give up; * we can not even get the gid since that is not valid. We would * fail in ibd_find_bgroup() anyway. */ if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { mutex_exit(&state->id_link_mutex); ibt_free_portinfo(port_infop, port_infosz); (void) ibt_close_hca(state->id_hca_hdl); ibd_print_warn(state, "Port is not active"); return (DDI_FAILURE); } /* * This verifies the Pkey ibnexus handed us is still valid. * This is also the point from which the pkey table for the * port must hold the exact pkey value at the exact index * across port up/downs. */ if (ibt_pkey2index(state->id_hca_hdl, state->id_port, state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { mutex_exit(&state->id_link_mutex); ibt_free_portinfo(port_infop, port_infosz); DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); (void) ibt_close_hca(state->id_hca_hdl); return (DDI_FAILURE); } state->id_mtu = (128 << port_infop->p_mtu); state->id_sgid = *port_infop->p_sgid_tbl; state->id_link_state = LINK_STATE_UP; mutex_exit(&state->id_link_mutex); ibt_free_portinfo(port_infop, port_infosz); state->id_link_speed = ibd_get_portspeed(state); ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); ASSERT(ibt_status == IBT_SUCCESS); if (ibd_find_bgroup(state) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); goto drv_init_fail_find_bgroup; } if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, &state->id_pd_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); goto drv_init_fail_alloc_pd; } /* Initialize the parallel ARP cache and AHs */ if (ibd_acache_init(state) != DDI_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); goto drv_init_fail_acache; } /* * Check various tunable limits. */ if (hca_attrs.hca_max_sgl < IBD_MAX_SQSEG) { ibd_print_warn(state, "Setting #sgl = %d instead of default %d", hca_attrs.hca_max_sgl, IBD_MAX_SQSEG); state->id_max_sqseg = hca_attrs.hca_max_sgl; } else { state->id_max_sqseg = IBD_MAX_SQSEG; } /* * First, check #r/s wqes against max channel size. 
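 *
 * In sketch form, the sizing below first clamps the tunables to what
 * the channel supports, then (in the separate-CQ case) sizes each CQ
 * one entry larger than its work queue, shrinking the work queue again
 * if the HCA cannot provide a CQ that large:
 *
 *	num_rwqe = MIN(ibd_num_rwqe, hca_max_chan_sz);
 *	num_swqe = MIN(ibd_num_swqe, hca_max_chan_sz);
 *
 *	rcq_size = MIN(num_rwqe + 1, hca_max_cq_sz);
 *	num_rwqe = rcq_size - 1;        // can only shrink, never grow
 *	scq_size = MIN(num_swqe + 1, hca_max_cq_sz);
 *	num_swqe = scq_size - 1;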
*/ if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) state->id_num_rwqe = hca_attrs.hca_max_chan_sz; else state->id_num_rwqe = IBD_NUM_RWQE; if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) state->id_num_swqe = hca_attrs.hca_max_chan_sz; else state->id_num_swqe = IBD_NUM_SWQE; /* * Allocate Rx/combined CQ: * Theoretically, there is no point in having more than #rwqe * plus #swqe cqe's, except that the CQ will be signalled for * overflow when the last wqe completes, if none of the previous * cqe's have been polled. Thus, we allocate just a few less wqe's * to make sure such overflow does not occur. */ cq_attr.cq_sched = NULL; cq_attr.cq_flags = IBT_CQ_NO_FLAGS; if (ibd_separate_cqs == 1) { /* * Allocate Receive CQ. */ if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { cq_attr.cq_size = state->id_num_rwqe + 1; } else { cq_attr.cq_size = hca_attrs.hca_max_cq_sz; state->id_num_rwqe = cq_attr.cq_size - 1; } if (state->id_num_rwqe < IBD_RX_THRESHOLD) { ibd_print_warn(state, "Computed #rwqe %d based on " "requested size and supportable CQ size is less " "than the required threshold %d", state->id_num_rwqe, IBD_RX_THRESHOLD); goto drv_init_fail_min_rwqes; } if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_rcq; } state->id_rxwcs_size = state->id_num_rwqe + 1; state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * state->id_rxwcs_size, KM_SLEEP); /* * Allocate Send CQ. */ if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { cq_attr.cq_size = state->id_num_swqe + 1; } else { cq_attr.cq_size = hca_attrs.hca_max_cq_sz; state->id_num_swqe = cq_attr.cq_size - 1; } if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_scq; } state->id_txwcs_size = state->id_num_swqe + 1; state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * state->id_txwcs_size, KM_SLEEP); } else { /* * Allocate combined Send/Receive CQ. */ if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + state->id_num_swqe + 1)) { cq_attr.cq_size = state->id_num_rwqe + state->id_num_swqe + 1; } else { cq_attr.cq_size = hca_attrs.hca_max_cq_sz; state->id_num_rwqe = ((cq_attr.cq_size - 1) * state->id_num_rwqe) / (state->id_num_rwqe + state->id_num_swqe); state->id_num_swqe = cq_attr.cq_size - 1 - state->id_num_rwqe; } if (state->id_num_rwqe < IBD_RX_THRESHOLD) { ibd_print_warn(state, "Computed #rwqe %d based on " "requested size and supportable CQ size is less " "than the required threshold %d", state->id_num_rwqe, IBD_RX_THRESHOLD); goto drv_init_fail_min_rwqes; } state->id_rxwcs_size = cq_attr.cq_size; state->id_txwcs_size = state->id_rxwcs_size; if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_rcq; } state->id_scq_hdl = state->id_rcq_hdl; state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * state->id_rxwcs_size, KM_SLEEP); state->id_txwcs = state->id_rxwcs; } /* * Print message in case we could not allocate as many wqe's * as was requested. Note that in the combined CQ case, we will * get the following message. 
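 *
 * Worked example of the combined-CQ split above, assuming the default
 * 511/511 tunables and a hypothetical HCA limit of 769 CQ entries:
 *
 *	cq_size  = MIN(511 + 511 + 1, 769)          = 769
 *	num_rwqe = ((769 - 1) * 511) / (511 + 511)  = 384
 *	num_swqe = (769 - 1) - 384                  = 384
 *
 * ie both work queues are scaled down proportionally while the single
 * CQ keeps one spare entry, and the warnings below then report the
 * reduced counts.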
*/ if (state->id_num_rwqe != IBD_NUM_RWQE) ibd_print_warn(state, "Setting #rwqe = %d instead of default " "%d", state->id_num_rwqe, IBD_NUM_RWQE); if (state->id_num_swqe != IBD_NUM_SWQE) ibd_print_warn(state, "Setting #swqe = %d instead of default " "%d", state->id_num_swqe, IBD_NUM_SWQE); ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; ud_alloc_attr.ud_hca_port_num = state->id_port; ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; ud_alloc_attr.ud_scq = state->id_scq_hdl; ud_alloc_attr.ud_rcq = state->id_rcq_hdl; ud_alloc_attr.ud_pd = state->id_pd_hdl; ud_alloc_attr.ud_pkey_ix = state->id_pkix; ud_alloc_attr.ud_clone_chan = NULL; if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" "\n"); goto drv_init_fail_alloc_chan; } if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != DDI_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); goto drv_init_fail_query_chan; } state->id_qpnum = ud_chan_attr.ud_qpn; /* Initialize the Transmit buffer list */ if (ibd_init_txlist(state) != DDI_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); goto drv_init_fail_txlist_init; } if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { /* Setup the handler we will use for regular DLPI stuff */ ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in" " ibt_enable_cq_notify()\n"); goto drv_init_fail_cq_notify; } } /* Create the service fifos before we start receiving */ if ((state->id_fifos = map_rx_srv_fifos(&state->id_nfifos, state)) == NULL) { DPRINT(10, "ibd_drv_init : failed in map_rx_srv_fifos()\n"); goto drv_init_fail_srv_fifo; } /* Initialize the Receive buffer list */ if (ibd_init_rxlist(state) != DDI_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); goto drv_init_fail_rxlist_init; } /* Join to IPoIB broadcast group as required by IPoIB */ if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); goto drv_init_fail_join_group; } /* Create the async thread */ if ((kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, TS_RUN, minclsyspri)) == NULL) { /* Do we have to specially leave the group? */ DPRINT(10, "ibd_drv_init : failed in thread_create\n"); goto drv_init_fail_thread_create; } state->id_async_thrid = kht->t_did; /* * The local mac address is now known. Create the IPoIB * address. */ ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, state->id_sgid.gid_prefix, state->id_sgid.gid_guid); /* * Similarly, program in the broadcast mac address. 
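 *
 * Roughly, the 20-byte IPoIB hardware address handed to GLDv3 is the
 * UD QPN followed by the port GID, stored in network order; a sketch
 * of the layout (not the literal ibd_h2n_mac() helper):
 *
 *	mac.ipoib_qpn        = htonl(state->id_qpnum);
 *	mac.ipoib_gidpref[0] = htonl((uint32_t)(gid_prefix >> 32));
 *	mac.ipoib_gidpref[1] = htonl((uint32_t)gid_prefix);
 *	mac.ipoib_gidsuff[0] = htonl((uint32_t)(gid_guid >> 32));
 *	mac.ipoib_gidsuff[1] = htonl((uint32_t)gid_guid);
 *
 * The broadcast address programmed just below substitutes IB_QPN_MASK
 * for the QPN and the broadcast group MGID for the port GID.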
*/ ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, state->id_mgid.gid_guid); ptr = (uint32_t *)&state->id_macaddr; DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); ptr = (uint32_t *)&state->id_bcaddr; DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", state->id_pkey, state->id_mgid.gid_prefix, state->id_mgid.gid_guid); DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", state->id_sgid.gid_prefix, state->id_sgid.gid_guid); DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); (void) ddi_pathname(state->id_dip, pathname); DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); return (DDI_SUCCESS); drv_init_fail_thread_create: ibd_leave_group(state, state->id_mgid, IB_MC_JSTATE_FULL); drv_init_fail_join_group: ibd_fini_rxlist(state); drv_init_fail_rxlist_init: unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); drv_init_fail_srv_fifo: drv_init_fail_cq_notify: ibd_fini_txlist(state); drv_init_fail_txlist_init: drv_init_fail_query_chan: if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); drv_init_fail_alloc_chan: if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != IBT_SUCCESS)) DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); if (ibd_separate_cqs == 1) kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * state->id_txwcs_size); drv_init_fail_alloc_scq: if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); drv_init_fail_min_rwqes: drv_init_fail_alloc_rcq: ibd_acache_fini(state); drv_init_fail_acache: if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); drv_init_fail_alloc_pd: ibt_free_mcg_info(state->id_mcinfo, 1); drv_init_fail_find_bgroup: if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); return (DDI_FAILURE); } /* * Allocate the statically allocated Tx buffer list. */ static int ibd_init_txlist(ibd_state_t *state) { ibd_swqe_t *swqe; int i; for (i = 0; i < state->id_num_swqe; i++) { if (ibd_alloc_swqe(state, &swqe) != DDI_SUCCESS) { DPRINT(10, "ibd_init_txlist : failed in " "ibd_alloc_swqe()\n"); ibd_fini_txlist(state); return (DDI_FAILURE); } /* add to list */ state->id_tx_list.dl_cnt++; if (state->id_tx_list.dl_head == NULL) { swqe->swqe_prev = NULL; swqe->swqe_next = NULL; state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); } else { swqe->swqe_prev = state->id_tx_list.dl_tail; swqe->swqe_next = NULL; state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); } } return (DDI_SUCCESS); } /* * Free the statically allocated Tx buffer list. 
*/ static void ibd_fini_txlist(ibd_state_t *state) { ibd_swqe_t *node; mutex_enter(&state->id_tx_list.dl_mutex); while (state->id_tx_list.dl_head != NULL) { node = WQE_TO_SWQE(state->id_tx_list.dl_head); state->id_tx_list.dl_head = node->swqe_next; state->id_tx_list.dl_cnt--; ASSERT(state->id_tx_list.dl_cnt >= 0); ibd_free_swqe(state, node); } mutex_exit(&state->id_tx_list.dl_mutex); } /* * Allocate a single send wqe and register it so it is almost * ready to be posted to the hardware. */ static int ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) { ibt_mr_attr_t mem_attr; ibd_swqe_t *swqe; swqe = kmem_alloc(sizeof (ibd_swqe_t), KM_SLEEP); *wqe = swqe; swqe->swqe_type = IBD_WQE_SEND; swqe->swqe_next = NULL; swqe->swqe_prev = NULL; swqe->swqe_im_mblk = NULL; /* alloc copy buffer, must be max size to handle multiple mblk case */ swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); mem_attr.mr_vaddr = (uint64_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; mem_attr.mr_len = state->id_mtu; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_SLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &swqe->swqe_copybuf.ic_mr_hdl, &swqe->swqe_copybuf.ic_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_swqe : failed in ibt_register_mem()"); kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); kmem_free(swqe, sizeof (ibd_swqe_t)); return (DDI_FAILURE); } swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)swqe->swqe_copybuf.ic_bufaddr; swqe->swqe_copybuf.ic_sgl.ds_key = swqe->swqe_copybuf.ic_mr_desc.md_lkey; swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; swqe->w_swr.wr_trans = IBT_UD_SRV; swqe->w_swr.wr_opcode = IBT_WRC_SEND; /* These are set in send */ swqe->w_swr.wr_nds = 0; swqe->w_swr.wr_sgl = NULL; return (DDI_SUCCESS); } /* * Free an allocated send wqe. */ static void ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) { if (ibt_deregister_mr(state->id_hca_hdl, swqe->swqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_free_swqe : failed in ibt_deregister_mem()"); return; } kmem_free(swqe->swqe_copybuf.ic_bufaddr, state->id_mtu); kmem_free(swqe, sizeof (ibd_swqe_t)); } /* * Post a rwqe to the hardware and add it to the Rx list. The * "recycle" parameter indicates whether an old rwqe is being * recycled, or this is a new one. */ static int ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) { /* * Here we should add dl_cnt before post recv, because we would * have to make sure dl_cnt has already updated before * corresponding ibd_process_rx() is called. */ atomic_add_32(&state->id_rx_list.dl_cnt, 1); if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != IBT_SUCCESS) { (void) atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); return (DDI_FAILURE); } /* * Buffers being recycled are already in the list. */ if (recycle) return (DDI_SUCCESS); mutex_enter(&state->id_rx_list.dl_mutex); if (state->id_rx_list.dl_head == NULL) { rwqe->rwqe_prev = NULL; rwqe->rwqe_next = NULL; state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); } else { rwqe->rwqe_prev = state->id_rx_list.dl_tail; rwqe->rwqe_next = NULL; state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); } mutex_exit(&state->id_rx_list.dl_mutex); return (DDI_SUCCESS); } /* * Allocate the statically allocated Rx buffer list. 
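 *
 * Each receive WQE allocated below follows the same pattern; in
 * outline, with names abbreviated and the error/teardown handling
 * (w_freeing_wqe) omitted:
 *
 *	buf = kmem_alloc(mtu + IPOIB_GRH_SIZE, KM_NOSLEEP);
 *	// the free callback lets ibd_freemsg_cb() re-arm this rwqe once
 *	// the stack frees the mblk that was loaned upstream
 *	mp = desballoc(buf, mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
 *	(void) ibt_register_mr(hca_hdl, pd_hdl, &mem_attr, &mr_hdl,
 *	    &mr_desc);
 *	sgl.ds_va  = (ib_vaddr_t)(uintptr_t)buf;
 *	sgl.ds_key = mr_desc.md_lkey;
 *	sgl.ds_len = mtu + IPOIB_GRH_SIZE;
 *	wr.wr_id   = (ibt_wrid_t)(uintptr_t)rwqe;
 *	wr.wr_nds  = 1;
 *	wr.wr_sgl  = &sgl;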
*/ static int ibd_init_rxlist(ibd_state_t *state) { ibd_rwqe_t *rwqe; int i; for (i = 0; i < state->id_num_rwqe; i++) { if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { ibd_fini_rxlist(state); return (DDI_FAILURE); } if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { ibd_free_rwqe(state, rwqe); ibd_fini_rxlist(state); return (DDI_FAILURE); } } return (DDI_SUCCESS); } /* * Free the statically allocated Rx buffer list. * */ static void ibd_fini_rxlist(ibd_state_t *state) { ibd_rwqe_t *node; mutex_enter(&state->id_rx_list.dl_mutex); while (state->id_rx_list.dl_head != NULL) { node = WQE_TO_RWQE(state->id_rx_list.dl_head); state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; state->id_rx_list.dl_cnt--; ASSERT(state->id_rx_list.dl_cnt >= 0); ibd_free_rwqe(state, node); } mutex_exit(&state->id_rx_list.dl_mutex); } /* * Allocate a single recv wqe and register it so it is almost * ready to be posted to the hardware. */ static int ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) { ibt_mr_attr_t mem_attr; ibd_rwqe_t *rwqe; if ((rwqe = kmem_alloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); return (DDI_FAILURE); } *wqe = rwqe; rwqe->rwqe_type = IBD_WQE_RECV; rwqe->w_state = state; rwqe->rwqe_next = NULL; rwqe->rwqe_prev = NULL; rwqe->w_freeing_wqe = B_FALSE; rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; rwqe->w_freemsg_cb.free_arg = (char *)rwqe; if ((rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + IPOIB_GRH_SIZE, KM_NOSLEEP)) == NULL) { DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc2"); kmem_free(rwqe, sizeof (ibd_rwqe_t)); return (DDI_FAILURE); } if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == NULL) { DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE); kmem_free(rwqe, sizeof (ibd_rwqe_t)); return (DDI_FAILURE); } mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); rwqe->w_freeing_wqe = B_TRUE; freemsg(rwqe->rwqe_im_mblk); kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE); kmem_free(rwqe, sizeof (ibd_rwqe_t)); return (DDI_FAILURE); } rwqe->rwqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; rwqe->rwqe_copybuf.ic_sgl.ds_key = rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; rwqe->w_rwr.wr_nds = 1; rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; return (DDI_SUCCESS); } /* * Free an allocated recv wqe. */ static void ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) { if (ibt_deregister_mr(state->id_hca_hdl, rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { DPRINT(10, "ibd_free_rwqe : failed in ibt_deregister_mr()"); return; } /* * Indicate to the callback function that this rwqe/mblk * should not be recycled. The freemsg() will invoke * ibd_freemsg_cb(). 
*/ if (rwqe->rwqe_im_mblk != NULL) { rwqe->w_freeing_wqe = B_TRUE; freemsg(rwqe->rwqe_im_mblk); } kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE); kmem_free(rwqe, sizeof (ibd_rwqe_t)); } /* * Delete the rwqe being freed from the rx list. */ static void ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) { mutex_enter(&state->id_rx_list.dl_mutex); if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) state->id_rx_list.dl_head = rwqe->rwqe_next; else rwqe->rwqe_prev->w_next = rwqe->rwqe_next; if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) state->id_rx_list.dl_tail = rwqe->rwqe_prev; else rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; mutex_exit(&state->id_rx_list.dl_mutex); } /* * Pre ibt_detach() deconstruction. */ static void ibd_drv_fini(ibd_state_t *state) { ib_gid_t mgid; ibd_mce_t *mce; ibt_status_t status; uint8_t jstate; /* * Desubscribe from trap notices; we will be tearing down * the mcg lists soon. Make sure the trap handler does nothing * even if it is invoked (ie till we invoke ibt_detach()). */ ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); mutex_enter(&state->id_trap_lock); state->id_trap_stop = B_TRUE; while (state->id_trap_inprog > 0) cv_wait(&state->id_trap_cv, &state->id_trap_lock); mutex_exit(&state->id_trap_lock); /* * Flushing the channel ensures that all pending WQE's * are marked with flush_error and handed to the CQ. It * does not guarantee the invocation of the CQ handler. * This call is guaranteed to return successfully for UD QPNs. */ status = ibt_flush_channel(state->id_chnl_hdl); ASSERT(status == IBT_SUCCESS); /* * We possibly need a loop here to wait for all the Tx * callbacks to happen. The Tx handlers will retrieve * held resources like AH ac_ref count, registered memory * and possibly ASYNC_REAP requests. Rx interrupts were already * turned off (in ibd_detach()); turn off Tx interrupts and * poll. By the time the polling returns an empty indicator, * we are sure we have seen all pending Tx callbacks. Note * that after the ibt_set_cq_handler() returns, the old handler * is guaranteed not to be invoked anymore. */ if (ibd_separate_cqs == 1) ibt_set_cq_handler(state->id_scq_hdl, 0, 0); ibd_poll_compq(state, state->id_scq_hdl); /* * No more async requests will be posted since the device has been * unregistered; completion handlers have been turned off, so Tx * handler will not cause any more ASYNC_REAP requests. Queue a * request for the async thread to exit, which will be serviced * after any pending ones. This can take a while, specially if the * SM is unreachable, since IBMF will slowly timeout each SM request * issued by the async thread. Reap the thread before continuing on, * we do not want it to be lingering in modunloaded code. */ ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_EXIT); thread_join(state->id_async_thrid); /* * We can not be in promiscuous mode anymore, upper layers * would have made a request to disable it (if ever set previously) * before the detach is allowed to progress to this point; and the * aysnc thread would have processed that request by now. Thus the * nonmember list is guaranteed empty at this point. */ ASSERT(state->id_prom_op != COMPLETED); /* * Drop all residual full/non membership. This includes full * membership to the broadcast group, and any nonmembership * acquired during transmits. We do this after the Tx completion * handlers are done, since those might result in some late * leaves; this also eliminates a potential race with that * path wrt the mc full list insert/delete. 
Trap handling * has also been suppressed at this point. Thus, no locks * are required while traversing the mc full list. */ DPRINT(2, "ibd_drv_fini : clear full cache entries"); mce = list_head(&state->id_mc_full); while (mce != NULL) { mgid = mce->mc_info.mc_adds_vect.av_dgid; jstate = mce->mc_jstate; mce = list_next(&state->id_mc_full, mce); ibd_leave_group(state, mgid, jstate); } ibt_free_mcg_info(state->id_mcinfo, 1); /* * Kill the channel now; guaranteed to return successfully * for UD QPNs. */ status = ibt_free_channel(state->id_chnl_hdl); ASSERT(status == IBT_SUCCESS); /* * Kill the CQ; all completion handlers are guaranteed to * have terminated by the time this returns. Since we killed * the QPN above, we can not receive the IBT_CQ_BUSY error. */ status = ibt_free_cq(state->id_rcq_hdl); ASSERT(status == IBT_SUCCESS); kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); if (ibd_separate_cqs == 1) { status = ibt_free_cq(state->id_scq_hdl); ASSERT(status == IBT_SUCCESS); kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * state->id_txwcs_size); } /* * We killed the receive interrupts, thus, we will not be * required to handle received packets anymore. Thus, kill * service threads since they are not going to be used anymore. */ unmap_rx_srv_fifos(state->id_nfifos, state->id_fifos); /* * Since these following will act on the Rx/Tx list, which * is also looked at by the Rx/Tx handlers, keep them around * till all handlers are guaranteed to have completed. */ ibd_fini_rxlist(state); ibd_fini_txlist(state); /* * Clean up the active AH hash list. */ mod_hash_destroy_hash(state->id_ah_active_hash); /* * Free parallel ARP cache and AHs; we are sure all of these * resources have been released by the Tx completion handler. */ ibd_acache_fini(state); /* * We freed the QPN, all the MRs and AHs. This step should not * fail; print a warning message if it does fail, due to a bug * in the driver. */ if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) ibd_print_warn(state, "failed to free protection domain"); if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) ibd_print_warn(state, "failed to close HCA device"); } /* * IBA Rx/Tx completion queue handler. Guaranteed to be single * threaded and nonreentrant for this CQ. When using combined CQ, * this handles Tx and Rx completions. With separate CQs, this handles * only Rx completions. */ /* ARGSUSED */ static void ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_state_t *state = (ibd_state_t *)arg; atomic_add_64(&state->id_num_intrs, 1); if (ibd_rx_softintr == 1) ddi_trigger_softintr(state->id_rx); else (void) ibd_intr((char *)state); } /* * Separate CQ handler for Tx completions, when the Tx CQ is in * interrupt driven mode. */ /* ARGSUSED */ static void ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_state_t *state = (ibd_state_t *)arg; atomic_add_64(&state->id_num_intrs, 1); if (ibd_tx_softintr == 1) ddi_trigger_softintr(state->id_tx); else (void) ibd_tx_recycle((char *)state); } /* * Multicast group create/delete trap handler. These will be delivered * on a kernel thread (handling can thus block) and can be invoked * concurrently. The handler can be invoked anytime after it is * registered and before ibt_detach(). */ /* ARGSUSED */ static void ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, ibt_subnet_event_t *event) { ibd_state_t *state = (ibd_state_t *)arg; ibd_req_t *req; /* * The trap handler will get invoked once for every event for * evert port. 
The input "gid" is the GID0 of the port the * trap came in on; we just need to act on traps that came * to our port, meaning the port on which the ipoib interface * resides. Since ipoib uses GID0 of the port, we just match * the gids to check whether we need to handle the trap. */ if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) return; DPRINT(10, "ibd_notices_handler : %d\n", code); switch (code) { case IBT_SM_EVENT_UNAVAILABLE: /* * If we are in promiscuous mode or have * sendnonmembers, we need to print a warning * message right now. Else, just store the * information, print when we enter promiscuous * mode or attempt nonmember send. We might * also want to stop caching sendnonmember. */ ibd_print_warn(state, "IBA multicast support " "degraded due to unavailability of multicast " "traps"); break; case IBT_SM_EVENT_AVAILABLE: /* * If we printed a warning message above or * while trying to nonmember send or get into * promiscuous mode, print an okay message. */ ibd_print_warn(state, "IBA multicast support " "restored due to availability of multicast " "traps"); break; case IBT_SM_EVENT_MCG_CREATED: case IBT_SM_EVENT_MCG_DELETED: /* * Common processing of creation/deletion traps. * First check if the instance is being * [de]initialized; back off then, without doing * anything more, since we are not sure if the * async thread is around, or whether we might * be racing with the detach code in ibd_drv_fini() * that scans the mcg list. */ if (!ibd_async_safe(state)) return; req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_gid = event->sm_notice_gid; req->rq_ptr = (void *)code; ibd_queue_work_slot(state, req, ASYNC_TRAP); break; } } static void ibd_async_trap(ibd_state_t *state, ibd_req_t *req) { ib_gid_t mgid = req->rq_gid; ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; DPRINT(10, "ibd_async_trap : %d\n", code); /* * Atomically search the nonmember and sendonlymember lists and * delete. */ ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); if (state->id_prom_op == COMPLETED) { ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); /* * If in promiscuous mode, try to join/attach to the new * mcg. Given the unreliable out-of-order mode of trap * delivery, we can never be sure whether it is a problem * if the join fails. Thus, we warn the admin of a failure * if this was a creation trap. Note that the trap might * actually be reporting a long past event, and the mcg * might already have been deleted, thus we might be warning * in vain. */ if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) ibd_print_warn(state, "IBA promiscuous mode missed " "new multicast gid %016llx:%016llx", (u_longlong_t)mgid.gid_prefix, (u_longlong_t)mgid.gid_guid); } /* * Free the request slot allocated by the subnet event thread. */ ibd_async_done(state); } /* * GLDv3 entry point to get capabilities. */ static boolean_t ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { _NOTE(ARGUNUSED(arg)); switch (cap) { case MAC_CAPAB_HCKSUM: { uint32_t *txflags = cap_data; if (ibd_csum_send > IBD_CSUM_NONE) *txflags = HCKSUM_INET_PARTIAL; else return (B_FALSE); break; } default: return (B_FALSE); } return (B_TRUE); } /* * GLDv3 entry point to start hardware. */ /* ARGSUSED */ static int ibd_m_start(void *arg) { return (0); } /* * GLDv3 entry point to stop hardware from receiving packets. 
*/ /* ARGSUSED */ static void ibd_m_stop(void *arg) { #ifdef RUN_PERFORMANCE ibd_perf((ibd_state_t *)arg); #endif } /* * GLDv3 entry point to modify device's mac address. We do not * allow address modifications. */ static int ibd_m_unicst(void *arg, const uint8_t *macaddr) { ibd_state_t *state; state = (ibd_state_t *)arg; if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) return (0); else return (EINVAL); } /* * The blocking part of the IBA join/leave operations are done out * of here on the async thread. */ static void ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) { DPRINT(3, "ibd_async_multicast : async_setmc op %d :" "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); if (op == ASYNC_JOIN) { if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { ibd_print_warn(state, "Join multicast group failed :" "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); } } else { /* * Here, we must search for the proper mcg_info and * use that to leave the group. */ ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); } } /* * GLDv3 entry point for multicast enable/disable requests. * This function queues the operation to the async thread and * returns success for a valid multicast address. */ static int ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) { ibd_state_t *state = (ibd_state_t *)arg; ipoib_mac_t maddr, *mcast; ib_gid_t mgid; ibd_req_t *req; /* * The incoming multicast address might not be aligned properly * on a 4 byte boundary to be considered an ipoib_mac_t. We force * it to look like one though, to get the offsets of the mc gid, * since we know we are not going to dereference any values with * the ipoib_mac_t pointer. */ bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); mcast = &maddr; /* * Check validity of MCG address. We could additionally check * that an enable/disable is not being issued on the "broadcast" * mcg, but since this operation is only invocable by privileged * programs anyway, we allow the flexibility to those dlpi apps. * Note that we do not validate the "scope" of the IBA mcg. */ if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) return (EINVAL); /* * fill in multicast pkey and scope */ IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); /* * If someone is trying to JOIN/LEAVE the broadcast group, we do * nothing (ie we stay JOINed to the broadcast group done in * ibd_drv_init()), to mimic Ethernet behavior. IPv4 specifically * requires us to be joined to broadcast groups at all times. * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also * depends on this. */ if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) return (0); ibd_n2h_gid(mcast, &mgid); req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (ENOMEM); req->rq_gid = mgid; if (add) { DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); ibd_queue_work_slot(state, req, ASYNC_JOIN); } else { DPRINT(1, "ibd_m_multicst : unset_multicast : " "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); ibd_queue_work_slot(state, req, ASYNC_LEAVE); } return (0); } /* * The blocking part of the IBA promiscuous operations are done * out of here on the async thread. The dlpireq parameter indicates * whether this invocation is due to a dlpi request or due to * a port up/down event.
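 * ibd_async_unsetprom() below simply walks the nonmember list and
 * leaves every group on it; its counterpart ibd_async_setprom()
 * re-queries the fabric and joins all active groups as NonMember.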
*/ static void ibd_async_unsetprom(ibd_state_t *state) { ibd_mce_t *mce = list_head(&state->id_mc_non); ib_gid_t mgid; DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); while (mce != NULL) { mgid = mce->mc_info.mc_adds_vect.av_dgid; mce = list_next(&state->id_mc_non, mce); ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); } state->id_prom_op = NOTSTARTED; } /* * The blocking part of the IBA promiscuous operations are done * out of here on the async thread. The dlpireq parameter indicates * whether this invocation is due to a dlpi request or due to * a port up/down event. */ static void ibd_async_setprom(ibd_state_t *state) { ibt_mcg_attr_t mcg_attr; ibt_mcg_info_t *mcg_info; ib_gid_t mgid; uint_t numg; int i, ret = COMPLETED; DPRINT(2, "ibd_async_setprom : async_set_promisc"); /* * Obtain all active MC groups on the IB fabric with * specified criteria (scope + Pkey + Qkey + mtu). */ bzero(&mcg_attr, sizeof (mcg_attr)); mcg_attr.mc_pkey = state->id_pkey; mcg_attr.mc_scope = state->id_scope; mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; mcg_attr.mc_mtu_req.r_selector = IBT_EQU; if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != IBT_SUCCESS) { ibd_print_warn(state, "Could not get list of IBA multicast " "groups"); ret = ERRORED; goto done; } /* * Iterate over the returned mcg's and join as NonMember * to the IP mcg's. */ for (i = 0; i < numg; i++) { /* * Do a NonMember JOIN on the MC group. */ mgid = mcg_info[i].mc_adds_vect.av_dgid; if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) ibd_print_warn(state, "IBA promiscuous mode missed " "multicast gid %016llx:%016llx", (u_longlong_t)mgid.gid_prefix, (u_longlong_t)mgid.gid_guid); } ibt_free_mcg_info(mcg_info, numg); DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); done: state->id_prom_op = ret; } /* * GLDv3 entry point for multicast promiscuous enable/disable requests. * GLDv3 assumes phys state receives more packets than multi state, * which is not true for IPoIB. Thus, treat the multi and phys * promiscuous states the same way to work with GLDv3's assumption. */ static int ibd_m_promisc(void *arg, boolean_t on) { ibd_state_t *state = (ibd_state_t *)arg; ibd_req_t *req; req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req == NULL) return (ENOMEM); if (on) { DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); ibd_queue_work_slot(state, req, ASYNC_PROMON); } else { DPRINT(1, "ibd_m_promisc : unset_promisc"); ibd_queue_work_slot(state, req, ASYNC_PROMOFF); } return (0); } /* * GLDv3 entry point for gathering statistics. 
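 * Most of these counters are updated on the data path with atomic
 * 64-bit operations, so they are simply read here without taking
 * any locks.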
*/ static int ibd_m_stat(void *arg, uint_t stat, uint64_t *val) { ibd_state_t *state = (ibd_state_t *)arg; switch (stat) { case MAC_STAT_IFSPEED: *val = state->id_link_speed; break; case MAC_STAT_MULTIRCV: *val = state->id_multi_rcv; break; case MAC_STAT_BRDCSTRCV: *val = state->id_brd_rcv; break; case MAC_STAT_MULTIXMT: *val = state->id_multi_xmt; break; case MAC_STAT_BRDCSTXMT: *val = state->id_brd_xmt; break; case MAC_STAT_RBYTES: *val = state->id_recv_bytes; break; case MAC_STAT_IPACKETS: *val = state->id_rcv_pkt; break; case MAC_STAT_OBYTES: *val = state->id_xmt_bytes; break; case MAC_STAT_OPACKETS: *val = state->id_xmt_pkt; break; case MAC_STAT_NORCVBUF: *val = state->id_rx_short; /* # times below water mark */ break; case MAC_STAT_OERRORS: *val = state->id_ah_error; /* failed AH translation */ break; case MAC_STAT_IERRORS: *val = 0; break; case MAC_STAT_NOXMTBUF: *val = state->id_tx_short; break; default: return (ENOTSUP); } return (0); } /* * Tx reschedule */ static void ibd_async_txsched(ibd_state_t *state) { ibd_req_t *req; /* * For poll mode, if ibd is out of Tx wqe, reschedule to collect * the CQEs. Otherwise, just return for out of Tx wqe. */ if (ibd_txcomp_poll == 1) { mutex_enter(&state->id_txcomp_lock); ibd_poll_compq(state, state->id_scq_hdl); mutex_exit(&state->id_txcomp_lock); if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); ibd_queue_work_slot(state, req, ASYNC_SCHED); return; } } else if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { return; } if (state->id_sched_needed) { mac_tx_update(state->id_mh); state->id_sched_needed = B_FALSE; } } /* * Release one or more chained send wqes back into free list. */ static void ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *swqe) { /* * Add back on Tx list for reuse. */ swqe->swqe_next = NULL; mutex_enter(&state->id_tx_list.dl_mutex); if (state->id_tx_list.dl_pending_sends) { state->id_tx_list.dl_pending_sends = B_FALSE; } if (state->id_tx_list.dl_head == NULL) { state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); } else { state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); } state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); state->id_tx_list.dl_cnt++; mutex_exit(&state->id_tx_list.dl_mutex); } /* * Acquire send wqe from free list. * Returns error number and send wqe pointer. */ static int ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **swqe) { int rc = 0; ibd_swqe_t *wqe; /* * Check and reclaim some of the completed Tx requests. * If someone else is already in this code and pulling Tx * completions, no need to poll, since the current lock holder * will do the work anyway. Normally, we poll for completions * every few Tx attempts, but if we are short on Tx descriptors, * we always try to poll. */ if ((ibd_txcomp_poll == 1) && (state->id_tx_list.dl_cnt < IBD_TXPOLL_THRESHOLD) && (mutex_tryenter(&state->id_txcomp_lock) != 0)) { DPRINT(10, "ibd_send : polling"); ibd_poll_compq(state, state->id_scq_hdl); mutex_exit(&state->id_txcomp_lock); } /* * Grab required transmit wqes. */ mutex_enter(&state->id_tx_list.dl_mutex); wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); if (wqe != NULL) { state->id_tx_list.dl_cnt -= 1; state->id_tx_list.dl_head = wqe->swqe_next; if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) state->id_tx_list.dl_tail = NULL; } else { /* * If we did not find the number we were looking for, flag * no resource. Adjust list appropriately in either case. 
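 * The dl_pending_sends flag set below is cleared again by
 * ibd_release_swqes() once wqes are returned to the free list; the
 * mac_tx_update() notification itself is driven separately through
 * id_sched_needed and ibd_async_txsched().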
*/ rc = ENOENT; state->id_tx_list.dl_pending_sends = B_TRUE; DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); atomic_add_64(&state->id_tx_short, 1); } mutex_exit(&state->id_tx_list.dl_mutex); *swqe = wqe; return (rc); } /* * The passed in packet has this format: * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data */ static boolean_t ibd_send(ibd_state_t *state, mblk_t *mp) { ibt_status_t ibt_status; ibt_mr_attr_t mem_attr; ibd_ace_t *ace; ibd_swqe_t *node = NULL; ipoib_mac_t *dest; ibd_req_t *req; ib_header_info_t *ipibp; ip6_t *ip6h; mblk_t *nmp = mp; uint_t pktsize; size_t blksize; uchar_t *bufp; int i, ret, len, nmblks = 1; boolean_t dofree = B_TRUE; if ((ret = ibd_acquire_swqes(state, &node)) != 0) { state->id_sched_needed = B_TRUE; if (ibd_txcomp_poll == 1) { goto ibd_send_fail; } return (B_FALSE); } /* * Obtain an address handle for the destination. */ ipibp = (ib_header_info_t *)mp->b_rptr; dest = (ipoib_mac_t *)&ipibp->ib_dst; if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); pktsize = msgsize(mp); atomic_add_64(&state->id_xmt_bytes, pktsize); atomic_inc_64(&state->id_xmt_pkt); if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) atomic_inc_64(&state->id_brd_xmt); else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) atomic_inc_64(&state->id_multi_xmt); if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { node->w_ahandle = ace; node->w_swr.wr.ud.udwr_dest = ace->ac_dest; } else { DPRINT(5, "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", ((ret == EFAULT) ? "failed" : "queued"), htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), htonl(dest->ipoib_gidpref[1]), htonl(dest->ipoib_gidsuff[0]), htonl(dest->ipoib_gidsuff[1])); node->w_ahandle = NULL; /* * In poll mode there are probably CQEs pending in the CQ; * poll the CQ here, otherwise acache entries may never get * recycled. */ if (ibd_txcomp_poll == 1) { mutex_enter(&state->id_txcomp_lock); ibd_poll_compq(state, state->id_scq_hdl); mutex_exit(&state->id_txcomp_lock); } /* * If ibd_acache_lookup() returned EFAULT, ibd could not find * a path for this destination address, so get rid of the * packet. In the normal case, the packet is returned to the * upper layer and we wait for the AH to be created. */ if (ret == EFAULT) ret = B_TRUE; else { ret = B_FALSE; dofree = B_FALSE; state->id_sched_needed = B_TRUE; } goto ibd_send_fail; } /* * For ND6 packets, padding is at the front of the source lladdr. * Insert the padding at the front.
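 * (The ND link-layer address option carries the 20 byte IPoIB
 * address and, per the ND option format, must be padded out to a
 * multiple of 8 bytes; the actual adjustment of the option contents
 * is done by the IBD_PAD_NSNA() macro.)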
*/ if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { if (!pullupmsg(mp, IPV6_HDR_LEN + sizeof (ib_header_info_t))) { DPRINT(10, "ibd_send: pullupmsg failure "); ret = B_TRUE; goto ibd_send_fail; } ipibp = (ib_header_info_t *)mp->b_rptr; } ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ib_header_info_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { mblk_t *pad; pad = allocb(4, 0); if (pad == NULL) { DPRINT(10, "ibd_send: allocb failure "); ret = B_TRUE; goto ibd_send_fail; } pad->b_wptr = (uchar_t *)pad->b_rptr + 4; linkb(mp, pad); if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN + len + 4) { if (!pullupmsg(mp, sizeof (ib_header_info_t) + IPV6_HDR_LEN + len + 4)) { DPRINT(10, "ibd_send: pullupmsg " "failure "); ret = B_TRUE; goto ibd_send_fail; } ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + sizeof (ib_header_info_t)); } /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_SEND); } } mp->b_rptr += sizeof (ib_addrs_t); while (((nmp = nmp->b_cont) != NULL) && (++nmblks < (state->id_max_sqseg + 1))) ; pktsize = msgsize(mp); /* * GLDv3 will check the mtu. We do checksum related work here. */ IBD_CKSUM_SEND(mp); /* * Copy the data to preregistered buffers, or register the buffer. */ if ((nmblks <= state->id_max_sqseg) && (pktsize > IBD_TX_COPY_THRESHOLD)) { for (i = 0, nmp = mp; i < nmblks; i++, nmp = nmp->b_cont) { mem_attr.mr_vaddr = (uint64_t)(uintptr_t)nmp->b_rptr; mem_attr.mr_len = nmp->b_wptr - nmp->b_rptr; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_NOSLEEP; ibt_status = ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &node->w_smblkbuf[i].im_mr_hdl, &node->w_smblkbuf[i].im_mr_desc); if (ibt_status != IBT_SUCCESS) { /* * We do not expect any error other than * IBT_INSUFF_RESOURCE. */ if (ibt_status != IBT_INSUFF_RESOURCE) DPRINT(10, "ibd_send: %s %d\n", "failed in ibt_register_mr()", ibt_status); DPRINT(5, "ibd_send: registration failed"); node->w_swr.wr_nds = i; /* * Deregister already registered memory; * fall back to copying the mblk. */ ibd_deregister_mr(state, node); goto ibd_copy_path; } node->w_smblk_sgl[i].ds_va = (ib_vaddr_t)(uintptr_t)nmp->b_rptr; node->w_smblk_sgl[i].ds_key = node->w_smblkbuf[i].im_mr_desc.md_lkey; node->w_smblk_sgl[i].ds_len = nmp->b_wptr - nmp->b_rptr; } node->swqe_im_mblk = mp; node->w_swr.wr_sgl = node->w_smblk_sgl; node->w_swr.wr_nds = nmblks; dofree = B_FALSE; } else { ibd_copy_path: node->swqe_copybuf.ic_sgl.ds_len = pktsize; node->w_swr.wr_nds = 1; node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { blksize = MBLKL(nmp); bcopy(nmp->b_rptr, bufp, blksize); bufp += blksize; } } /* * Queue the wqe to hardware. */ ibt_status = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL); if (ibt_status != IBT_SUCCESS) { /* * We should not fail here; but just in case we do, we * print out a warning to log.
*/ ibd_print_warn(state, "ibd_send: posting failed: %d", ibt_status); } DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", INCTXPACK, htonl(ace->ac_mac.ipoib_qpn), htonl(ace->ac_mac.ipoib_gidpref[0]), htonl(ace->ac_mac.ipoib_gidpref[1]), htonl(ace->ac_mac.ipoib_gidsuff[0]), htonl(ace->ac_mac.ipoib_gidsuff[1])); if (dofree) freemsg(mp); return (B_TRUE); ibd_send_fail: if (state->id_sched_needed == B_TRUE) { req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); if (req != NULL) ibd_queue_work_slot(state, req, ASYNC_SCHED); else { dofree = B_TRUE; ret = B_TRUE; } } if (dofree) freemsg(mp); if (node != NULL) ibd_tx_cleanup(state, node); return (ret); } /* * GLDv3 entry point for transmitting datagrams. */ static mblk_t * ibd_m_tx(void *arg, mblk_t *mp) { ibd_state_t *state = (ibd_state_t *)arg; mblk_t *next; while (mp != NULL) { next = mp->b_next; mp->b_next = NULL; if (!ibd_send(state, mp)) { /* Send fail */ mp->b_next = next; break; } mp = next; } return (mp); } /* * Rx/combined CQ processing, run as a softintr or called directly from * the CQ handler. When using the combined CQ, this handles both Tx and * Rx completions; with separate CQs, this handles only Rx completions. */ static uint_t ibd_intr(char *arg) { ibd_state_t *state = (ibd_state_t *)arg; /* * Poll for completed entries; the CQ will not interrupt any * more for incoming (or transmitted) packets. */ ibd_poll_compq(state, state->id_rcq_hdl); /* * Now enable CQ notifications; all packets that arrive now * (or complete transmission) will cause new interrupts. */ if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { /* * We do not expect a failure here. */ DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); } /* * Repoll to catch all packets that might have arrived after * we finished the first poll loop and before interrupts got * armed. */ ibd_poll_compq(state, state->id_rcq_hdl); return (DDI_INTR_CLAIMED); } /* * Common code for interrupt handling as well as for polling * for all completed wqe's while detaching. */ static void ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { ibd_wqe_t *wqe; ibt_wc_t *wc, *wcs; uint_t numwcs, real_numwcs; int i; /* * In some cases (eg detaching), this code can be invoked on * any cpu after disabling cq notification (thus no concurrency * exists). Apart from that, the following applies normally: * The receive completion handling is always on the Rx interrupt * cpu. Transmit completion handling could be from any cpu if * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ * is interrupt driven. Combined completion handling is always * on the interrupt cpu. Thus, lock accordingly and use the * proper completion array. */ if (ibd_separate_cqs == 1) { if (cq_hdl == state->id_rcq_hdl) { wcs = state->id_rxwcs; numwcs = state->id_rxwcs_size; } else { wcs = state->id_txwcs; numwcs = state->id_txwcs_size; } } else { wcs = state->id_rxwcs; numwcs = state->id_rxwcs_size; } if (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; ASSERT((wqe->w_type == IBD_WQE_SEND) || (wqe->w_type == IBD_WQE_RECV)); if (wc->wc_status != IBT_WC_SUCCESS) { /* * Channel being torn down. */ if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { DPRINT(5, "ibd_intr: flush error"); /* * Only invoke the Tx handler to * release possibly held resources * like AH refcount etc. Cannot * invoke the Rx handler because it might * try adding buffers to the Rx pool * when we are trying to deinitialize.
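 * Completions with any other error status are just logged below
 * and then fall through to the normal Tx/Rx handling.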
*/ if (wqe->w_type == IBD_WQE_RECV) { continue; } } else { DPRINT(10, "%s %d", "ibd_intr: Bad CQ status", wc->wc_status); } } if (wqe->w_type == IBD_WQE_SEND) { ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); } else { ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); } } } } /* * Deregister the mr associated with a given mblk. */ static void ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) { int i; DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, swqe->w_swr.wr_nds); for (i = 0; i < swqe->w_swr.wr_nds; i++) { if (ibt_deregister_mr(state->id_hca_hdl, swqe->w_smblkbuf[i].im_mr_hdl) != IBT_SUCCESS) { /* * We do not expect any errors here. */ DPRINT(10, "failed in ibt_deregister_mr()\n"); } } } /* * Common code that deals with cleanups after a successful or * erroneous transmission attempt. */ static void ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) { ibd_ace_t *ace = swqe->w_ahandle; DPRINT(20, "ibd_tx_cleanup %p\n", swqe); /* * If this was a dynamic registration in ibd_send(), * deregister now. */ if (swqe->swqe_im_mblk != NULL) { ibd_deregister_mr(state, swqe); freemsg(swqe->swqe_im_mblk); swqe->swqe_im_mblk = NULL; } /* * Drop the reference count on the AH; it can be reused * now for a different destination if there are no more * posted sends that will use it. This can be eliminated * if we can always associate each Tx buffer with an AH. * The ace can be null if we are cleaning up from the * ibd_send() error path. */ if (ace != NULL) { /* * The recycling logic can be eliminated from here * and put into the async thread if we create another * list to hold ACE's for unjoined mcg's. */ if (DEC_REF_DO_CYCLE(ace)) { ibd_mce_t *mce; /* * Check with the lock taken: we decremented * reference count without the lock, and some * transmitter might already have bumped the * reference count (possible in case of multicast * disable when we leave the AH on the active * list). If not still 0, get out, leaving the * recycle bit intact. * * Atomically transition the AH from active * to free list, and queue a work request to * leave the group and destroy the mce. No * transmitter can be looking at the AH or * the MCE in between, since we have the * ac_mutex lock. In the SendOnly reap case, * it is not necessary to hold the ac_mutex * and recheck the ref count (since the AH was * taken off the active list), we just do it * to have uniform processing with the Full * reap case. */ mutex_enter(&state->id_ac_mutex); mce = ace->ac_mce; if (GET_REF_CYCLE(ace) == 0) { CLEAR_REFCYCLE(ace); /* * Identify the case of fullmember reap as * opposed to mcg trap reap. Also, port up * might set ac_mce to NULL to indicate Tx * cleanup should do no more than put the * AH in the free list (see ibd_async_link). */ if (mce != NULL) { ace->ac_mce = NULL; IBD_ACACHE_PULLOUT_ACTIVE(state, ace); /* * mc_req was initialized at mce * creation time. */ ibd_queue_work_slot(state, &mce->mc_req, ASYNC_REAP); } IBD_ACACHE_INSERT_FREE(state, ace); } mutex_exit(&state->id_ac_mutex); } } /* * Release the send wqe for reuse. */ ibd_release_swqes(state, swqe); } /* * Processing to be done after receipt of a packet; hand off to GLDv3 * in the format it expects. * The recvd packet has this format: 2b sap :: 00 :: data. */ static void ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) { ib_header_info_t *phdr; mblk_t *mp; ipoib_hdr_t *ipibp; ip6_t *ip6h; int rxcnt, len; /* * Track number handed to upper layer, and number still * available to receive packets.
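 * dl_cnt tracks how many rwqes are still available on the rxlist,
 * while dl_bufs_outstanding counts the buffers currently loaned up
 * to the stack; the latter is decremented again in ibd_freemsg_cb()
 * when the upper layer frees the mblk.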
*/ rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); ASSERT(rxcnt >= 0); atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); /* * Adjust write pointer depending on how much data came in. */ mp = rwqe->rwqe_im_mblk; mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; /* * The IB link may deliver the Global Routing Header (GRH), one of * the IB headers, along with the datagram. The ibd driver uses the * information in the GRH to build the header_info structure and * passes it up to GLDv3 with the datagram. * If the GRH is not valid, indicate this to GLDv3 by setting * the VerTcFlow field to 0. */ phdr = (ib_header_info_t *)mp->b_rptr; if (wc->wc_flags & IBT_WC_GRH_PRESENT) { phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); /* if it is a loopback packet, just drop it. */ if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, IPOIB_ADDRL) == 0) { freemsg(mp); return; } ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, sizeof (ipoib_mac_t)); if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); } else { phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; } } else { /* * It cannot be an IBA multicast packet. Must have been * unicast for us. Just copy the interface address to dst. */ phdr->ib_grh.ipoib_vertcflow = 0; ovbcopy(&state->id_macaddr, &phdr->ib_dst, sizeof (ipoib_mac_t)); } DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); /* * For ND6 packets, padding is at the front of the source/target * lladdr. However, the inet6 layer is not aware of it; hence remove * the padding from such packets. */ ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) { if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { if (!pullupmsg(mp, IPV6_HDR_LEN + sizeof (ipoib_hdr_t))) { DPRINT(10, "ibd_process_rx: pullupmsg failed"); freemsg(mp); return; } ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); } ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN + len) { if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + IPV6_HDR_LEN + len)) { DPRINT(10, "ibd_process_rx: pullupmsg" " failed"); freemsg(mp); return; } ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t) + sizeof (ipoib_hdr_t)); } /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_RECV); } } atomic_add_64(&state->id_recv_bytes, wc->wc_bytes_xfer); atomic_inc_64(&state->id_rcv_pkt); if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) atomic_inc_64(&state->id_brd_rcv); else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) atomic_inc_64(&state->id_multi_rcv); /* * Hand off to the service thread/GLDv3. When we have hardware * that does hardware checksum offload, we will pull the checksum * from the work completion structure here. */ ibd_send_up(state, mp); /* * Replenish the Rx pool if needed. */ if (rxcnt < IBD_RX_THRESHOLD) { state->id_rx_short++; if (ibd_alloc_rwqe(state, &rwqe) == DDI_SUCCESS) { if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { ibd_free_rwqe(state, rwqe); return; } } } } /* * Callback code invoked from STREAMS when the recv data buffer is free * for recycling. */ static void ibd_freemsg_cb(char *arg) { ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; ibd_state_t *state = rwqe->w_state; /* * If the wqe is being destructed, do not attempt recycling.
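 * This callback was registered via desballoc() when the rwqe's mblk
 * was set up, so it runs only once the upper layer has freed the
 * message; depending on how full the rxlist already is, the buffer
 * is either re-posted to the HCA or released.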
*/ if (rwqe->w_freeing_wqe == B_TRUE) { DPRINT(6, "ibd_freemsg: wqe being freed"); return; } /* * Upper layer has released held mblk. */ atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); if (state->id_rx_list.dl_cnt >= state->id_num_rwqe) { /* * There are already enough buffers on the Rx ring. * Free this one up. */ rwqe->rwqe_im_mblk = NULL; ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); DPRINT(6, "ibd_freemsg: free up wqe"); } else { rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); DPRINT(6, "ibd_freemsg: desballoc failed"); return; } /* * Post back to h/w. We could actually have more than * id_num_rwqe WQEs on the list if there were multiple * ibd_freemsg_cb() calls outstanding (since the lock is * not held the entire time). This will start getting * corrected over subsequent ibd_freemsg_cb() calls. */ if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); return; } } } static uint_t ibd_tx_recycle(char *arg) { ibd_state_t *state = (ibd_state_t *)arg; /* * Poll for completed entries; the CQ will not interrupt any * more for completed packets. */ ibd_poll_compq(state, state->id_scq_hdl); /* * Now enable CQ notifications; all completions originating now * will cause new interrupts. */ if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { /* * We do not expect a failure here. */ DPRINT(10, "ibd_tx_recycle: ibt_enable_cq_notify() failed"); } /* * Repoll to catch all packets that might have completed after * we finished the first poll loop and before interrupts got * armed. */ ibd_poll_compq(state, state->id_scq_hdl); /* * Call txsched to notify GLDv3 if it required. */ ibd_async_txsched(state); return (DDI_INTR_CLAIMED); } #ifdef RUN_PERFORMANCE /* * To run the performance test, first do the "ifconfig ibdN plumb" on * the Rx and Tx side. Then use mdb -kw to tweak the following variables: * ibd_performance=1. * ibd_receiver=1 on Rx side. * ibd_sender=1 on Tx side. * Do "ifconfig ibdN" on Rx side to get the Rx mac address, and update * ibd_dest on the Tx side. Next, do ifconfig/unplumb on Rx, this will * make it drop into a 1 minute loop waiting for packets. An * ifconfig/unplumb on the Tx will cause it to send packets to Rx. */ #define IBD_NUM_UNSIGNAL ibd_num_unsignal #define IBD_TX_PKTSIZE ibd_tx_pktsize #define IBD_TX_DATASIZE ibd_tx_datasize static ibd_swqe_t **swqes; static ibt_wc_t *wcs; /* * Set these on Rx and Tx side to do performance run. */ static int ibd_performance = 0; static int ibd_receiver = 0; static int ibd_sender = 0; static ipoib_mac_t ibd_dest; /* * Interrupt coalescing is achieved by asking for a completion intr * only every ibd_num_unsignal'th packet. */ static int ibd_num_unsignal = 8; /* * How big is each packet? */ static int ibd_tx_pktsize = 2048; /* * Total data size to be transmitted. */ static int ibd_tx_datasize = 512*1024*1024; static volatile boolean_t cq_handler_ran = B_FALSE; static volatile int num_completions; /* ARGSUSED */ static void ibd_perf_handler(ibt_cq_hdl_t cq_hdl, void *arg) { ibd_state_t *state = (ibd_state_t *)arg; ibt_cq_hdl_t cqhdl; ibd_wqe_t *wqe; uint_t polled, i; boolean_t cq_enabled = B_FALSE; if (ibd_receiver == 1) cqhdl = state->id_rcq_hdl; else cqhdl = state->id_scq_hdl; /* * Mark the handler as having run and possibly freed up some * slots. 
Blocked sends can be retried. */ cq_handler_ran = B_TRUE; repoll: while (ibt_poll_cq(cqhdl, wcs, IBD_NUM_UNSIGNAL, &polled) == IBT_SUCCESS) { num_completions += polled; if (ibd_receiver == 1) { /* * We can immediately recycle the buffer. No * need to pass up to any IP layer ... */ for (i = 0; i < polled; i++) { wqe = (ibd_wqe_t *)wcs[i].wc_id; (void) ibt_post_recv(state->id_chnl_hdl, &(WQE_TO_RWQE(wqe))->w_rwr, 1, NULL); } } } /* * If we just repolled, we are done; exit. */ if (cq_enabled) return; /* * Enable CQ. */ if (ibt_enable_cq_notify(cqhdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { /* * We do not expect a failure here. */ cmn_err(CE_CONT, "ibd_perf_handler: notify failed"); } cq_enabled = B_TRUE; /* * Repoll for packets that came in after we finished previous * poll loop but before we turned on notifications. */ goto repoll; } static void ibd_perf_tx(ibd_state_t *state) { ibt_mr_hdl_t mrhdl; ibt_mr_desc_t mrdesc; ibt_mr_attr_t mem_attr; ibt_status_t stat; ibd_ace_t *ace = NULL; ibd_swqe_t *node; uchar_t *sendbuf; longlong_t stime, etime; longlong_t sspin, espin, tspin = 0; int i, reps, packets; cmn_err(CE_CONT, "ibd_perf_tx: Tx to %08X:%08X:%08X:%08X:%08X", htonl(ibd_dest.ipoib_qpn), htonl(ibd_dest.ipoib_gidpref[0]), htonl(ibd_dest.ipoib_gidpref[1]), htonl(ibd_dest.ipoib_gidsuff[0]), htonl(ibd_dest.ipoib_gidsuff[1])); if ((ibd_dest.ipoib_qpn == 0) || (ibd_dest.ipoib_gidsuff[1] == 0) || (ibd_dest.ipoib_gidpref[1] == 0)) { cmn_err(CE_CONT, "ibd_perf_tx: Invalid Rx address"); return; } packets = (IBD_TX_DATASIZE / IBD_TX_PKTSIZE); reps = (packets / IBD_NUM_SWQE); cmn_err(CE_CONT, "ibd_perf_tx: Data Size = %d", IBD_TX_DATASIZE); cmn_err(CE_CONT, "ibd_perf_tx: Packet Size = %d", IBD_TX_PKTSIZE); cmn_err(CE_CONT, "ibd_perf_tx: # Packets = %d", packets); cmn_err(CE_CONT, "ibd_perf_tx: SendQ depth = %d", IBD_NUM_SWQE); cmn_err(CE_CONT, "ibd_perf_tx: Signal Grp size = %d", IBD_NUM_UNSIGNAL); if ((packets % IBD_NUM_UNSIGNAL) != 0) { /* * This is required to ensure the last packet will trigger * a CQ handler callback, thus we can spin waiting fot all * packets to be received. */ cmn_err(CE_CONT, "ibd_perf_tx: #Packets not multiple of Signal Grp size"); return; } num_completions = 0; swqes = kmem_zalloc(sizeof (ibd_swqe_t *) * IBD_NUM_SWQE, KM_NOSLEEP); if (swqes == NULL) { cmn_err(CE_CONT, "ibd_perf_tx: no storage"); return; } wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); if (wcs == NULL) { kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); cmn_err(CE_CONT, "ibd_perf_tx: no storage"); return; } /* * Get the ud_dest for the destination. */ ibd_async_acache(state, &ibd_dest); mutex_enter(&state->id_ac_mutex); ace = ibd_acache_find(state, &ibd_dest, B_FALSE, 0); mutex_exit(&state->id_ac_mutex); if (ace == NULL) { kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); cmn_err(CE_CONT, "ibd_perf_tx: no AH"); return; } /* * Set up the send buffer. */ sendbuf = kmem_zalloc(IBD_TX_PKTSIZE, KM_NOSLEEP); if (sendbuf == NULL) { kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); cmn_err(CE_CONT, "ibd_perf_tx: no send buffer"); return; } /* * This buffer can be used in the case when we want to * send data from the same memory area over and over; * it might help in reducing memory traffic. 
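 * (The #if 0 block further down shows the variant that wires this
 * registered buffer directly into w_smblk_sgl[0]; the path actually
 * compiled in goes through the swqe copybuf instead.)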
*/ mem_attr.mr_vaddr = (uint64_t)sendbuf; mem_attr.mr_len = IBD_TX_PKTSIZE; mem_attr.mr_as = NULL; mem_attr.mr_flags = IBT_MR_NOSLEEP; if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &mrhdl, &mrdesc) != IBT_SUCCESS) { kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); kmem_free(sendbuf, IBD_TX_PKTSIZE); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); cmn_err(CE_CONT, "ibd_perf_tx: registration failed"); return; } /* * Allocate private send wqe's. */ for (i = 0; i < IBD_NUM_SWQE; i++) { if (ibd_alloc_swqe(state, &node) != DDI_SUCCESS) { kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); kmem_free(sendbuf, IBD_TX_PKTSIZE); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); cmn_err(CE_CONT, "ibd_alloc_swqe failure"); return; } node->w_ahandle = ace; #if 0 node->w_smblkbuf[0].im_mr_hdl = mrhdl; node->w_smblkbuf[0].im_mr_desc = mrdesc; node->w_smblk_sgl[0].ds_va = (ib_vaddr_t)sendbuf; node->w_smblk_sgl[0].ds_key = node->w_smblkbuf[0].im_mr_desc.md_lkey; node->w_smblk_sgl[0].ds_len = IBD_TX_PKTSIZE; node->w_swr.wr_sgl = node->w_smblk_sgl; #else node->swqe_copybuf.ic_sgl.ds_len = IBD_TX_PKTSIZE; node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; #endif /* * The last of IBD_NUM_UNSIGNAL consecutive posted WRs * is marked to invoke the CQ handler. That is the only * way we come to know when the send queue can accept more * WRs. */ if (((i + 1) % IBD_NUM_UNSIGNAL) != 0) node->w_swr.wr_flags = IBT_WR_NO_FLAGS; node->w_swr.wr.ud.udwr_dest = ace->ac_dest; node->w_swr.wr_nds = 1; swqes[i] = node; } ibt_set_cq_handler(state->id_scq_hdl, ibd_perf_handler, state); /* * Post all the requests. We expect this stream of post's will * not overwhelm the hardware due to periodic completions and * pollings that happen out of ibd_perf_handler. * Post a set of requests, till the channel can accept; after * that, wait for the CQ handler to notify us that there is more * space. */ stime = gethrtime(); for (; reps > 0; reps--) for (i = 0; i < IBD_NUM_SWQE; i++) { node = swqes[i]; retry: if ((stat = ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL)) != IBT_SUCCESS) { if (stat == IBT_CHAN_FULL) { /* * Spin till the CQ handler runs * and then try again. */ sspin = gethrtime(); while (!cq_handler_ran) ; espin = gethrtime(); tspin += (espin - sspin); cq_handler_ran = B_FALSE; goto retry; } cmn_err(CE_CONT, "post failure %d/%d", stat, i); goto done; } } done: /* * We should really be snapshotting when we get the last * completion. */ while (num_completions != (packets / IBD_NUM_UNSIGNAL)) ; etime = gethrtime(); cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d", num_completions); cmn_err(CE_CONT, "ibd_perf_tx: Time = %lld nanosec", (etime - stime)); cmn_err(CE_CONT, "ibd_perf_tx: Spin Time = %lld nanosec", tspin); /* * Wait a sec for everything to get over. */ delay(drv_usectohz(2000000)); /* * Reset CQ handler to real one; free resources. 
*/ if (ibd_separate_cqs == 0) { ibt_set_cq_handler(state->id_scq_hdl, ibd_rcq_handler, state); } else { if (ibd_txcomp_poll == 0) ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); else ibt_set_cq_handler(state->id_scq_hdl, 0, 0); } for (i = 0; i < IBD_NUM_SWQE; i++) ibd_free_swqe(state, swqes[i]); (void) ibt_deregister_mr(state->id_hca_hdl, mrhdl); kmem_free(sendbuf, IBD_TX_PKTSIZE); kmem_free(swqes, sizeof (ibd_swqe_t *) * IBD_NUM_SWQE); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); } static void ibd_perf_rx(ibd_state_t *state) { wcs = kmem_zalloc(sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL, KM_NOSLEEP); if (wcs == NULL) { cmn_err(CE_CONT, "ibd_perf_rx: no storage"); return; } /* * We do not need to allocate private recv wqe's. We will * just use the regular ones. */ num_completions = 0; ibt_set_cq_handler(state->id_rcq_hdl, ibd_perf_handler, state); /* * Delay for a minute for all the packets to come in from the * transmitter. */ cmn_err(CE_CONT, "ibd_perf_rx: RecvQ depth = %d", IBD_NUM_RWQE); delay(drv_usectohz(60000000)); cmn_err(CE_CONT, "ibd_perf_rx: Received %d packets", num_completions); /* * Reset CQ handler to real one; free resources. */ ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); kmem_free(wcs, sizeof (ibt_wc_t) * IBD_NUM_UNSIGNAL); } static void ibd_perf(ibd_state_t *state) { if (ibd_performance == 0) return; if (ibd_receiver == 1) { ibd_perf_rx(state); return; } if (ibd_sender == 1) { ibd_perf_tx(state); return; } } #endif /* RUN_PERFORMANCE */